Django HttpResponse 响应对象-CFANZ编程社区

import os
import threading
import time
from threading import Thread

import requests
from lxml import etree
from queue import Queue
from Fiction_Spider.settings import headers

# Shared FIFO work queue (capacity 10000) used to hand work items to the
# download worker threads.
q = Queue(maxsize=10000)


# 1. Fetch the book entries listed on the "completed novels" ranking page.
def get_fiction_list():
	"""Scrape the completed-novels ranking page.

	Returns:
		A 4-tuple of parallel lists
		``(fiction_types, fiction_urls, fiction_titles, fiction_authors)``
		as extracted by XPath from the second ``div`` under ``#newscontent``.

	Raises:
		requests.RequestException: if the listing page cannot be fetched
			(including when the request times out).
	"""
	fiction_list_url = "https://www.52bqg.org/wanben/"
	# All four fields are read from the same listing container.
	listing = '//div[@id="newscontent"]/div[2]'

	# A timeout keeps the crawler from hanging forever on a stalled connection.
	page = requests.get(url=fiction_list_url, headers=headers, timeout=10).text
	html = etree.HTML(page)

	fiction_types = html.xpath(f'{listing}//span[@class="s1"]/text()')
	fiction_urls = html.xpath(f'{listing}//a/@href')
	# The original returned fiction_titles / fiction_authors without ever
	# assigning them (NameError on every call). Titles are the text of the
	# same anchors whose href is used above.
	fiction_titles = html.xpath(f'{listing}//a/text()')
	# NOTE(review): the author selector is an assumption about the page
	# markup (author presumably in span.s5) -- confirm against the live page.
	fiction_authors = html.xpath(f'{listing}//span[@class="s5"]/text()')

	return fiction_types, fiction_urls, fiction_titles, fiction_authors


# 2. Collect the chapter list of one book and queue the chapters for download.
def get_chapter(fiction_url):
	"""Fetch a book's index page and enqueue its chapters.

	Each selected chapter is put on the module-level queue ``q`` as
	``[chapter_url, chapter_title]``, the shape ``download`` unpacks.

	Args:
		fiction_url: URL of the book's index page.

	Returns:
		The list of selected chapter titles, in page order.

	Raises:
		requests.RequestException: if the index page cannot be fetched.
	"""
	from urllib.parse import urljoin

	# A timeout keeps the crawler from hanging forever on a stalled connection.
	page = requests.get(url=fiction_url, headers=headers, timeout=10).text
	html = etree.HTML(page)

	# Only anchors 12..21 are taken; drop the upper bound (22) to take every
	# chapter from index 12 onward.
	chapter_urls = html.xpath('//div[@id="list"]//a/@href')[12:22]
	chapter_titles = html.xpath('//div[@id="list"]//a/text()')[12:22]

	# The original computed chapter_urls and discarded them, so the queue the
	# download workers read from was never filled. urljoin resolves relative
	# hrefs against the index page and leaves absolute ones unchanged.
	for chapter_url, chapter_title in zip(chapter_urls, chapter_titles):
		q.put([urljoin(fiction_url, chapter_url), chapter_title])

	return chapter_titles


# 3. Fetch the text of a single chapter.
def get_content(chapter_url, chapter_title):
	"""Download one chapter page and return its text.

	The request is attempted up to three times. The original had a ``try``
	with no ``except`` (a syntax error) and never advanced its retry counter.

	Args:
		chapter_url: URL of the chapter page.
		chapter_title: Title placed at the top of the returned text.

	Returns:
		The title, a blank line, then the text nodes of ``div#content``
		(the first node is skipped) joined by blank lines; or ``""`` when
		every attempt failed, which is what ``download`` checks for.
	"""
	max_retries = 3
	# The context manager closes the session's pooled connections on exit.
	with requests.Session() as session:
		for attempt in range(1, max_retries + 1):
			try:
				# Send the same headers as the other requests in this file,
				# with a timeout so a stalled connection counts as a failure.
				resp = session.get(chapter_url, headers=headers, timeout=10)
				# Raises HTTPError for 4xx/5xx responses.
				resp.raise_for_status()
			except requests.RequestException as exc:
				print(f'请求失败({attempt}/{max_retries})....{chapter_title}: {exc}')
				time.sleep(1)  # brief pause before the next attempt
				continue

			html = etree.HTML(resp.text)
			paragraphs = html.xpath('//div[@id="content"]/text()')[1:]
			return f'{chapter_title}\n\n' + '\n\n'.join(paragraphs) + '\n\n'

	# All attempts failed.
	return ""


# 4. Worker: download every queued chapter of a book.
def download():
	"""Drain the shared queue ``q``, writing each chapter to ``data/<title>.txt``.

	Intended as a thread target. Each queue item is a
	``[chapter_url, chapter_title]`` pair. A chapter whose fetch returned
	``""`` is put back on the queue to be tried again.
	"""
	from queue import Empty

	# Make sure the output directory exists before the first write.
	os.makedirs('data', exist_ok=True)

	while True:
		# get_nowait() instead of "while not q.empty(): q.get()": with several
		# workers another thread can take the last item between the check and
		# the get, leaving this thread blocked forever.
		try:
			chapter_url, chapter_title = q.get_nowait()
		except Empty:
			break

		content = get_content(chapter_url, chapter_title)
		if content != "":
			with open(f'data/{chapter_title}.txt', 'w', encoding='utf-8') as f:
				f.write(content)
			print(f'{threading.current_thread().name}已下载.....{chapter_title}')
		else:
			# NOTE(review): a chapter that never succeeds is re-queued
			# indefinitely; consider capping the number of re-queues.
			q.put([chapter_url, chapter_title])
		# Fixed: the original called time.Sleep, which does not exist.
		time.sleep(1)


# 5. Merge all downloaded chapters of a book into one file.
def merge(chapter_titles, book_name):
	"""Append each chapter file to ``data/<book_name>.txt`` and delete it.

	Chapters are processed in the order given. A title whose chapter file
	does not exist is skipped; the original called ``os.remove`` outside the
	existence check and raised ``FileNotFoundError`` in that case.

	Args:
		chapter_titles: Iterable of chapter titles; each maps to
			``data/<title>.txt``.
		book_name: Base name (without extension) of the merged output file.
	"""
	with open(f'data/{book_name}.txt', 'a', encoding='utf-8') as f:
		for chapter_title in chapter_titles:
			file_path = f"data/{chapter_title}.txt"
			if not os.path.exists(file_path):
				# Nothing to merge or delete for this chapter.
				continue

			with open(file_path, 'r', encoding='utf-8') as fp:
				f.write(fp.read())
			print(f'已合并....{chapter_title}')

			# Remove the per-chapter file only after it has been read and closed.
			os.remove(file_path)
			print(f'已删除....{chapter_title}')


# Main routine
def main():
	"""Crawl every listed book: fetch its chapters, download, then merge.

	For each book returned by ``get_fiction_list`` this calls ``get_chapter``,
	runs three ``download`` threads to completion, merges the chapter files
	via ``merge``, and pauses three seconds before the next book.
	"""
	types, urls, titles, authors = get_fiction_list()
	for fiction_type, fiction_url, fiction_title, fiction_author in zip(types, urls, titles, authors):
		chapter_titles = get_chapter(fiction_url)
		book_name = f'【{fiction_type}】_{fiction_title}_{fiction_author}'

		# Launch the three workers, then wait for all of them to finish.
		workers = [Thread(target=download, name=f'线程{i}') for i in range(3)]
		for worker in workers:
			worker.start()
		for worker in workers:
			worker.join()

		merge(chapter_titles, book_name)
		print(f'已爬取 {book_name} 全部章节....休息三秒继续\n\n\n')
		time.sleep(3)


# Script entry point: run the crawler only when this file is executed directly.
if __name__ == '__main__':
	main()

注意：篇幅有限，这里仅提供部分源码！需要完整源码可以通过开头的名片或文末的名片联系我！