Scraping Multiple Novels from a Ranking List
Preface:
The text and images in this article come from the internet and are for learning and exchange purposes only, with no commercial use. If there is any problem, please contact us promptly so we can deal with it.
Notes from my own learning process:
- The ranking page to scrape: https://www.xbiquge.so/top/allvisit/
- Get each novel's name and its page link
- Get each novel's chapter list and the URL of every chapter
- Download each chapter's content and save it to a local file
1. Fetch the ranking-list page
import requests
import parsel

top_list = requests.get('https://www.xbiquge.so/top/allvisit/').text
selector = parsel.Selector(top_list)
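If the plain request comes back empty or garbled, a slightly more defensive version helps. This is a minimal sketch, assuming the site may reject the default requests User-Agent and may declare a non-UTF-8 charset; the UA string here is an arbitrary placeholder.

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA; assumption: the site blocks the requests default
response = requests.get('https://www.xbiquge.so/top/allvisit/', headers=headers)
response.encoding = response.apparent_encoding  # let requests guess the real charset
top_list = response.text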
2. Get the names and links of the novels
# get the link of each novel on the ranking list
href = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/@href").getall()
# contains(@class, 's2') matches only the <span> tags whose class includes s2
print(href)
# get the name of each novel on the ranking list
title = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/text()").getall()
print(title)
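parsel also accepts CSS selectors, which some find easier to read than XPath. Assuming the same page structure as above, this is an equivalent way to pull the same two lists:

href = selector.css("div.novelslistss li span.s2 a::attr(href)").getall()
title = selector.css("div.novelslistss li span.s2 a::text").getall()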
3. Get each novel's chapters and their URLs
import re

for book_url, book_title in zip(href, title):
    url = book_url
    print(url)
    response = requests.get(url)
    data_html = response.text
    # every chapter link on the book page sits inside <dd><a ...></a></dd>
    url_list = re.findall('<dd><a rel="nofollow" href="(.*?)">(.*?)</a></dd>', data_html)
    print(url_list)
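The captured hrefs are joined to book_url by plain string concatenation in the next step, which only works while the hrefs are relative paths. A more robust sketch (not specific to this site's markup) uses urljoin inside the loop, which handles relative and absolute hrefs alike:

from urllib.parse import urljoin

chapter_urls = [(urljoin(book_url, path), name) for path, name in url_list]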
4. Save each chapter to a local file with with open
# assumes the folder ./txt/<book_title>/ already exists (the complete script below creates it)
for chapter_path, chapter_title in url_list:
    chapter_url = book_url + chapter_path
    # print(chapter_url)
    response_1 = requests.get(chapter_url)
    text = response_1.text
    content = re.findall('<div id="content" name="content">(.*?)<p.*?</div>', text, re.S)[0]
    # strip HTML entities and tags; note the ad line hard-codes one novel's
    # name (深空彼岸), so it is only removed for that particular book
    content = content.replace('&nbsp;', '').replace('<br>', '').replace('<br />', '\n').replace('笔趣阁 www.xbiquge.so,最快更新深空彼岸 !', '')
    with open('./txt/' + book_title + '/' + chapter_title + '.txt', mode='w', encoding='utf-8') as f:
        f.write(content)
    print(chapter_title, 'scraped successfully!')
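One failure mode worth guarding against: chapter titles sometimes contain characters that are invalid in file names ('?', '/', '*', ...), which makes open() raise OSError. A small sanitizer plus os.makedirs(..., exist_ok=True) keeps the write step safe; the character set below is an assumption you may want to adjust.

import os
import re

def safe_name(name):
    # replace characters that Windows/Unix refuse in file names (assumed set)
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

book_dir = os.path.join('./txt', safe_name(book_title))
os.makedirs(book_dir, exist_ok=True)  # create the folder if it is missing
with open(os.path.join(book_dir, safe_name(chapter_title) + '.txt'), 'w', encoding='utf-8') as f:
    f.write(content)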
Complete code:
"""
@Name: biquwang02.py
@Auth: MyName
@Date: 2022-06-09-23:21
@Desc:
@Ver : 0.0.0
"""
import requests
import re
import parsel
import os
import time
# start the program timer
start = time.perf_counter()
top_list = requests.get('https://www.xbiquge.so/top/allvisit/').text
# print(top_list)
# parsing options: xpath, css, or re
# parsel.Selector wraps the HTML and supports all three extraction styles
selector = parsel.Selector(top_list)
href = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/@href").getall()
# contains(@class,'s2') matches only the spans whose class includes s2
# print(href)
title = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/text()").getall()
# print(title)
for book_url, book_title in zip(href, title):
    # print(book_url, book_title)
    print(f'--------------------------- Scraping {book_title} ---------------------------')
    if not os.path.exists('./txt/' + book_title):
        os.makedirs('./txt/' + book_title)
    url = book_url
    response = requests.get(url)
    data_html = response.text
    # every chapter link on the book page sits inside <dd><a ...></a></dd>
    url_list = re.findall('<dd><a rel="nofollow" href="(.*?)">(.*?)</a></dd>', data_html)
    # print(url_list)
    for chapter_path, chapter_title in url_list:
        chapter_url = book_url + chapter_path
        # print(chapter_url)
        response_1 = requests.get(chapter_url)
        text = response_1.text
        content = re.findall('<div id="content" name="content">(.*?)<p.*?</div>', text, re.S)[0]
        # strip HTML entities and tags; the ad line hard-codes one novel's
        # name (深空彼岸), so it is only removed for that particular book
        content = content.replace('&nbsp;', '').replace('<br>', '').replace('<br />', '\n').replace('笔趣阁 www.xbiquge.so,最快更新深空彼岸 !', '')
        with open('./txt/' + book_title + '/' + chapter_title + '.txt', mode='w', encoding='utf-8') as f:
            f.write(content)
        print(chapter_title, 'scraped successfully!')
# compute the time elapsed between start and end
end = time.perf_counter()
print('Elapsed time: %s seconds' % (end - start))
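As a closing note, the script above fires requests as fast as it can, which is unfriendly to the site and an easy way to get blocked. A gentler variant (a sketch; the 0.5-second delay and 10-second timeout are arbitrary assumptions) reuses one connection through requests.Session and pauses between chapter downloads:

import time
import requests

session = requests.Session()  # reuse one TCP connection across requests

def fetch(url):
    response = session.get(url, timeout=10)  # fail fast instead of hanging forever
    response.encoding = response.apparent_encoding
    time.sleep(0.5)  # be polite: pause between requests
    return response.text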