Scraping Multiple Novels from a Ranking Page

晚熟的猫 · 2022-06-16

Preface

The text and images in this article come from the internet and are for learning and exchange only; they have no commercial use. If there is any problem, please contact us promptly so we can deal with it.

A record of my own learning process:

  1. The ranking page to scrape: https://www.xbiquge.so/top/allvisit/
  2. Get the name and page URL of each novel
  3. Get each novel's chapter list and the corresponding URLs
  4. Download the chapter content and save it to local files

1. Get the novel ranking list

# Download the ranking page and wrap the HTML in a parsel Selector
top_list = requests.get('https://www.xbiquge.so/top/allvisit/').text
selector = parsel.Selector(top_list)
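
The one-liner above assumes the site accepts requests' default headers and that the page decodes cleanly. A slightly more defensive variant (a sketch; the User-Agent value and timeout are assumptions, not something the article uses) could look like this:

# Hedged sketch: send a browser-like User-Agent and guess the page encoding
headers = {'User-Agent': 'Mozilla/5.0'}  # assumed value; some sites reject the default UA
resp = requests.get('https://www.xbiquge.so/top/allvisit/', headers=headers, timeout=10)
resp.encoding = resp.apparent_encoding  # don't assume the page is UTF-8
selector = parsel.Selector(resp.text)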

2. Get the novel names and their URLs

# Get the URL of each novel on the ranking list
href = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/@href").getall()
# span[contains(@class,'s2')] matches only spans whose class contains 's2'
print(href)
# Get the name of each novel on the ranking list
title = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/text()").getall()
print(title)
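
parsel also accepts CSS selectors, so the same two extractions can be written without XPath; this equivalent sketch assumes the same page structure as the queries above:

# CSS equivalents of the two XPath queries
href = selector.css("div.novelslistss li span.s2 a::attr(href)").getall()
title = selector.css("div.novelslistss li span.s2 a::text").getall()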

3. Get each novel's chapters and their URLs

for book_url, book_title in zip(href, title):
    print(book_url)
    response = requests.get(book_url)
    data_html = response.text
    # Each chapter link on the book page looks like <dd><a rel="nofollow" href="...">chapter name</a></dd>
    url_list = re.findall('<dd><a rel="nofollow" href="(.*?)">(.*?)</a></dd>', data_html)
    print(url_list)
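
Since the book page is already being fetched as HTML, the chapter list can also be extracted with parsel instead of a regular expression. A sketch, assuming every chapter link carries rel="nofollow" exactly as in the regex above:

book_selector = parsel.Selector(data_html)
chapter_hrefs = book_selector.xpath('//dd/a[@rel="nofollow"]/@href').getall()
chapter_titles = book_selector.xpath('//dd/a[@rel="nofollow"]/text()').getall()
url_list = list(zip(chapter_hrefs, chapter_titles))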

4. Use with open to save the chapters to local files

    for chapter_href, chapter_title in url_list:
        # Chapter hrefs are relative, so prepend the book's URL
        chapter_url = book_url + chapter_href
        # print(chapter_url)

        response_1 = requests.get(chapter_url)
        text = response_1.text
        # The chapter body sits inside <div id="content" name="content"> ... </div>
        content = re.findall('<div id="content" name="content">(.*?)<p.*?</div>', text, re.S)[0]
        # Strip HTML entities, <br> tags and the site's ad line
        # (note: this ad string is specific to one novel, 深空彼岸)
        content = content.replace('&nbsp;', '').replace('<br />', '\n').replace('<br>', '').replace('笔趣阁 www.xbiquge.so,最快更新深空彼岸 !', '')
        with open('./txt/' + book_title + '/' + chapter_title + '.txt', mode='w', encoding='utf-8') as f:
            f.write(content)
        print(chapter_title, 'saved!')
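
Two optional hardening ideas for this step, both sketches rather than part of the original script. Plain concatenation only works while the chapter hrefs are relative and book_url ends with a slash; urllib.parse.urljoin handles both cases. And the hard-coded ad string above only matches one novel (深空彼岸); a regex can strip the ad line for any book, assuming the site always formats it the same way:

from urllib.parse import urljoin

chapter_url = urljoin(book_url, chapter_href)  # safe for relative or absolute hrefs

# Strip the "笔趣阁 www.xbiquge.so,最快更新<book name> !" ad line for any title
# (the pattern generalizes the single example seen in the article)
content = re.sub(r'笔趣阁 www\.xbiquge\.so,最快更新.*?!', '', content)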

Full code:

"""
@Name: biquwang02.py
@Auth: MyName
@Date: 2022-06-09-23:21
@Desc: 
@Ver : 0.0.0
"""
import requests
import re
import parsel
import os
import time

# Program timer: record the start time
start = time.perf_counter()

top_list = requests.get('https://www.xbiquge.so/top/allvisit/').text
# print(top_list)

# Parsing options: XPath, CSS selectors, or regular expressions.
# parsel.Selector wraps the HTML so any of them can be applied to it.

selector = parsel.Selector(top_list)
href = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/@href").getall()
# Match only spans whose class contains 's2'
# print(href)

title = selector.xpath("//div[@class='novelslistss']/li/span[contains(@class,'s2')]/a/text()").getall()
# print(title)

for book_url, book_title in zip(href, title):
    # print(book_url, book_title)
    print(f'--------------------------- Crawling {book_title} ---------------------------')
    # Create one folder per book under ./txt/
    if not os.path.exists('./txt/' + book_title):
        os.makedirs('./txt/' + book_title)
    response = requests.get(book_url)
    data_html = response.text
    # Chapter links look like <dd><a rel="nofollow" href="...">chapter name</a></dd>
    url_list = re.findall('<dd><a rel="nofollow" href="(.*?)">(.*?)</a></dd>', data_html)
    # print(url_list)
    for chapter_href, chapter_title in url_list:
        # Chapter hrefs are relative, so prepend the book's URL
        chapter_url = book_url + chapter_href
        # print(chapter_url)

        response_1 = requests.get(chapter_url)
        text = response_1.text
        content = re.findall('<div id="content" name="content">(.*?)<p.*?</div>', text, re.S)[0]
        # Strip HTML entities, <br> tags and the site's ad line
        # (note: this ad string is specific to one novel, 深空彼岸)
        content = content.replace('&nbsp;', '').replace('<br />', '\n').replace('<br>', '').replace('笔趣阁 www.xbiquge.so,最快更新深空彼岸 !', '')
        with open('./txt/' + book_title + '/' + chapter_title + '.txt', mode='w', encoding='utf-8') as f:
            f.write(content)
        print(chapter_title, 'saved!')

# Compute the elapsed time
end = time.perf_counter()
print('Elapsed time: %s seconds' % (end - start))
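
The script fires requests back to back, one per chapter. As an optional extension (not in the original article), a shared requests.Session plus a short pause between downloads is gentler on the site and reuses connections; the User-Agent value and the 0.5-second delay below are assumptions:

# Hedged sketch: one shared session with a polite delay between requests
# (requests and time are already imported above)
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # assumed value

def fetch(url):
    # GET a page with the shared session, fix the encoding, then pause briefly
    resp = session.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding
    time.sleep(0.5)  # assumed delay so the server isn't hammered
    return resp.text

Replacing each requests.get(...).text call in the script with fetch(...) leaves the rest of the logic unchanged.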