Scraping a Single Comic with Python and Downloading It to a Specified Folder
Motivation
Over the New Year holiday the network signal back home was poor, but I still wanted to read comics, so reading online was out of the question. I therefore wrote a scraper that grabs the comic images into a local folder for offline reading.
Preparation
Required modules: os, requests, urllib, time, BeautifulSoup.
os, urllib, and time ship with Python; run pip list in cmd to check whether requests and beautifulsoup4 are already installed, and install any missing package with pip install.
For example:
pip install requests
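BeautifulSoup is provided by the beautifulsoup4 package (imported as bs4):
pip install beautifulsoup4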
Source Code
import os
import time
import urllib.request

import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the InsecureRequestWarning raised because requests are sent with verify=False
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class hanman_img_spider(object):
    # Initialize the spider
    def __init__(self):
        # Index page of the comic to scrape
        self.url = 'https://www.****.top/book/334'
        self.Hurl = 'https://www.****.top'
        # Directory in which the images will be stored
        self.filename_path = './***/'
        # Request headers that mimic a browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50'
        }
    def get_soup(self, url):
        # Send a GET request to the target URL; requests returns a Response object
        resp = requests.get(url, verify=False, headers=self.headers, timeout=20)
        html = resp.content.decode('utf-8')
        # Build a BeautifulSoup object from the page source
        soup = BeautifulSoup(html, 'html.parser')
        return soup
    def save_img(self, url, name):
        # Download a single image with urllib, using a browser-like User-Agent
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(url, name)
    # Create the target folder if it does not exist yet
    def create_mkdir(self, filename_path):
        path = filename_path.strip()
        isExists = os.path.exists(path)
        if not isExists:
            print('Creating folder', path)
            os.makedirs(path)
            print('Created successfully!')
        else:
            print(path, 'already exists, not creating it again')
    def get_book_url(self):
        # Counter used to name the per-chapter folders
        file_id = 0
        soup_book_list = self.get_soup(self.url)
        for new in soup_book_list.select('.view-win-list'):
            if len(new.select('a')) > 0:
                # Collect every chapter link on the index page
                url_list = new.findAll('a')
                for j in url_list:
                    # Build the full chapter URL
                    url = self.Hurl + j['href']
                    # Alternatively, keep only the id after the last '/' and build the chapter URL from it:
                    # url_id = url.split('/')[-1]
                    # book_url = 'https://www.***.top/chapter/' + url_id
                    # Create a folder for this chapter
                    self.create_mkdir(self.filename_path + "第{}话".format(file_id))
                    soup_img = self.get_soup(url)
                    # Counter used to name the images within one chapter
                    x = 0
                    for img_list in soup_img.select('.comicpage'):
                        # Collect every image on the chapter page
                        img = img_list.findAll('img')
                        for i in img:
                            img_url = i['data-original']
                            # Print the image URL
                            print(img_url)
                            # Request the image to check that it is reachable
                            status = requests.get(img_url)
                            print('Image URL status code:', status.status_code)
                            # Break out of the loop unless the status code is 200, otherwise save the image
                            if status.status_code != 200:
                                break
                            else:
                                # Save the image
                                self.save_img(img_url, self.filename_path + "第{}话".format(file_id) + '/%s.jpg' % x)
                                x += 1
                                # Report progress
                                print('Downloading image %d' % x)
                    file_id += 1
                    # Pause between chapters to avoid hammering the server
                    time.sleep(5)
    def run(self):
        self.get_book_url()


if __name__ == '__main__':
    spider = hanman_img_spider()
    spider.run()
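Save the source as a .py file (the name hanman_spider.py below is just an example) and run it from cmd:

python hanman_spider.py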
Run Results
Improvements
It is advisable to give the scraper a longer request timeout, which helps avoid the various page errors that can otherwise be returned.
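One way to do this, shown as a minimal sketch rather than part of the original script, is to raise the timeout and let requests retry transient failures automatically. The 60-second timeout, the retry counts, and the status codes below are assumptions to tune for the target site; such a session could replace the plain requests.get call in get_soup.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 3 times on common transient server errors, with exponential backoff
retries = Retry(total=3, backoff_factor=2, status_forcelist=[500, 502, 503, 504])

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retries))
session.mount('http://', HTTPAdapter(max_retries=retries))

# A longer timeout (60 s here, an assumed value) gives slow pages time to respond
resp = session.get('https://www.****.top/book/334', timeout=60, verify=False)
print(resp.status_code)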