Scraping a Single Comic with Python and Downloading It to a Specified Folder

汤姆torn · 2022-02-17


Motivation

Over the New Year holiday the network signal back home was poor, but I still wanted to read comics. Reading online was therefore not an option, so I wrote a crawler to grab the comic images to my local machine for offline reading.

Preparation

Required packages: os, requests, urllib, time, and BeautifulSoup.
First run pip list in cmd to check whether they are already installed; if any are missing, install them with pip install.
For example:
pip install requests
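
Of the packages above, os, urllib, and time are part of the Python standard library, so only the third-party ones need installing. Both can be pulled in with a single command (BeautifulSoup is published on PyPI as beautifulsoup4):

pip install requests beautifulsoup4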

Source Code

import os
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

class hanman_img_spider(object):
    # Initialise the spider's configuration
    def __init__(self):
        # URL of the comic's chapter-list page
        self.url = 'https://www.****.top/book/334'
        self.Hurl = 'https://www.****.top'
        # Directory where the downloaded images will be stored
        self.filename_path = './***/'
        # Request headers that mimic a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50'
        }

    def get_soup(self, url):
        # Send a GET request to the target URL and get back a Response object
        resp = requests.get(url, verify=False, headers=self.headers, timeout=20)
        # Decode the response body as UTF-8
        html = resp.content.decode('utf-8')
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    def save_img(self, url, name):
        # urlretrieve sends its own request, so install an opener carrying the same browser User-Agent
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(url, name)

    # Create the target folder if it does not exist yet
    def create_mkdir(self, filename_path):
        path = filename_path.strip()
        isExists = os.path.exists(filename_path)
        if not isExists:
            print('Creating a folder named', filename_path)
            os.makedirs(path)
            print('Folder created!')
        else:
            print(path, 'already exists, skipping creation')

    def get_book_url(self):
        # Chapter counter used to name the per-chapter folders, starting at 0
        file_id = 0
        soup_book_list = self.get_soup(self.url)
        for new in soup_book_list.select('.view-win-list'):
            if len(new.select('a')) > 0:
                # Collect all chapter links
                url_list = new.findAll('a')
                for j in url_list:
                    # Build the full chapter URL
                    url = self.Hurl + j['href']
                    # Alternative: take the chapter id after the last '/' in the URL
                    # url_id = url.split('/')[-1]
                    # and rebuild the chapter URL from it
                    # book_url = 'https://www.***.top/chapter/'+url_id
                    # Create a folder for this chapter
                    self.create_mkdir(self.filename_path + "第{}话".format(file_id))
                    soup_img = self.get_soup(url)
                    # Image counter used to name the files, starting at 0
                    x = 0
                    for img_list in soup_img.select('.comicpage'):
                        # Collect all image tags on the page
                        img = img_list.findAll('img')
                        for i in img:
                            # The real image URL sits in the lazy-load attribute data-original
                            img_url = i['data-original']
                            # Print the image URL
                            print(img_url)
                            # Request the image to make sure it is reachable
                            status = requests.get(img_url, headers=self.headers, verify=False, timeout=20)
                            # Print the status code
                            print('Image URL status code:', status.status_code)
                            # Break out of the loop if the status code is not 200, otherwise save the image
                            if status.status_code != 200:
                                break
                            else:
                                # Save the image
                                self.save_img(img_url, self.filename_path + "第{}话".format(file_id) + '/%s.jpg' % x)
                            x += 1
                            # Report which image was just downloaded
                            print('正在下载第%d张' % x)
                    file_id += 1
                    # Pause between chapters to avoid hammering the server
                    time.sleep(5)

    def run(self):
        self.get_book_url()


if __name__ == '__main__':
    spider = hanman_img_spider()
    spider.run()
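
As a side note, the image download could also be done with requests instead of urllib, reusing the headers the class already defines. A minimal sketch under that assumption (save_img_requests is a hypothetical name, not part of the script above):

    def save_img_requests(self, url, name):
        # Hypothetical alternative: fetch the image with the same browser headers and write the raw bytes to disk
        resp = requests.get(url, headers=self.headers, verify=False, timeout=20)
        with open(name, 'wb') as f:
            f.write(resp.content)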

Results

(Screenshot of the run results not reproduced here.)

Improvements

When scraping, it is advisable to set a fairly long request timeout so the site does not keep returning page errors; a rough sketch follows below.
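
A minimal sketch of such a request, assuming a longer timeout plus a few automatic retries via urllib3's Retry class (urllib3 ships as a dependency of requests); the retry parameters and the 60-second timeout are only illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 3 times with exponential backoff when the server returns a 5xx error
retries = Retry(total=3, backoff_factor=2, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
# A generous timeout avoids spurious failures on a slow connection
resp = session.get('https://www.****.top/book/334', timeout=60, verify=False)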
