Scraping a Single Comic with Python and Downloading It to a Specified Folder
Motivation
Over the New Year holiday the network signal back home was poor, but I still wanted to read comics, so reading online was out of the question. I therefore wrote a scraper that grabs the comic images into a local folder for offline reading.
Preparation
Required modules: os, requests, urllib, time, BeautifulSoup.
os, urllib, and time ship with Python; run pip list in cmd to check whether requests and beautifulsoup4 are already installed, and install any missing package with pip install.
For example:
pip install requests
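BeautifulSoup is provided by the beautifulsoup4 package (imported as bs4):
pip install beautifulsoup4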
Source Code
import os
import time
import urllib.request

import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the InsecureRequestWarning raised because requests are sent with verify=False
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class hanman_img_spider(object):
    # Initialize the spider
    def __init__(self):
        # Index page of the comic to scrape
        self.url = 'https://www.****.top/book/334'
        self.Hurl = 'https://www.****.top'
        # Directory in which the images will be stored
        self.filename_path = './***/'
        # Request headers that mimic a browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50'
        }
    def get_soup(self, url):
        # Send a GET request to the target URL; requests returns a Response object
        resp = requests.get(url, verify=False, headers=self.headers, timeout=20)
        html = resp.content.decode('utf-8')
        # Build a BeautifulSoup object from the page source
        soup = BeautifulSoup(html, 'html.parser')
        return soup
    def save_img(self, url, name):
        # Download a single image with urllib, using a browser-like User-Agent
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(url, name)
    # Create the target folder if it does not exist yet
    def create_mkdir(self, filename_path):
        path = filename_path.strip()
        isExists = os.path.exists(path)
        if not isExists:
            print('Creating folder', path)
            os.makedirs(path)
            print('Created successfully!')
        else:
            print(path, 'already exists, not creating it again')
    def get_book_url(self):
        # Counter used to name the per-chapter folders
        file_id = 0
        soup_book_list = self.get_soup(self.url)
        for new in soup_book_list.select('.view-win-list'):
            if len(new.select('a')) > 0:
                # Collect every chapter link on the index page
                url_list = new.findAll('a')
                for j in url_list:
                    # Build the full chapter URL
                    url = self.Hurl + j['href']
                    # Alternatively, keep only the id after the last '/' and build the chapter URL from it:
                    # url_id = url.split('/')[-1]
                    # book_url = 'https://www.***.top/chapter/' + url_id
                    # Create a folder for this chapter
                    self.create_mkdir(self.filename_path + "第{}话".format(file_id))
                    soup_img = self.get_soup(url)
                    # Counter used to name the images within one chapter
                    x = 0
                    for img_list in soup_img.select('.comicpage'):
                        # Collect every image on the chapter page
                        img = img_list.findAll('img')
                        for i in img:
                            img_url = i['data-original']
                            # Print the image URL
                            print(img_url)
                            # Request the image to check that it is reachable
                            status = requests.get(img_url)
                            print('Image URL status code:', status.status_code)
                            # Break out of the loop unless the status code is 200, otherwise save the image
                            if status.status_code != 200:
                                break
                            else:
                                # Save the image
                                self.save_img(img_url, self.filename_path + "第{}话".format(file_id) + '/%s.jpg' % x)
                                x += 1
                                # Report progress
                                print('Downloading image %d' % x)
                    file_id += 1
                    # Pause between chapters to avoid hammering the server
                    time.sleep(5)
    def run(self):
        self.get_book_url()


if __name__ == '__main__':
    spider = hanman_img_spider()
    spider.run()
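Save the source as a .py file (the name hanman_spider.py below is just an example) and run it from cmd:

python hanman_spider.py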
Run Results
Improvements
It is advisable to give the scraper a longer request timeout, which helps avoid the various page errors that can otherwise be returned.
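One way to do this, shown as a minimal sketch rather than part of the original script, is to raise the timeout and let requests retry transient failures automatically. The 60-second timeout, the retry counts, and the status codes below are assumptions to tune for the target site; such a session could replace the plain requests.get call in get_soup.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 3 times on common transient server errors, with exponential backoff
retries = Retry(total=3, backoff_factor=2, status_forcelist=[500, 502, 503, 504])

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retries))
session.mount('http://', HTTPAdapter(max_retries=retries))

# A longer timeout (60 s here, an assumed value) gives slow pages time to respond
resp = session.get('https://www.****.top/book/334', timeout=60, verify=False)
print(resp.status_code)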