Development environment: Python 3.9, IDEA
Compared with a thread pool version this is roughly 50% faster; on the campus network's 100 Mbps connection, crawling one page of 24 images takes about 60 s.
By default the code only downloads the images on the first page; to crawl more, just add a loop over the page number (see the sketch after the full listing).
The complete code is below, for reference only.
import os
import time
import aiohttp
import asyncio
import requests
import aiofiles
from lxml import etree
async def aiodownload(li):
    # For one <li> of the toplist: request the detail page, extract the
    # full-size image URL, download the image and save it under img/.
    # aiohttp: "await resp.content.read()" is the counterpart of requests' "resp.content"
    async with aiohttp.ClientSession() as session:
        sub_url = "".join(li.xpath('./figure/a/@href'))
        sub_resp = requests.get(sub_url, headers=headers)  # fetch the detail page (requests is synchronous, so this call blocks the event loop)
        if len(sub_resp.text) < 1000:  # crude 429 (rate-limit) workaround :(
            await asyncio.sleep(3)
            sub_resp = requests.get(sub_url, headers=headers)  # retry once after backing off
        sub_html = etree.HTML(sub_resp.text)
        img_url = "".join(sub_html.xpath('/html/body/main/section/div[1]/img/@src'))  # direct download link of the image
        async with session.get(img_url, headers=headers) as img_resp:
            img_name = img_url.split('/')[-1]
            async with aiofiles.open("img/" + img_name, mode="wb") as f:
                await f.write(await img_resp.content.read())
                print("Downloaded", img_name)
    await asyncio.sleep(0)  # yield control back to the event loop
async def main():
    url = "https://wallhaven.cc/toplist?page=1"
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # grab the group of image entries (one <li> per thumbnail)
    lis = html.xpath('/html/body/main/div[1]/section[1]/ul/li')
    tasks = []
    for li in lis:
        tasks.append(asyncio.create_task(aiodownload(li)))  # wrap each coroutine in a task and collect it
    await asyncio.wait(tasks)
if __name__ == '__main__':
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
}
t1 = time.time()
print("开始下载...")
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
print("下载完成...")
t2 = time.time()
print('运行时间:', t2 - t1)
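As mentioned above, only the first page of the toplist is crawled. A minimal sketch of a multi-page version of main(), as a drop-in replacement in the script above; the range of pages 1..3 is just an example, adjust it as needed.

async def main():
    tasks = []
    for page in range(1, 4):  # pages 1..3 of the toplist; change the range to crawl more
        url = f"https://wallhaven.cc/toplist?page={page}"
        resp = requests.get(url, headers=headers)
        html = etree.HTML(resp.text)
        lis = html.xpath('/html/body/main/div[1]/section[1]/ul/li')
        for li in lis:
            tasks.append(asyncio.create_task(aiodownload(li)))
    await asyncio.wait(tasks)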
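Note that the detail page inside aiodownload is still fetched with requests, which is synchronous and blocks the event loop, so only the image downloads really overlap, and the len(...) < 1000 test only guesses at a rate-limit response. Below is a sketch of a fully asynchronous variant that fetches the detail page through the same aiohttp session and retries when the server answers with HTTP 429; it assumes wallhaven actually signals rate limiting with a 429 status code, and the retry count and back-off times are arbitrary.

async def aiodownload(li):
    sub_url = "".join(li.xpath('./figure/a/@href'))
    async with aiohttp.ClientSession() as session:
        # fetch the detail page asynchronously, backing off and retrying on HTTP 429
        sub_text = ""
        for attempt in range(3):
            async with session.get(sub_url, headers=headers) as sub_resp:
                if sub_resp.status == 429:  # rate limited: wait a bit longer each time
                    await asyncio.sleep(3 * (attempt + 1))
                    continue
                sub_text = await sub_resp.text()
                break
        if not sub_text:
            print("Giving up on", sub_url)
            return
        sub_html = etree.HTML(sub_text)
        img_url = "".join(sub_html.xpath('/html/body/main/section/div[1]/img/@src'))
        async with session.get(img_url, headers=headers) as img_resp:
            img_name = img_url.split('/')[-1]
            async with aiofiles.open("img/" + img_name, mode="wb") as f:
                await f.write(await img_resp.content.read())
        print("Downloaded", img_name)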
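Another way to avoid getting rate-limited in the first place is to cap how many downloads run at once with an asyncio.Semaphore. The wrapper name limited_download and the limit of 5 below are made up for illustration.

async def limited_download(sem, li):
    # acquire the semaphore before downloading, so only a few coroutines
    # hit the site at the same time
    async with sem:
        await aiodownload(li)

# inside main():
#     sem = asyncio.Semaphore(5)  # at most 5 downloads in flight (arbitrary limit)
#     tasks.append(asyncio.create_task(limited_download(sem, li)))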