Long-time followers probably already know that this humble scrub has written a few Python scrapers. My code is about as good as the junk a scavenger drags home, and just as lousy, but that doesn't stop me from showing off!
Lately I've been posting daily DIY PC build articles; I bet those old followers have all unfollowed by now?!
It's been ages since I wrote a scraper, so here's a quick one to prove I'm still around!
It's roughly written, so bear with it; every one I write is one fewer left in me?!
Target site: https://www.cdec.org.cn/
Scraping results:
Scraping log:
Key data-extraction source code:
html = response.content.decode('utf-8')
tree = etree.HTML(html)
imgs = tree.xpath('//div[@class="worksList"]/a[@class="item"]/img/@data-funlazy')
names = tree.xpath('//div[@class="worksList"]/a[@class="item"]/div[@class="info"]/div[@class="fnt_16"]/text()')
colleges = tree.xpath('//div[@class="worksList"]/a[@class="item"]/div[@class="info"]/div[@class="college fnt_16"]/text()')
awards = re.findall(r'<div class="line"></div>\n<div class="desc fnt_14">(.+?)</div>\n<div class="desc fnt_14">', html, re.S)
designs = re.findall(r'<div class="desc fnt_14">.+?</div>\n<div class="desc fnt_14">(.+?)</div>', html, re.S)
for img, name, college, award, design in zip(imgs, names, colleges, awards, designs):
    img_name = f'{name}-{college}-{award}-{design}'
    print(img, img_name)
I used the lxml (etree) library together with re, mainly because two of the nodes share the same class name. I fiddled with it for a long time without finding a clean way to handle that, so I ended up using two different libraries to pull out the text! One possible pure-lxml alternative is sketched below.
If you have a better way to extract this data, feel free to share it in the comments!
Thanks!
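For reference, one way around the duplicated class name is to iterate over each work card and rely on the position of the two same-class divs, so everything comes from lxml alone. This is only a rough sketch, untested against the live page, and it assumes each <a class="item"> card holds its two <div class="desc fnt_14"> nodes in award-then-design order:

# Sketch: per-card extraction with lxml alone, no regex.
# Assumes the two same-class "desc fnt_14" divs are ordered award, then design.
for item in tree.xpath('//div[@class="worksList"]/a[@class="item"]'):
    img = item.xpath('./img/@data-funlazy')
    name = item.xpath('.//div[@class="fnt_16"]/text()')
    college = item.xpath('.//div[@class="college fnt_16"]/text()')
    descs = item.xpath('.//div[@class="desc fnt_14"]/text()')
    award = descs[0] if len(descs) > 0 else ''
    design = descs[1] if len(descs) > 1 else ''
    if img and name and college:
        print(img[0], f'{name[0]}-{college[0]}-{award}-{design}')

Working per card also keeps each image paired with its own text, so a missing field on one work can't shift the whole zip() out of alignment.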
Full source code:
# Competition image scraper
# -*- coding: UTF-8 -*-
# WeChat public account: Python与SEO学习
import logging
import random
import re
import time

import requests
from lxml import etree

# Basic logging configuration
logging.basicConfig(
    filename='access.log',
    format='%(asctime)s - %(name)s - %(levelname)s - %(module)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    level=logging.DEBUG,  # level=10 is DEBUG
)


class Dtb(object):
    def __init__(self):
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        ]

    def get_ua(self):
        # Return one random User-Agent string from the pool
        return random.choice(self.ua_list)

    def get_data(self, page):
        url = f'https://www.cdec.org.cn/winningWorks/13447?pageNo={page}'
        print(f">> Scraping list page {page}..")
        logging.info(f">> Scraping list page {page}..")
        headers = {"User-Agent": self.get_ua()}
        response = requests.get(url=url, headers=headers, timeout=6)
        html = response.content.decode('utf-8')
        tree = etree.HTML(html)
        imgs = tree.xpath('//div[@class="worksList"]/a[@class="item"]/img/@data-funlazy')
        names = tree.xpath('//div[@class="worksList"]/a[@class="item"]/div[@class="info"]/div[@class="fnt_16"]/text()')
        colleges = tree.xpath('//div[@class="worksList"]/a[@class="item"]/div[@class="info"]/div[@class="college fnt_16"]/text()')
        awards = re.findall(r'<div class="line"></div>\n<div class="desc fnt_14">(.+?)</div>\n<div class="desc fnt_14">', html, re.S)
        designs = re.findall(r'<div class="desc fnt_14">.+?</div>\n<div class="desc fnt_14">(.+?)</div>', html, re.S)
        for img, name, college, award, design in zip(imgs, names, colleges, awards, designs):
            img_name = f'{name}-{college}-{award}-{design}'
            print(img, img_name)
            self.down_img(img, img_name)

    def down_img(self, img, img_name):
        print(f">> Starting download of image file: {img_name}")
        logging.info(f">> Starting download of image: {img_name}")
        r = self.get_resp(img)
        ext = img.split('.')[-1]  # reuse the original file extension
        img_name = f'{img_name}.{ext}'
        with open(img_name, 'wb') as f:
            f.write(r.content)
        print(f"Finished downloading image file: {img_name}!")
        logging.info(f"Finished downloading image file: {img_name}!")
        time.sleep(2)

    # Up to 3 retries after the first attempt; returns None if all fail
    def get_resp(self, url):
        i = 0
        while i < 4:
            try:
                response = self.get_response(url, timeout=10)
                # print(response.status_code)
                return response
            except requests.exceptions.RequestException:
                i += 1
                print(f">> Request failed, retrying in {i * 2}s (attempt {i})")
                logging.error(f">> {url} --- request failed, retrying in {i * 2}s (attempt {i})")
                time.sleep(i * 2)

    def get_response(self, url, timeout):
        # get_ua() already returns a single UA string; wrapping it in another
        # random.choice() would pick a single character out of it
        headers = {'User-Agent': self.get_ua()}
        response = requests.get(url=url, headers=headers, timeout=timeout)
        return response

    def main(self):
        pagenum = 63
        for page in range(1, pagenum + 1):
            self.get_data(page)
            time.sleep(8)


if __name__ == '__main__':
    spider = Dtb()
    spider.main()
Problems encountered:
The image URL could not be retrieved
The image no longer exists on the original page
Workaround: use a simple if statement to skip any work that has no image
print(img, img_name)
print(type(img))
if str(img) == '[]':
    print("!! Image missing !! Skipping!")
else:
    self.down_img(img, img_name)
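Side note: if img here is the raw per-card XPath result, an empty list is already falsy in Python, so a plain truthiness test reads a little cleaner than comparing against the string '[]'. A small variant under that same assumption:

# Variant of the skip: an empty XPath result list is falsy,
# so "if not img" replaces the str(img) == '[]' comparison.
if not img:
    print("!! Image missing !! Skipping!")
else:
    self.down_img(img[0], img_name)  # img[0] is the URL when img is the raw result list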