0
点赞
收藏
分享

微信扫一扫

Python爬取今日头条, 解决崔叔书上的那些事


入手了本崔叔爬虫的书, 书上一些项目代码, 可能在今天用不了, 毕竟网站啥的也不是一成不变的东西

教训:  在看书的过程中 得多关注一下作者的GitHub : ​​https://github.com/Python3WebSpider/Jiepai/issues​​

更改了save_image()和get_image()

添加了headers (记得换上自己的Cookie), 频繁运行代码会导致爬取失败(原因:频繁访问时, 今日头条会给出一个验证码, 跳验证码的模块还没出,到时候再更新)

                                                 Python爬取今日头条, 解决崔叔书上的那些事_3d

还有一个坑, 定义变量,文件夹,文件名字的时候尽量不要以库的名字命名,


  19/7/13 更改之后的代码

import requests
from urllib.parse import urlencode
import os
from hashlib import md5
import json
from multiprocessing.pool import Pool
import re

# HTTP headers attached to every request.  The Toutiao search API rejects
# anonymous callers, so a logged-in Cookie, a plausible Referer/User-Agent
# and the XHR marker are all required.
headers = {
# NOTE(review): raw string — the trailing \054 is a literal backslash
# sequence from the copied cookie, not an octal escape.  This cookie is
# session-bound and will expire; replace it with your own.
'Cookie':r'UM_distinctid=16be9d1faa4450-0419f190f582f7-3f71045b-15f900-16be9d1faa5aae; tt_webid=6713016542788961804; csrftoken=87e5484ca1d3293ce6984ed250fc6fdc; W2atIF=1; _ga=GA1.2.1603613204.1562996055; _gid=GA1.2.837554184.1562996055; odin_tt=2a8f77cb2f0ef60dbe58be561403f60efa10baa63567b9e9ee68daf881948ce9e82c871a469a945d24d6215f639b3068b41c538da9402cdbc9ee50173f497d01; toutiao_sso_user=7cbd6d2fb49e978a873b305ffb94cc93; login_flag=3d61c489019ff1a5fcdf456249d2f3ad; sessionid=ff2136e1c473a1713632798b79dfd05c; sid_tt=ff2136e1c473a1713632798b79dfd05c; __tasessionId=sc9lztan61563006768623; passport_auth_status=17ef67d4ccc7ba49ba37b75303d87112; sso_auth_status=fbbb246306efa457225647d08939234b%2C3fecd7ab9733631a48155a39de679dd0; sso_uid_tt=b73d5f1565c2577ebb0dbc4d6668b6da; uid_tt=324c5a1beb229e210fc61e9ec64237ee; sid_guard="ff2136e1c473a1713632798b79dfd05c|1563007075|15552000|Thu\054',
'Referer':'https://www.toutiao.com/search/?keyword=%E6%97%85%E6%B8%B8',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
}
# Remember to add your own Cookie here.

def get_page(offset, keyword='旅游'):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: pagination offset (the API pages in steps of 20).
        keyword: search term; defaults to '旅游' to preserve the
            original hard-coded behavior.

    Returns:
        The decoded JSON dict on HTTP 200, otherwise None.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException as e:
        # RequestException also covers Timeout — the request sets
        # timeout=2, but the original only caught ConnectionError, so a
        # slow response crashed the worker.  (Also fixes the "Erroe" typo.)
        print("Error", e.args)
    return None

def get_images(json_):
    """Yield {'image': url, 'title': title} dicts from one result page.

    Args:
        json_: parsed JSON dict returned by get_page(), or None
            (get_page's documented failure value — the original code
            raised AttributeError on it).

    Yields:
        One dict per image URL in each article's image_list.
    """
    if not json_ or not json_.get('data'):
        return
    for item in json_.get('data'):
        title = item.get('title')
        if title is None:
            # Untitled entries (ads, widgets) carry no usable images.
            continue
        # save_image() uses the title as a directory name, so strip tabs
        # plus characters that are illegal in paths: a '/' in a title
        # would otherwise create unintended nested directories.
        title = re.sub(r'[\t\\/:*?"<>|]', '', title)
        images = item.get('image_list')
        if images:
            for image in images:
                yield {
                    'image': image['url'],
                    'title': title,
                }

def save_image(item):
    """Download one image and store it as images/<title>/<md5>.jpg.

    Args:
        item: dict with 'image' (URL) and 'title' (folder name) keys,
            as yielded by get_images().
    """
    image_path = 'images' + os.path.sep + item.get('title')
    # exist_ok avoids the check-then-create race when several Pool
    # workers save images for the same title concurrently.
    os.makedirs(image_path, exist_ok=True)
    try:
        # The original had no timeout here, so a stalled download hung
        # the worker forever.
        response = requests.get(item.get('image'), timeout=10)
        if response.status_code == 200:
            # md5 of the content gives a stable, de-duplicating name.
            file_path = '{0}/{1}.{2}'.format(
                image_path, md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as file:
                    file.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.RequestException:
        # Covers ConnectionError and Timeout alike; skip this image.
        print('Failed to Save Image')


def main(offset):
    """Fetch the result page at *offset* and save every image it lists."""
    page = get_page(offset)
    for entry in get_images(page):
        save_image(entry)

# Inclusive page range to crawl: offsets 0, 20, ..., GROUP_END * 20.
GROUP_START = 0
GROUP_END = 9

if __name__ == '__main__':
    # Fan the pages out across worker *processes* (multiprocessing.Pool,
    # one task per offset); Pool() sizes itself to the CPU count.
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    print(offsets)
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()



举报

相关推荐

0 条评论