0
点赞
收藏
分享

微信扫一扫

Python爬取今日头条, 解决崔叔书上的那些事


入手了本崔叔爬虫的书, 书上一些项目代码, 可能在今天用不了, 毕竟网站啥的也不是一成不变的东西

教训:  在看书的过程中 得多关注一下作者的GitHub : ​​https://github.com/Python3WebSpider/Jiepai/issues​​

更改了save_image()和get_image()

添加了headers (记得换上自己的Cookie), 频繁运行代码会导致爬取失败(原因:频繁访问时, 今日头条会给出一个验证码, 跳验证码的模块还没出,到时候再更新)

                                                 Python爬取今日头条, 解决崔叔书上的那些事_3d

还有一个坑, 定义变量,文件夹,文件名字的时候尽量不要以库的名字命名,


  19/7/13 更改之后的代码

import requests
from urllib.parse import urlencode
import os
from hashlib import md5
import json
from multiprocessing.pool import Pool
import re

# HTTP headers attached to every request.  The Toutiao search API rejects
# anonymous callers, so a logged-in Cookie, a plausible Referer/User-Agent
# and the XHR marker are all required.
headers = {
# NOTE(review): raw string — the trailing \054 is a literal backslash
# sequence from the copied cookie, not an octal escape.  This cookie is
# session-bound and will expire; replace it with your own.
'Cookie':r'UM_distinctid=16be9d1faa4450-0419f190f582f7-3f71045b-15f900-16be9d1faa5aae; tt_webid=6713016542788961804; csrftoken=87e5484ca1d3293ce6984ed250fc6fdc; W2atIF=1; _ga=GA1.2.1603613204.1562996055; _gid=GA1.2.837554184.1562996055; odin_tt=2a8f77cb2f0ef60dbe58be561403f60efa10baa63567b9e9ee68daf881948ce9e82c871a469a945d24d6215f639b3068b41c538da9402cdbc9ee50173f497d01; toutiao_sso_user=7cbd6d2fb49e978a873b305ffb94cc93; login_flag=3d61c489019ff1a5fcdf456249d2f3ad; sessionid=ff2136e1c473a1713632798b79dfd05c; sid_tt=ff2136e1c473a1713632798b79dfd05c; __tasessionId=sc9lztan61563006768623; passport_auth_status=17ef67d4ccc7ba49ba37b75303d87112; sso_auth_status=fbbb246306efa457225647d08939234b%2C3fecd7ab9733631a48155a39de679dd0; sso_uid_tt=b73d5f1565c2577ebb0dbc4d6668b6da; uid_tt=324c5a1beb229e210fc61e9ec64237ee; sid_guard="ff2136e1c473a1713632798b79dfd05c|1563007075|15552000|Thu\054',
'Referer':'https://www.toutiao.com/search/?keyword=%E6%97%85%E6%B8%B8',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
}
# Remember to add your own Cookie here.

def get_page(offset, keyword='旅游'):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: pagination offset (the API pages in steps of 20).
        keyword: search term; defaults to '旅游' to preserve the
            original hard-coded behavior.

    Returns:
        The decoded JSON dict on HTTP 200, otherwise None.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException as e:
        # RequestException also covers Timeout — the request sets
        # timeout=2, but the original only caught ConnectionError, so a
        # slow response crashed the worker.  (Also fixes the "Erroe" typo.)
        print("Error", e.args)
    return None

def get_images(json_):
    """Yield {'image': url, 'title': title} dicts from one result page.

    Args:
        json_: parsed JSON dict returned by get_page(), or None
            (get_page's documented failure value — the original code
            raised AttributeError on it).

    Yields:
        One dict per image URL in each article's image_list.
    """
    if not json_ or not json_.get('data'):
        return
    for item in json_.get('data'):
        title = item.get('title')
        if title is None:
            # Untitled entries (ads, widgets) carry no usable images.
            continue
        # save_image() uses the title as a directory name, so strip tabs
        # plus characters that are illegal in paths: a '/' in a title
        # would otherwise create unintended nested directories.
        title = re.sub(r'[\t\\/:*?"<>|]', '', title)
        images = item.get('image_list')
        if images:
            for image in images:
                yield {
                    'image': image['url'],
                    'title': title,
                }

def save_image(item):
    """Download one image and store it as images/<title>/<md5>.jpg.

    Args:
        item: dict with 'image' (URL) and 'title' (folder name) keys,
            as yielded by get_images().
    """
    image_path = 'images' + os.path.sep + item.get('title')
    # exist_ok avoids the check-then-create race when several Pool
    # workers save images for the same title concurrently.
    os.makedirs(image_path, exist_ok=True)
    try:
        # The original had no timeout here, so a stalled download hung
        # the worker forever.
        response = requests.get(item.get('image'), timeout=10)
        if response.status_code == 200:
            # md5 of the content gives a stable, de-duplicating name.
            file_path = '{0}/{1}.{2}'.format(
                image_path, md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as file:
                    file.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.RequestException:
        # Covers ConnectionError and Timeout alike; skip this image.
        print('Failed to Save Image')


def main(offset):
    """Fetch the result page at *offset* and save every image it lists."""
    page = get_page(offset)
    for entry in get_images(page):
        save_image(entry)

# Inclusive page range to crawl: offsets 0, 20, ..., GROUP_END * 20.
GROUP_START = 0
GROUP_END = 9

if __name__ == '__main__':
    # Fan the pages out across worker *processes* (multiprocessing.Pool,
    # one task per offset); Pool() sizes itself to the CPU count.
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    print(offsets)
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()



举报

相关推荐

0 条评论