0
点赞
收藏
分享

微信扫一扫

Python抓取微信公众号文章

西特张 2021-09-28 阅读 53
技术

公众号

  • 灏泽异谈

文章列表链接

使用 Charles 抓包工具分析公众号请求

  • 如图


找有用信息

  • 完整URL请求地址
  • 完整的请求头(headers)信息,其中包括 Cookie、User-Agent、Host 等字段。
  • 因为 requests.get 方法里面的 headers 参数必须是字典对象,所以,先要写个函数把刚刚拷贝的字符串转换成字典对象。
def headers_to_dict(headers):
    """Convert a raw HTTP-header string into a dict.

    Example input (one ``Key: value`` pair per line)::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=

    becomes::

        {"Host": "mp.weixin.qq.com",
         "Connection": "keep-alive",
         "Cache-Control": "max-age="}

    :param headers: str, header text copied from a capture tool (e.g. Charles)
    :return: dict mapping header names to values
    """
    d_headers = dict()
    # splitlines() also handles "\r\n" endings, so values copied from a
    # Windows clipboard don't keep a trailing "\r".
    for line in headers.splitlines():
        line = line.strip()
        # Skip blank lines and lines without a separator (e.g. a stray
        # request line) instead of raising ValueError on unpacking.
        if not line or ":" not in line:
            continue
        key, value = line.split(":", 1)
        # Strip the key too — pasted headers often carry stray spaces.
        d_headers[key.strip()] = value.strip()
    return d_headers

完整源代码

import requests

# -*- coding: utf-8 -*-
# NOTE(review): per PEP 263 a coding declaration only takes effect on the
# first or second line of a file, so this cookie is inert here — Python 3
# already defaults to UTF-8 source encoding.
__author__ = "zoranlee"
def headers_to_dict(headers):
    """Convert a raw HTTP-header string into a dict.

    Example input (one ``Key: value`` pair per line)::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=

    becomes::

        {"Host": "mp.weixin.qq.com",
         "Connection": "keep-alive",
         "Cache-Control": "max-age="}

    :param headers: str, header text copied from a capture tool (e.g. Charles)
    :return: dict mapping header names to values
    """
    d_headers = dict()
    # splitlines() also handles "\r\n" endings, so values copied from a
    # Windows clipboard don't keep a trailing "\r".
    for line in headers.splitlines():
        line = line.strip()
        # Skip blank lines and lines without a separator (e.g. a stray
        # request line) instead of raising ValueError on unpacking.
        if not line or ":" not in line:
            continue
        key, value = line.split(":", 1)
        # Strip the key too — pasted headers often carry stray spaces.
        d_headers[key.strip()] = value.strip()
    return d_headers

# Extract the embedded article data from the page source.
def extract_data(html_content):
    """Extract the history-article list embedded in the homepage HTML.

    The page inlines a JSON blob of the form ``data={...};`` followed by a
    newline; this captures the blob, HTML-unescapes it, and parses it.

    :param html_content: page source (str)
    :return: list of article dicts; empty list when no blob is found or
             the blob cannot be parsed
    """
    import re
    import html
    import json
    # Capture from the opening brace up to and including the first newline;
    # the final two characters (";" + "\n") are trimmed below.  Raw string
    # so the pattern survives stricter escape handling in newer Pythons.
    rex = r"data=({.*?\n)"
    pattern = re.compile(pattern=rex, flags=re.S)
    match = pattern.search(html_content)
    if not match:
        # Fail soft with an empty list instead of returning None implicitly.
        return []
    data = match.group(1)
    # The page HTML-escapes quotes etc. inside the inlined JSON.
    data = html.unescape(data)
    # Trim the trailing statement terminator and newline.
    data = data[:-2]
    try:
        data = json.loads(data)
    except ValueError:
        # Layout changed or blob truncated — don't crash the crawler.
        return []
    # Guard against a missing key: the original `for item in None` raised
    # a TypeError when "appmsg_list" was absent.
    articles = data.get("appmsg_list") or []
    for item in articles:
        print(item)
    return articles

def crawl():
    """Download the public-account homepage and extract its article list.

    Side effects: saves the raw response body to ``weixin_history.html``
    (so the extraction regex can be debugged offline) and prints the
    parsed articles to stdout.
    """
    url = "https://mp.weixin.qq.com/mp/homepage?__biz=MzI4OTUyODgwMQ==&hid=1&sn=2324eec706f1b6ceb8f8b2a1e35671ee&scene=18&devicetype=iOS13.6.1&version=17001127&lang=zh_CN&nettype=WIFI&ascene=7&session_us=gh_e340a4c9f6df&fontScale=100&pass_ticket=Gd7oyTKM6dlbkNgUH3qtICelGKGOz2qQ8S56kql%2FhvSnK5zySYhIP2UsniJPiTox&wx_header=1"
    # Headers copied verbatim from a captured session (see Charles trace).
    raw_headers = """
accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
x-wechat-uin:MTI0MzQ2NQ%3D%3D
user-agent:Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.17(0x17001127) NetType/WIFI Language/zh_CN
accept-language:zh-cn
accept-encoding:gzip, deflate, br
"""
    # NOTE(review): verify=False disables TLS certificate checks — kept to
    # match the captured session, but confirm this is intentional.
    response = requests.get(url, headers=headers_to_dict(raw_headers), verify=False)
    with open("weixin_history.html", "w", encoding="utf-8") as fp:
        fp.write(response.text)
    print(extract_data(response.text))


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    crawl()

举报

相关推荐

0 条评论