0
点赞
收藏
分享

微信扫一扫

python学习之-加密字体反扒

东言肆语 2023-04-17 阅读 105

# -*- coding: utf-8 -*-

'''
# 获取实习僧招聘信息
# (https://www.shixiseng.com/interns?page=2&type=intern&keyword=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&area=&months=&days=&degree=&official=&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%85%A8%E5%9B%BD&internExtend=)
# 获取前5页全部数据存储csv文件
'''
import csv
import io
import json
import re

import crawles
import fontTools.ttx
import requests
from fontTools.ttLib import TTFont

# Search API endpoint (v2) of shixiseng.com.
url = 'https://www.shixiseng.com/app/interns/search/v2'

# Session cookies captured from a browser visit — analytics/tracking
# tokens; they may need refreshing when the session expires.
cookies = dict(
    utm_source_first='PC',
    utm_source='PC',
    utm_campaign='PC',
    Hm_lvt_03465902f492a43ee3eb3543d81eba55='1681560188',
    RANGERS_WEB_ID='7147610799707866638',
    RANGERS_SAMPLE='0.31832347309583',
    adCloseOpen='true',
    position='pc_search_syss',
    Hm_lpvt_03465902f492a43ee3eb3543d81eba55='1681560491',
)

# Request headers replayed from a real Chrome 109 session so the API
# treats us like the in-browser XHR (referer + sec-fetch-* matter here).
headers = {
    'authority': 'www.shixiseng.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'content-type': 'application/x-www-form-urlencoded',
    'pragma': 'no-cache',
    # Referer must point at the search results page the XHR originates from.
    'referer': 'https://www.shixiseng.com/interns?keyword=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&city=%E5%85%A8%E5%9B%BD&type=intern',
    # NOTE(review): '\\' looks like the quoted brand list was mangled when
    # this snippet was copied from the blog — confirm against a live
    # browser capture before relying on these two values.
    'sec-ch-ua': '\\',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '\\',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# Base query-string parameters for the search API; keyword is fixed to
# "数据挖掘" (data mining) across the whole country, all salary ranges.
params = {
    'build_time': '1681560501489',
    # NOTE(review): this literal '{page}' is never substituted anywhere —
    # sxs_get() sends it verbatim, so every request fetches the same page.
    # The page number should be injected per request.
    'page': '{page}',
    'type': 'intern',
    'keyword': '数据挖掘',
    'area': '',
    'months': '',
    'days': '',
    'degree': '',
    'official': '',
    'enterprise': '',
    'salary': '-0',
    'publishTime': '',
    'sortType': '',
    'city': '全国',
    'internExtend': '',
}
# Output CSV shared by sxs_get() via the module-level writer.
# BUG FIX: the original opened with encoding='gbk', which raises
# UnicodeEncodeError for any scraped character outside the GBK repertoire
# (rare glyphs, emoji in descriptions). 'utf-8-sig' encodes everything and
# the BOM keeps the file opening correctly in Excel. Mode 'w+' also
# granted an unused read capability — plain 'w' is sufficient.
csv_file = open('sxs_info.csv', 'w', encoding='utf-8-sig', newline='')
csv_f = csv.writer(csv_file)
csv_f.writerow(['岗位', '日薪', '行业', '公司名称', '地点时间', '工作描述', '内容描述'])


# Fetch one result page, undo the icon-font obfuscation, print and save.
def sxs_get(page):
    """Fetch result page *page* from the search API, decode the
    custom-font obfuscated characters in the JSON payload, print each
    posting and append it to the module-level CSV writer ``csv_f``.

    Uses module globals: url, headers, params, cookies, csv_f.
    """
    # BUG FIX: the original passed `params` unchanged, so the server always
    # received the literal string '{page}' — the `page` argument was unused
    # and all five "pages" fetched identical data. Override it per call.
    response = crawles.get(url, headers=headers,
                           params={**params, 'page': str(page)},
                           cookies=cookies)
    text = response.text

    # The site ships a per-session icon font whose private-use code points
    # stand in for digits and common characters; fetch the current one.
    font_url = 'https://www.shixiseng.com/interns/iconfonts/file'
    ttf = TTFont(io.BytesIO(requests.get(font_url).content))

    # BUG FIX: dump the freshly downloaded font to XML in memory instead of
    # reading a pre-saved 'sxs.xml' from disk — the original crashed on a
    # first run (its saveXML call was commented out) and could decode with
    # a stale mapping if the font had rotated since the file was written.
    with io.StringIO() as xml_buf:
        ttf.saveXML(xml_buf)
        font_xml = xml_buf.getvalue()

    # cmap entries: hex code point -> glyph name, e.g. ('e283', 'uni30')
    grapy = re.findall(r'<map code="0x(.*?)" name="(.*?)"/>', font_xml)
    # glyph order: glyph id -> glyph name, e.g. ('2', 'uni30')
    sequence = re.findall(r'<GlyphID id="(\d+)" name="(\w+)"/>', font_xml)
    sequence_dict = {name: gid for gid, name in sequence}  # name -> glyph id

    # Character table aligned with glyph ids: glyph id i renders str_list[i].
    str_data = '0123456789一师x会四计财场DHLPT聘招工d周L端p年hx设程二五天tXG前KO网SWcgkosw广市月个BF告NRVZ作bfjnrvz三互生人政AJEI件M行QUYaeim软qu银y联'
    str_list = [' ', ''] + list(str_data)  # id 0 = .notdef, id 1 = blank glyph

    # Decode chain: '&#xe283' -> glyph name (e.g. uni30) -> glyph id -> char.
    # (The original built a throwaway {code: name} dict first — dead code.)
    grapy_dict = {f'&#x{code}': str_list[int(sequence_dict[name])]
                  for code, name in grapy}

    # Substitute every obfuscated HTML entity in the raw response body,
    # then the payload is plain JSON with readable values.
    for entity, char in grapy_dict.items():
        text = text.replace(entity, char)

    data_list = json.loads(text)['msg']['data']
    for i, d in enumerate(data_list):
        # FIX: removed the stray quotes around 天 and added the missing
        # comma after the company name in the original format string.
        print(i + 1, f"岗位:{d['name']},日薪:{d['minsal']}-{d['maxsal']}/天,"
                     f"行业:{d['industry']}/{d['scale']},公司名称:{d['cname']},"
                     f"地点时间:{d['city']}|{d['day']}天/周|{d['month_num']}个月,"
                     f"描述1:{d['i_tags']}, 描述2:{d['c_tags']}")
        row = (d['name'],
               d['minsal'] + '-' + d['maxsal'] + '/天',
               d['industry'] + '/' + d['scale'],
               d['cname'],
               d['city'] + '|' + d['day'] + '天/周' + '|' + d['month_num'] + '个月',
               d['i_tags'],
               d['c_tags'])
        csv_f.writerow(row)

# Crawl the first five result pages, then release the CSV handle.
PAGE_COUNT = 5
for page_no in range(1, PAGE_COUNT + 1):
    print(f'爬取第{page_no}页...')
    print('-' * 50)
    sxs_get(page_no)
csv_file.close()

举报

相关推荐

0 条评论