文章目录
前言
破解:通过AES对列表页内详情页链接进行加密的反爬手段
一、反爬手段
详情页链接加密
跳转干扰调试
二、分析流程
页面原链接:
http://ggzy.zwfwb.tj.gov.cn:80/jyxxcggg/985325.jhtml
跳转后链接: http://ggzy.zwfwb.tj.gov.cn/jyxxcggg/1jcrAk61aP9HW3VSnwAuKw.jhtml
分析:
1. 点击会触发页面跳转,因此先找到全局事件监听中的“click 事件”。
2. 打开对应的 js 文件,找到加密的入口和出口。
3. 分析加密逻辑,把代码抠出来放到 node.js 上跑;跑通后再梳理逻辑,转成 python 来跑。
三、具体流程
1.查看事件监听器,进入click事件
2.通过分析确定是目标加密内容(有window.open函数)
3.对加密过程进行分析,加以注释
// Click handler bound to every <a>: rewrites a numeric ".jhtml" detail link
// into its AES-encrypted form before navigating, and always cancels the
// default navigation by returning false.
$("a").bind('click', function() {
// Read the original link from the anchor's href attribute.
var hh = $(this).attr("href");
// If href is missing or is just '#', fall back to the "url" attribute;
// if that is unusable as well, do nothing.
if (typeof (hh) == 'undefined' || hh == '#') {
hh = $(this).attr("url");
if (typeof (hh) == 'undefined' || hh == '#') {
return
}
}
// Split the link into path segments.
var aa = hh.split("/");
// Number of segments.
var aaa = aa.length;
// Split the last segment ("<id>.<ext>") into its id and extension parts.
var bbb = aa[aaa - 1].split('.');
var ccc = bbb[0];
var cccc = bbb[1];
// Matches a positive integer with no leading zero (an optional '+' is allowed).
var r = /^\+?[1-9][0-9]*$/;
// target attribute of the anchor; decides below between same-window
// navigation and window.open.
var ee = $(this).attr('target');
// Only encrypt when the id is numeric and the extension contains 'jhtml'.
// (RegExp.prototype.test returns a boolean.)
// NOTE(review): cccc is undefined when the last segment has no '.', in which
// case cccc.indexOf throws for a numeric id.
if (r.test(ccc) && cccc.indexOf('jhtml') != -1) {
// Convert the id string into a CryptoJS WordArray.
var srcs = CryptoJS.enc.Utf8.parse(ccc);
// s is the key; per the article it is defined inline in the HTML page.
var k = CryptoJS.enc.Utf8.parse(s);
var en = CryptoJS.AES.encrypt(srcs, k, {
mode: CryptoJS.mode.ECB,
padding: CryptoJS.pad.Pkcs7
});
// Serialise the cipher result to a string.
var ddd = en.toString();
// Replace every '/' with '^' so the token does not add a path separator.
ddd = ddd.replace(/\//g, "^");
// Drop the last two characters (presumably the Base64 "==" padding of a
// single 16-byte block; TODO confirm).
ddd = ddd.substring(0, ddd.length - 2);
// Re-attach the extension.
var bbbb = ddd + '.' + bbb[1];
// Replace the numeric file name with the encrypted one.
aa[aaa - 1] = bbbb;
// Join the segments back into a link.
// NOTE(review): i is not declared in this snippet, so it is an implicit global.
var uuu = '';
for (i = 0; i < aaa; i++) {
uuu += aa[i] + '/'
}
uuu = uuu.substring(0, uuu.length - 1);
// No target attribute: navigate in place; otherwise open a new window.
if (typeof (ee) == 'undefined') {
window.location = uuu
} else {
window.open(uuu)
}
} else {
// Links that do not match the rule are opened unchanged.
if (typeof (ee) == 'undefined') {
window.location = hh
} else {
window.open(hh)
}
}
return false
});
4.分析后简化代码
// Stand-alone reproduction of the page's encryption step, for checking the
// output against the token the real site produces.

// Load crypto-js, the library the page's click handler calls.
var CryptoJS = require('crypto-js');

// Sample inputs. Every name is declared explicitly so the script also runs in
// strict mode. The id is a string because the page derives it from
// String.prototype.split.
var ccc = '985320';
var s = 'qnbyzzwmdgghmcnm';

var srcs = CryptoJS.enc.Utf8.parse(ccc);
var k = CryptoJS.enc.Utf8.parse(s);
var en = CryptoJS.AES.encrypt(srcs, k, {
    mode: CryptoJS.mode.ECB,
    padding: CryptoJS.pad.Pkcs7
});

// Same post-processing as the page: serialise, replace '/' with '^', then
// drop the last two characters.
var ddd = en.toString();
ddd = ddd.replace(/\//g, "^");
ddd = ddd.substring(0, ddd.length - 2);
console.log(ddd);
5.确定加密结果和原网站加密结果一致,完成破解,写成js文件
var CryptoJS = require("crypto-js");
/**
 * Rewrite a list-page detail link into the encrypted link the site navigates to.
 *
 * The last path segment is expected to look like "<id>.jhtml". When <id> is a
 * positive integer it is AES-encrypted (ECB mode, Pkcs7 padding), every '/' in
 * the serialised result is replaced with '^', the last two characters are
 * dropped, and the result replaces <id> in the link. Any other link is
 * returned unchanged, mirroring the page's click handler, which opens such
 * links as they are.
 *
 * @param {string} hh - Original link taken from the list page.
 * @param {string} [s] - AES key. Defaults to the key this write-up found in
 *     the page, so existing one-argument calls keep working.
 * @returns {string} The rewritten link, or hh itself when no rewrite applies.
 */
function lx(hh, s) {
    var key = (typeof s === 'undefined') ? 'qnbyzzwmdgghmcnm' : s;
    var segments = hh.split('/');
    var lastIndex = segments.length - 1;
    var fileParts = segments[lastIndex].split('.');
    var id = fileParts[0];
    var ext = fileParts[1];
    var numericId = /^\+?[1-9][0-9]*$/;

    // Guard taken from the page's handler: only numeric ids with a 'jhtml'
    // extension are encrypted. ext is undefined when the segment has no '.'.
    if (!numericId.test(id) || typeof ext !== 'string' || ext.indexOf('jhtml') === -1) {
        return hh;
    }

    var encrypted = CryptoJS.AES.encrypt(
        CryptoJS.enc.Utf8.parse(id),
        CryptoJS.enc.Utf8.parse(key),
        {
            mode: CryptoJS.mode.ECB,
            padding: CryptoJS.pad.Pkcs7
        }
    );

    // '/' would add a path separator, so the page swaps it for '^' and then
    // trims the last two characters of the serialised ciphertext.
    var token = encrypted.toString().replace(/\//g, '^');
    token = token.substring(0, token.length - 2);

    segments[lastIndex] = token + '.' + ext;
    return segments.join('/');
}
// Usage example:
// var s = "qnbyzzwmdgghmcnm";
// var hh="http://ggzy.zwfwb.tj.gov.cn:80/jyxxcgjg/970369.jhtml";
// console.log(lx(hh, s));
简易爬取详情页链接代码
# Flow: fetch the list page, extract each detail page's original link, then encrypt its id to obtain the real detail-page link.
import execjs
import requests
from lxml import etree
def execute_js(origin_url, key='qnbyzzwmdgghmcnm'):
    """Run the JS function ``lx`` from deal_aes.js on a link and return its result.

    Args:
        origin_url: Original detail-page link taken from the list page.
        key: AES key forwarded as the second argument of ``lx``. The default is
            the key this write-up found in the page.

    Returns:
        Whatever ``lx`` returns for the given link.
    """
    with open('deal_aes.js', 'r', encoding='utf-8') as f:
        js = f.read()
    ctx = execjs.compile(js)
    # lx takes (link, key); without the key the JS side encrypts with an
    # undefined key and produces a wrong token.
    return ctx.call('lx', origin_url, key)
def get_data(url, timeout=10):
    """Fetch a list page, rewrite every detail link it contains and print them.

    Args:
        url: List-page URL to request.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawler.

    Returns:
        The rewritten detail-page links, in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        url=url,
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            # NOTE(review): hard-coded session cookie; presumably it expires
            # and has to be refreshed - confirm.
            'Cookie': 'clientlanguage=zh_CN; JSESSIONID=344E04DD27EB45F4AC52C49E9936BDB5',
            'Host': 'ggzy.zwfwb.tj.gov.cn',
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
        },
        timeout=timeout,
    )
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()

    html = etree.HTML(response.text.encode('utf-8'))
    # Each list entry keeps the original link in the anchor's "url" attribute.
    origin_urls = html.xpath('//div[2]/div[3]/div/ul/li/div/a/@url')

    detail_urls = []
    for origin_url in origin_urls:
        # Encrypt the id inside the link to get the real detail-page link.
        data_url = execute_js(origin_url)
        print(data_url)
        detail_urls.append(data_url)
    return detail_urls
if __name__ == '__main__':
    # AES key found in the page; kept here for reference, it is not passed to
    # get_data.
    s = "qnbyzzwmdgghmcnm"
    # List page whose detail links are crawled.
    list_page_url = 'http://ggzy.zwfwb.tj.gov.cn/queryContent-jyxx.jspx?title=&inDates=&ext=&ext1=&origin=&channelId=76&beginTime=&endTime='
    get_data(list_page_url)
总结
确定加密入口
1.对于给链接加密的反爬手段,可以在来源页的事件监听器中查看 click 事件,从而找到加密入口
2.对于简单的加密,也可通过全局搜索CryptoJS等加密函数关键字来定位
待解决问题
原打算转成 Python 来跑,但发现 Python 的 AES 加密直接处理数字内容会报错,而先对其补全(padding)再加密,又得不到预期内容。以下是补全处理方法:
import base64

from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad

# AES always works on 16-byte blocks. Padding to 32 bytes adds a different
# padding value and an extra block, so the ciphertext can never match what
# CryptoJS produces with Pkcs7.
BLOCK_SIZE = AES.block_size  # Bytes
key = 'qnbyzzwmdgghmcnm'

cipher = AES.new(key.encode('utf8'), AES.MODE_ECB)
# The id must be bytes; pad() uses PKCS#7 by default.
msg = cipher.encrypt(pad(b'970369', BLOCK_SIZE))

# Mirror the JavaScript post-processing: Base64 text (not hex), '/' replaced
# with '^', last two characters dropped.
# NOTE(review): the article's example maps id 985325 to
# 1jcrAk61aP9HW3VSnwAuKw - use it to validate this output; TODO confirm.
token = base64.b64encode(msg).decode('ascii').replace('/', '^')[:-2]
print(token)

# Round-trip check: decrypting and unpadding must give back the original id.
decipher = AES.new(key.encode('utf8'), AES.MODE_ECB)
msg_dec = decipher.decrypt(msg)
print(unpad(msg_dec, BLOCK_SIZE))