import requests
import os
from lxml import etree
# Directory that downloaded archives are written to.
OUTPUT_DIR = './sucai'

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
}

# The first listing page has no numeric suffix; every later page does.
FIRST_PAGE_URL = 'https://sc.chinaz.com/jianli/free.html'
PAGE_URL_TEMPLATE = 'https://sc.chinaz.com/jianli/free_%d.html'

# Seconds to wait for a TCP connection before giving up.
CONNECT_TIMEOUT = 3

# Path separators and other characters that commonly cannot appear in a
# file name; each one is replaced by an underscore.
INVALID_FILENAME_TABLE = str.maketrans({char: '_' for char in '\\/:*?"<>|'})


def build_page_url(page_num):
    """Return the listing-page URL for the 1-based ``page_num``."""
    if page_num == 1:
        return FIRST_PAGE_URL
    return PAGE_URL_TEMPLATE % page_num


def absolute_url(href):
    """Return ``href`` as an absolute https URL.

    Links that already carry an http(s) scheme are returned unchanged;
    anything else gets the ``https:`` prefix the script has always applied
    (which is what a protocol-relative ``//host/path`` link needs).
    """
    if href.startswith(('http://', 'https://')):
        return href
    return 'https:' + href


def safe_filename(name, default='unnamed'):
    """Return ``name`` with characters unsuitable for a file name replaced.

    Surrounding whitespace is stripped; ``default`` is returned when nothing
    is left afterwards.
    """
    cleaned = name.translate(INVALID_FILENAME_TABLE).strip()
    return cleaned or default


def first_match(node, expression):
    """Return the first result of ``expression`` evaluated on ``node``.

    Raises:
        LookupError: if ``node`` is None (lxml could not parse the page) or
            the XPath expression matched nothing.
    """
    if node is None:
        raise LookupError('document could not be parsed')
    matches = node.xpath(expression)
    if not matches:
        raise LookupError('no match for XPath %s' % expression)
    return matches[0]


def fetch(url, read_timeout):
    """GET ``url`` and return the response, raising on any HTTP error status.

    Raises:
        requests.RequestException: on connection failure, timeout or a
            4xx/5xx status code.
    """
    response = requests.get(
        url=url, headers=HEADERS, timeout=(CONNECT_TIMEOUT, read_timeout)
    )
    # Without this an error page would be parsed (or saved to disk) as if it
    # were the real content.
    response.raise_for_status()
    return response


def download_template(div):
    """Download the archive linked from one listing entry.

    Follows the entry's detail-page link, takes the first download link on
    that page and writes the response body to ``OUTPUT_DIR``.

    Returns:
        The file name (without directory) the archive was saved under.

    Raises:
        requests.RequestException: if either HTTP request fails.
        LookupError: if an expected element is missing from a page.
    """
    detail_url = absolute_url(first_match(div, './a/@href'))
    file_name = safe_filename(first_match(div, './a/img/@alt')) + '.rar'

    detail_response = fetch(detail_url, read_timeout=5)
    detail_response.encoding = detail_response.apparent_encoding
    detail_tree = etree.HTML(detail_response.text)
    download_url = first_match(
        detail_tree, '//div[@id="down"]/div[2]/ul/li[1]/a/@href'
    )

    # The archive is binary, so the raw bytes are written as-is.
    archive = fetch(download_url, read_timeout=10)
    with open(os.path.join(OUTPUT_DIR, file_name), 'wb') as fp:
        fp.write(archive.content)
    return file_name


def crawl(page_numbers):
    """Download every template listed on the given listing pages.

    A page or entry that fails (network error, HTTP error status, missing
    element) is reported and skipped so the rest of the crawl continues.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for page_num in page_numbers:
        page_url = build_page_url(page_num)
        try:
            response = fetch(page_url, read_timeout=10)
        except requests.RequestException as exc:
            print('skipping page', page_url, '-', exc)
            continue
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)
        if tree is None:
            print('skipping page', page_url, '- empty or unparsable document')
            continue

        print('爬取数据开始....')
        for div in tree.xpath('//*[@id="container"]/div'):
            try:
                saved_name = download_template(div)
            except (requests.RequestException, LookupError) as exc:
                print('skipping entry on', page_url, '-', exc)
                continue
            print(saved_name, '爬取成功!!!')


if __name__ == '__main__':
    crawl(range(5, 50))