1. 抓取网站：http://www.365kk.cc/255/255036/
2. 网站分析
3. 代码实现
import requests
from lxml import etree
import time
import random
# 获取下一页链接的函数
def next_url(next_url_element, base='http://www.365kk.cc/255/255036/'):
    """Build the absolute URL of the next page from a "next" anchor's href.

    The site emits hrefs such as '/255/255036/4147600.html'; only the final
    path segment (everything after the last '/') is appended to *base*.

    next_url_element -- href string taken from the "next page" anchor
    base -- book index URL the page name is appended to (defaulted so
            existing single-argument callers keep working)
    """
    # rfind('/') returns the index of the last '/' (or -1 if none, in which
    # case the whole string is treated as the page name).
    index = next_url_element.rfind('/') + 1
    return base + next_url_element[index:]
# Request headers -- substitute your own browser's values before running.
# The Cookie/User-Agent below make the request look like a normal browser
# session; Host and keep-alive match what the site expects.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.1.3162 SLBChan/8',
    'Cookie': 'ASP.NET_SessionId=3sz0dliidjary1w5hxjnuutl; fontFamily=null; fontColor=null; fontSize=null; bg=null',
    'Host': 'www.365kk.cc',
    'Connection': 'keep-alive',
}
# --- Book index page -------------------------------------------------------
main_url = "http://www.365kk.cc/255/255036/"

# Fetch the index page. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() fails fast on HTTP errors
# instead of silently parsing an error page.
main_resp = requests.get(main_url, headers=headers, timeout=10)
main_resp.raise_for_status()

# The site serves UTF-8; decode the raw bytes explicitly.
main_text = main_resp.content.decode('utf-8')

# Parse the decoded document into an lxml element tree.
main_html = etree.HTML(main_text)

# Absolute XPaths copied from the browser's "Copy XPath" -- brittle, but
# sufficient for this one site. [0] raises IndexError if the layout changes.
bookTitle = main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/h1/text()')[0]
author = main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[1]/text()')[0]
update = main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[5]/text()')[0]
introduction = main_html.xpath('/html/body/div[3]/div[1]/div/div/div[2]/div[2]/text()')[0]

print(bookTitle)
print(author)
print(update)
print(introduction)
# Crawl limits: only six pages while debugging.
maxPages = 6
cnt = 0

# Title of the most recently written chapter (a chapter spans several pages,
# so the title is only written when it changes).
lastTitle = ''

# First and last page of the crawl range.
url = 'http://www.365kk.cc/255/255036/4147599.html'
endurl = 'http://www.365kk.cc/255/255036/4148385.html'
# Walk the chapter pages from `url` to `endurl`, appending each page's text
# to '<book title>.txt'.
while url != endurl:
    cnt += 1  # number of pages fetched so far
    if cnt > maxPages:
        break  # stop once the debug page budget is exhausted

    # BUG FIX: the original called requests.get(url, headers) -- the second
    # positional argument is `params`, so the custom headers (Cookie,
    # User-Agent, Host) were never actually sent. Pass them by keyword,
    # and add a timeout so a stalled connection cannot hang the crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)

    # Chapter title and the page's text fragments.
    title = html.xpath('//*[@class="title"]/text()')[0]
    contents = html.xpath('//*[@id="content"]/text()')

    # Progress report.
    print("cnt: {}, title = {}, url = {}".format(cnt, title, url))

    # Append this page to the output file. The `with` block closes the file
    # automatically (the original's explicit f.close() was redundant).
    with open(bookTitle + '.txt', 'a', encoding='utf-8') as f:
        if title != lastTitle:  # a new chapter starts on this page
            f.write(title)      # write the new chapter title once
            lastTitle = title
        for content in contents:
            f.write(content)
        f.write('\n\n')

    # Follow the "next page" link (third anchor in the pager bar).
    next_url_element = html.xpath('//*[@class="section-opt m-bottom-opt"]/a[3]/@href')[0]
    url = next_url(next_url_element)

    # Sleep a random 2-5 seconds between requests to be polite to the server.
    sleepTime = random.randint(2, 5)
    time.sleep(sleepTime)

print("complete!")
4. 本地运行