0
点赞
收藏
分享

微信扫一扫

Python爬取意林杂志所有期刊文章


可能很多人对这个意林杂志比较陌生,但是对于小编来说,那可是满满的回忆。记得我们上中学那时候读过的意林,那可是一本接着一本,其中有很多令人感动的故事,一直被温暖,被治愈。


接下来让我们来看看如何使用爬虫,爬取所有的杂志内容,目标网站:

https://www.yilinzazhi.com/


爬虫代码:

# coding:utf-8
# __author__ = "maiz"
import requests
from lxml import etree


class Zazhi:
    """Scrape every article of Yilin magazine (yilinzazhi.com) into '意林合集.txt'.

    Flow: run() fetches the landing page; get_list_url() collects one URL per
    issue; get_title_list_url() collects the article URLs inside each issue;
    end() downloads each article and appends "title\\ncontent" to the output
    file.
    """

    def __init__(self):
        # Landing page that links to every magazine issue.
        self.url = 'https://www.yilinzazhi.com/'
        # Scratch dict reused for the article currently being processed.
        self.item = {}
        # Desktop User-Agent so the site serves the regular HTML pages.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

    def parse_url(self, url):
        """GET *url* and return the body decoded as UTF-8.

        Undecodable byte sequences are dropped (errors='ignore') because some
        pages contain stray non-UTF-8 bytes.

        :param url: absolute URL to fetch
        :return: page body as str
        """
        response = requests.get(url=url, headers=self.headers)
        return response.content.decode('utf-8', errors='ignore')

    def handle_html_str(self, html_str):
        """Parse an HTML string and return the lxml root element."""
        return etree.HTML(html_str)

    def get_list_url(self, html_str):
        """
        :param html_str: HTML of the landing page
        :return: list of absolute issue-index URLs ("20xx year, issue x")
        """
        html = self.handle_html_str(html_str)
        list_url = []
        for cell in html.xpath("//td[@class='time']"):
            hrefs = cell.xpath("./a/@href")
            # Guard against cells without a link instead of raising IndexError.
            if hrefs:
                list_url.append(r"https://www.yilinzazhi.com/" + hrefs[0])
        return list_url

    def get_title_list_url(self, list_url, limit=60):
        """
        :param list_url: issue-index URLs (from get_list_url)
        :param limit: maximum number of issues to crawl; default 60 preserves
            the original hard-coded slice
        :return: list of absolute article-page URLs
        """
        content_url_list = []
        for url in list_url[:limit]:
            print(url)
            # Article links on an issue page are relative to the issue directory.
            base_url = url.replace("index.html", "")
            html = self.handle_html_str(self.parse_url(url))
            for span in html.xpath("//span[@class='maglisttitle']"):
                hrefs = span.xpath("./a/@href")
                if hrefs:
                    content_url_list.append(base_url + hrefs[0])
        return content_url_list

    def end(self, content_url_list):
        """Download every article and append title + content to the output file.

        :param content_url_list: absolute article-page URLs
        :return: None
        """
        # Open the output file once, instead of re-opening it per article.
        with open('意林合集.txt', 'a', encoding='utf-8') as fw:
            for content_url in content_url_list:
                print(content_url)
                self.item['content_url'] = content_url
                html = self.handle_html_str(self.parse_url(content_url))
                titles = html.xpath(
                    "//div[@class='blkContainerSblk collectionContainer']/h1/text()")
                if not titles:
                    # Unexpected page layout (or removed article) — skip it
                    # rather than crash the whole crawl with IndexError.
                    continue
                self.item['title'] = titles[0]
                content_list = html.xpath("//div[@class='blkContainerSblkCon']//p//text()")
                # Strip the full-width indentation spaces the site uses.
                self.item['content'] = "".join(content_list).replace("\u3000\u3000", "")
                fw.writelines("{}\n{}\n\n\n".format(self.item['title'], self.item['content']))
                print("{}保存成功".format(self.item['title']))

    def run(self):
        """Entry point: landing page -> issue list -> article list -> file."""
        # Renamed from `str` to avoid shadowing the builtin.
        html_str = self.parse_url(self.url)
        list_url = self.get_list_url(html_str)
        content_url_list = self.get_title_list_url(list_url)
        self.end(content_url_list)


if __name__ == '__main__':
    # Restored the indentation lost in the scraped copy; without it this
    # block is a SyntaxError. Kick off the full crawl.
    zazhi = Zazhi()
    zazhi.run()



举报

相关推荐

0 条评论