一、页面结构分析
二、编写程序代码
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author: Roc-xb
"""
import requests
from lxml import etree
def run(page=1):
    """Scrape closed-issue titles of the Gitee RuoYi project into ``issus.txt``.

    Starting at ``page``, each issue-list page is fetched, the titles found
    on it are printed and appended to ``issus.txt`` (one title per line),
    and the crawl continues with the following page for as long as the
    pager contains a ``rel="next"`` link with text.

    Args:
        page: Value sent as the ``page`` query parameter for the first
            request; it is incremented by one for every further page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    # Query filters that stay the same for every page; only ``page`` varies.
    filters = (
        ('assignee_id', ''),
        ('author_id', ''),
        ('branch', ''),
        ('collaborator_ids', ''),
        ('issue_search', ''),
        ('label_ids', ''),
        ('label_text', ''),
        ('milestone_id', ''),
        ('priority', ''),
        ('private_issue', ''),
        ('program_id', ''),
        ('project_type', ''),
        ('scope', ''),
        ('sort', ''),
        ('state', 'closed'),
        ('target_project', ''),
    )
    # Iterate instead of recursing so the number of pages is not bounded by
    # the interpreter's recursion limit.
    while True:
        params = filters + (('page', page),)
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(
            'https://gitee.com/y_project/RuoYi/issues',
            headers=headers,
            params=params,
            timeout=10,
        ).text
        dom = etree.HTML(response)
        res = dom.xpath('//*[@id="git-issues"]/div/div/div[1]/h3/a/text()')
        print("\n".join(res))
        with open("issus.txt", 'a', encoding="utf-8") as f:
            # writelines() adds no separators itself, so terminate each
            # title explicitly to keep adjacent titles from running together.
            f.writelines(title + "\n" for title in res)
        # xpath() returns a list; test the list itself. Converting it to a
        # string first would turn an empty result into "[]" (length 2) and
        # the crawl would never stop.
        next_page = dom.xpath('//*[@id="git-discover-page"]/a[@rel="next"]//text()')
        if not next_page:
            break
        page += 1
# Run the scraper with its default arguments only when this file is
# executed as a script, not when it is imported.
if __name__ == '__main__':
    run()
三、运行程序结果
四、词云图生成
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author: Roc-xb
"""
from wordcloud import WordCloud
import matplotlib.pyplot as plt # 绘制图像的模块
import jieba # jieba分词
# Read the whole text file; the context manager guarantees the file handle
# is closed again. The text is kept in the module-level name ``f``.
with open('issus.txt', 'r', encoding='UTF-8') as issues_file:
    f = issues_file.read()
# Segment the text with jieba and join the tokens with spaces into a single
# string; wordcloud cannot build a correct Chinese word cloud directly.
cut_text = " ".join(jieba.cut(f))
wordcloud = WordCloud(
    # Set a font explicitly, otherwise characters are rendered as empty
    # boxes. This is the usual font location on a Windows machine and can be
    # replaced with another font file.
    font_path="C:/Windows/Fonts/simfang.ttf",
    # Background color, width and height of the generated image.
    background_color="white",
    width=1500,
    height=880,
).generate(cut_text)
# Display the generated image without axes.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()