爬取网站:Lply分类目录网
代码:
##爬取网站:Lply分类目录网(http://www.lply.net/category/)
import requests
import re
import xlwt
# Browser-like request headers. The site answers without them too (the
# original author noted heads are optional), but a real User-Agent makes
# the request less likely to be rejected.
HEADS = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

CATEGORY_URL = 'http://www.lply.net/category/'

# One directory entry per <li>: captures (href, site name, count inside <em>).
# Raw string so \d is not a deprecated escape in a normal string literal.
SITE_PATTERN = re.compile(
    r'<li>.*?<a.*?href="(.*?)".*?>.*?(.*?)</a>.*?<em>.*?(\d+).*?</em>.*?</li>',
    re.S)


def parse_category_page(html):
    """Return a list of (href, site_name, count) tuples scraped from *html*."""
    return SITE_PATTERN.findall(html)


def build_site_url(href):
    """Build the absolute listing URL from a scraped href fragment."""
    return "http://www.lply.net/category" + str(href)


def fetch_category_page():
    """Download the category index page and return its HTML text."""
    response = requests.get(CATEGORY_URL, headers=HEADS)
    return response.text


def save_to_excel(results, path):
    """Write *results* to an Excel workbook at *path*.

    NOTE: xlwt produces the legacy BIFF format, so *path* must end in
    .xls — the original script saved under an .xlsx name, which Excel
    rejects as corrupt.  The original also wrapped this in a pointless
    ``with open(path, "w")`` that truncated a stray text file; xlwt's
    Workbook.save() handles the file itself.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet_1')
    header = ["序号", "网站名称", "包含数量", "网站地址"]
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    # Row index doubles as the 1-based sequence number written to column 0.
    for row, (href, name, count) in enumerate(results, start=1):
        sheet.write(row, 0, row)
        sheet.write(row, 1, name)
        sheet.write(row, 2, count)
        sheet.write(row, 3, build_site_url(href))
    workbook.save(path)
    print("excel数据保存成功")


def print_results(results):
    """Alternative output #2: print each entry instead of saving a file."""
    for href, name, count in results:
        print(build_site_url(href), name, count)


def save_to_txt(results, path):
    """Alternative output #3: save entries as tab-separated plain text."""
    with open(path, 'w') as f:
        for href, name, count in results:
            f.writelines([name, '\t', count, '\t', build_site_url(href), '\n'])
    print("TXT文本数据保存成功")


def main():
    """Scrape the directory index and save it as an Excel sheet.

    Three output options exist (Excel, direct print, txt) — pick one;
    Excel is the default, matching the original script.
    """
    results = parse_category_page(fetch_category_page())
    save_to_excel(results,
                  r"C:\Users\AdamCY\Desktop\wenjian\python爬虫\py_lply.xls")


if __name__ == "__main__":
    main()













