I recently started working with multithreaded crawlers. With multiple threads, the waiting time of different requests overlaps instead of adding up, which greatly improves crawling efficiency: the crawler runs noticeably faster.
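To see why the threads help, here is a tiny standalone sketch (my own illustration, not part of the crawler below) that simulates ten slow requests with time.sleep: run in threads, the ten one-second waits overlap and finish in roughly one second instead of ten.

import threading
import time

def fake_request(i):
    # stand-in for a network request: the sleep simulates waiting on I/O
    time.sleep(1)
    print('request %d done' % i)

start = time.time()
threads = [threading.Thread(target=fake_request, args=(i,)) for i in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print('elapsed: %.1fs' % (time.time() - start))   # roughly 1s instead of ~10s sequentially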
import requests
from lxml import etree
import csv
import threading
from queue import Queue
import time
parse_count = 1        # running page counter shared by the parse threads
crawl_fail_list = []   # page numbers whose request failed
parse_fail_list = []   # counter values whose parsing failed
# example list-page url: http://www.yanglao.com.cn/resthome_2
# a Thread subclass that fetches pages
class crawl_thread(threading.Thread):
    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue

    def run(self):
        # requests are issued in run():
        # 1. take a page number from the page queue  2. build the url  3. send the request  4. put the response text into the data queue
        global crawl_fail_list
        print("*********%s开始************" % self.name)
        while 1:
            # stop the thread once the page queue is empty
            if self.page_queue.empty():
                break
            # get a page number from the queue and build the url
            try:
                page = self.page_queue.get()
                url = 'http://www.yanglao.com.cn/resthome_' + str(page)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
                }
                # send the request and get the response (verify=False skips certificate checks)
                r = requests.get(url, headers=headers, verify=False)
                # put the response text into the data queue
                self.data_queue.put(r.text)
                print('%s:第%s页爬取完成' % (self.name, page))
                time.sleep(0.3)
            except Exception as e:
                print(e)
                crawl_fail_list.append(page)
        print("*********%s结束************" % self.name)
# a Thread subclass that parses responses and writes them to the csv
class parse_thread(threading.Thread):
    def __init__(self, name, data_queue, suo, writer):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.suo = suo        # lock shared by the parse threads, protects csv writes
        self.writer = writer

    def run(self):
        # 1. take a response out of the data queue  2. parse it and write the rows to the csv
        global parse_count
        global parse_fail_list
        print("*********%s开始************" % self.name)
        while 1:
            # get a response from the data queue; stop if nothing new arrives within 15 s
            try:
                content = self.data_queue.get(True, 15)
            except:
                break
            # parse the response
            try:
                tree = etree.HTML(content)
                li_list = tree.xpath('//li[@class="rest-item"]')
                for li in li_list:
                    name = li.xpath('.//h4/a/text()')[0]
                    location = li.xpath('.//ul/li[1]/text()')[0].replace('地址:', '')
                    beds = li.xpath('.//ul/li[2]/text()')[0].replace('床位数:', '').replace('张', '')
                    money = li.xpath('.//ul/li[3]/text()')[0].replace('收费区间:', '')
                    lt = [name, location, beds, money]
                    # write one csv row under the lock
                    self.suo.acquire()
                    self.writer.writerow(lt)
                    self.suo.release()
                print("%s:第%s页写入完成" % (self.name, parse_count))
            # if parsing fails, print the error and keep looping
            except Exception as e:
                print(e)
                parse_fail_list.append(parse_count)
            # note: parse_count is shared by both parse threads without a lock, so the count is only approximate
            parse_count += 1
        print("*********%s结束************" % self.name)
##################################################################
def create_queue():
    # create the page-number queue
    page_queue = Queue()
    # total number of list pages
    for page in range(1, 1676):
        page_queue.put(page)
    # create the response-data queue
    data_queue = Queue()
    return page_queue, data_queue

def create_crawl_list(page_queue, data_queue):
    crawl_list = []
    name_list = ['爬虫1号', '爬虫2号']
    for name in name_list:
        crawl = crawl_thread(name, page_queue, data_queue)
        crawl_list.append(crawl)
    return crawl_list

def create_parse_list(data_queue, suo, writer):
    parse_list = []
    name_list = ['解析1号', '解析2号']
    for name in name_list:
        parse = parse_thread(name, data_queue, suo, writer)
        parse_list.append(parse)
    return parse_list
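A design note before main(): the parse threads above stop only because data_queue.get gives up after 15 seconds of silence, which is a bit of a magic number. An alternative sketch (not what this script does) is the sentinel, or poison pill, pattern: each crawl worker pushes one None when it finishes, and a parse worker exits when it pops a None. With two crawl threads and two parse threads the counts happen to match; in general you need one sentinel per consumer.

SENTINEL = None

def crawl_worker(page_queue, data_queue):
    while not page_queue.empty():
        page = page_queue.get()
        data_queue.put('<response for page %s>' % page)   # placeholder for requests.get(...).text
    data_queue.put(SENTINEL)   # tell one parse worker there is nothing more to come

def parse_worker(data_queue):
    while True:
        content = data_queue.get()   # blocks without a timeout
        if content is SENTINEL:
            break
        print('parsing', content[:30])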
###################################################
def main():
    # create the queues
    page_queue, data_queue = create_queue()
    # create the lock
    suo = threading.Lock()
    # open the output file and create the csv writer
    f = open('养老院数据_全.csv', 'a', encoding='utf8', newline='')
    writer = csv.writer(f)
    # create the crawl threads and the parse threads
    crawl_list = create_crawl_list(page_queue, data_queue)
    parse_list = create_parse_list(data_queue, suo, writer)
    print(crawl_list, parse_list)
    # start the threads
    for crawl in crawl_list:
        crawl.start()
    for parse in parse_list:
        parse.start()
    # make sure the main thread finishes last
    for crawl in crawl_list:
        crawl.join()
    for parse in parse_list:
        parse.join()
    # clean up
    f.close()
    print('所有线程关闭,程序结束!!!')
    print(crawl_fail_list)
    print(parse_fail_list)

if __name__ == '__main__':
    main()
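One gap in the script: crawl_fail_list only records which pages failed, nothing retries them. A possible follow-up (my own addition, reusing the URL pattern and User-Agent from above) is to re-request just those pages sequentially once all threads have joined:

def retry_failed(pages):
    # sequentially re-fetch the pages the crawl threads gave up on
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    recovered = {}
    for page in pages:
        url = 'http://www.yanglao.com.cn/resthome_' + str(page)
        try:
            r = requests.get(url, headers=headers, verify=False, timeout=10)
            recovered[page] = r.text
        except Exception as e:
            print('page %s still failing: %s' % (page, e))
    return recovered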
The result: