mcdonalds.py
import scrapy

from Mcdonalds.items import McdonaldsItem


class McdonaldsSpider(scrapy.Spider):
    name = 'mcdonalds'
    allowed_domains = ['www.mcdonalds.com.cn']

    def start_requests(self):
        """Same role as start_urls: yield one request per listing page."""
        for page_num in range(1, 40):
            url = 'https://www.mcdonalds.com.cn/news/corporate?page=' + str(page_num)
            yield scrapy.Request(url, callback=self.parse)
    def parse(self, response):
        """Parse a listing page: pull the title, date, and detail URL of each entry."""
        news_list = response.xpath('//div[@class="news-center-list"]/ul/li')
        for li in news_list:
            title = li.xpath('./h4/a/text()').extract_first()
            time = li.xpath('./time/text()').extract_first()
            detail_url = 'https://www.mcdonalds.com.cn' + li.xpath('./h4/a/@href').extract_first()
            item = McdonaldsItem()
            item['title'] = title
            item['time'] = time
            item['detail_url'] = detail_url
            # Pass the partially filled item along to the detail-page callback.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
    def parse_detail(self, response):
        """Parse a detail page and yield the completed item."""
        item = response.meta['item']
        # extract_first() returns None instead of raising IndexError when the div is missing,
        # so the guard below actually does something.
        detail_content = response.xpath('//div[@class="cmsPage"]').extract_first()
        if detail_content:
            item['detail_content'] = detail_content
            yield item
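
Once items.py and pipelines.py (both below) are in place, the spider is run from the project root by its name attribute, using the standard Scrapy command:

scrapy crawl mcdonalds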
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class McdonaldsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    detail_content = scrapy.Field()
    detail_url = scrapy.Field()
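
A scrapy.Item behaves like a dict restricted to the declared fields, which is why the spider can fill it with item['title'] = ... and the pipelines can read it back the same way. A quick illustrative sketch (the sample values are made up):

item = McdonaldsItem()
item['title'] = 'Example headline'
item['time'] = '2020-01-01'
print(dict(item))      # {'title': 'Example headline', 'time': '2020-01-01'}
# item['author'] = 'x' # would raise KeyError: field was not declared above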
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class McdonaldsPipeline:
    fp = None

    # Overrides the base-class hook; called only once, when the spider starts.
    def open_spider(self, spider):
        print('Spider started............')
        self.fp = open('./mcdonalds.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Receive the item object submitted by the spider.
        title = item['title']
        time = item['time']
        detail_content = item['detail_content']
        detail_url = item['detail_url']
        self.fp.write(f'{title}:{time}\n{detail_content}\n{detail_url}\n')
        return item

    def close_spider(self, spider):
        print('Spider finished............')
        self.fp.close()
class MysqlPipeline:
    conn = None
    cursor = None

    # Overrides the base-class hook; called only once, when the spider starts.
    def open_spider(self, spider):
        print('Spider started............')
        # Replace the placeholders with your own settings; 3306 is the MySQL default port.
        self.conn = pymysql.connect(host='xxx', port=3306, user='xxx', password='xxx', db='xxx')

    def process_item(self, item, spider):
        # Receive the item object submitted by the spider.
        title = item['title']
        time = item['time']
        detail_content = item['detail_content']
        detail_url = item['detail_url']
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(
                'insert into mcdonalds(`title`,`issueTime`,`content`,`url`) values (%s,%s,%s,%s);',
                [title, time, detail_content, detail_url])
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('Spider finished............')
        if self.cursor:
            self.cursor.close()
        self.conn.close()
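
As the template comment at the top of pipelines.py notes, neither pipeline runs until it is registered in ITEM_PIPELINES. A minimal settings.py sketch, assuming the project package is named Mcdonalds as in the spider's import (the priority numbers are arbitrary; lower runs first):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'Mcdonalds.pipelines.McdonaldsPipeline': 300,
    'Mcdonalds.pipelines.MysqlPipeline': 301,
}

The INSERT statement above expects a table named mcdonalds with columns title, issueTime, content, and url. A one-off setup sketch for creating it; the column types and sizes here are assumptions, not taken from the original:

# create_table.py -- run once before crawling; connection settings are placeholders
import pymysql

conn = pymysql.connect(host='xxx', port=3306, user='xxx', password='xxx', db='xxx')
with conn.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS mcdonalds (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            issueTime VARCHAR(64),
            content LONGTEXT,
            url VARCHAR(512)
        ) DEFAULT CHARSET = utf8mb4
    ''')
conn.commit()
conn.close()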