1. 构建item:
(1). spidertest\spidertest\items.py:
class JobBoleArticleItem(scrapy.Item):
title = scrapy.Field()
url = scrapy.Field()
...
(2). spidertest\spidertest\spiders\bt.py:
import datetime
from ArticleSpider.items import JobBoleArticleItem
from ArticleSpider.utils.common import get_md5
def parse_detail(self, response):
article_item = JobBoleArticleItem()
# 有些值可以通过Request的meta来传递
front_image_url = response.meta.get("front_image_url", "")
match_re = re.match(".*?(\d+).*", fav_nums)
if match_re:
fav_nums = int(match_re.group(1))
else:
fav_nums = 0
tag_list = response.css("p.entry-mobile a::text").extract()
tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
tags = ",".join(tag_list)
try:
create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
except Exception as e:
create_date = datetime.datetime.now().date()
article_item["url_object_id"] = get_md5(response.url)
article_item["front_image_url"] = front_image_url
......
yield article_item
2. 问题点:
①. fav_nums的正则提取、url的md5计算、create_date的时间解析等处理逻辑都只能写在这个spider的parse_detail中; 如果另一个spider也要用到同样的处理, 只能再重复写一遍, 无法复用.
3. ItemLoader:
from scrapy.loader import ItemLoader
from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
def parse_detail(self, response):
front_image_url = response.meta.get("front_image_url", "")
# 通过item loader加载item
item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
# 添加规则
# 通过css选择器添加
item_loader.add_css("title", ".entry-header h1::text")
# 直接添加值(不通过css、xpath提取)
item_loader.add_value("url", response.url)
item_loader.add_value("front_image_url", [front_image_url])
# 解析上面添加的规则, 生成item
article_item = item_loader.load_item()
yield article_item