Scrapy: using CrawlSpider to extract links, but the crawl doesn't loop


Straight to the code:

This version has a problem: the URL crawl does not loop:

# -*- coding: utf-8 -*-
import scrapy
from BaikeSpider.items import BaikespiderItem

from scrapy.spiders import CrawlSpider, Rule  # rules for following extracted links
from scrapy.linkextractors import LinkExtractor  # the link extractor

class ManypagesbkspiderSpider(CrawlSpider):
    name = 'ManyPagesBKSpider'
    allowed_domains = ['baike.baidu.com']
    start_urls = ['https://baike.baidu.com/item/Python']

    # Extract links by regex; this only filters on the URL itself
    page_links = LinkExtractor(allow=r'/item/.*')
    # The callback processes each extracted page; follow=True keeps following links from every response
    rules = [Rule(page_links, callback="parse", follow=True)]
    # NOTE: for the crawl to loop, the original parse method must be renamed, otherwise the loop fails.

    def gettitle(self, response):
        h1 = response.xpath('//dd[@class]/h1/text()').extract()
        h2 = response.xpath('//dd[@class]/h2/text()').extract()
        h1.extend(h2)
        return "".join(h1)

    def getcontent(self, response):
        # Join all text nodes in the summary block, then split on whitespace (returns a token list)
        text_list = response.xpath('//div[@class="lemma-summary"]//text()').extract()
        return "".join(text_list).split()

    def geturl(self, response):
        return response.url  # response.body holds the page data; response.url is the current link

    def parse(self, response):  # <-- this name collides with CrawlSpider's built-in parse
        item = BaikespiderItem()
        item['kword'] = self.gettitle(response)
        item['url'] = self.geturl(response)
        item['content'] = self.getcontent(response)
        # for i in item:
        #     print(i, item[i])
        yield item

This version loops over the crawled URLs without any problem:

# -*- coding: utf-8 -*-
import scrapy
from BaikeSpider.items import BaikespiderItem

from scrapy.spiders import CrawlSpider, Rule  # rules for following extracted links
from scrapy.linkextractors import LinkExtractor  # the link extractor

class ManypagesbkspiderSpider(CrawlSpider):
    name = 'ManyPagesBKSpider'
    allowed_domains = ['baike.baidu.com']
    start_urls = ['https://baike.baidu.com/item/Python']

    # Extract links by regex; this only filters on the URL itself
    page_links = LinkExtractor(allow=r'/item/.*')
    # The callback processes each extracted page; follow=True keeps following links from every response
    rules = [Rule(page_links, callback="parse_item", follow=True)]
    # NOTE: for the crawl to loop, the original parse method must be renamed, otherwise the loop fails.

    def gettitle(self, response):
        h1 = response.xpath('//dd[@class]/h1/text()').extract()
        h2 = response.xpath('//dd[@class]/h2/text()').extract()
        h1.extend(h2)
        return "".join(h1)

    def getcontent(self, response):
        # Join all text nodes in the summary block, then split on whitespace (returns a token list)
        text_list = response.xpath('//div[@class="lemma-summary"]//text()').extract()
        return "".join(text_list).split()

    def geturl(self, response):
        return response.url  # response.body holds the page data; response.url is the current link

    def parse_item(self, response):  # renamed callback; CrawlSpider's own parse stays intact
        item = BaikespiderItem()
        item['kword'] = self.gettitle(response)
        item['url'] = self.geturl(response)
        item['content'] = self.getcontent(response)
        # for i in item:
        #     print(i, item[i])
        yield item
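
Both spiders also assume that BaikeSpider/items.py declares the three fields used above. A minimal sketch, with the field names taken from the spider code and everything else my assumption:

import scrapy

class BaikespiderItem(scrapy.Item):
    kword = scrapy.Field()    # entry title built from h1/h2
    url = scrapy.Field()      # URL of the crawled page
    content = scrapy.Field()  # tokenized lemma-summary text

With that in place, the working spider runs with, e.g., scrapy crawl ManyPagesBKSpider -o items.json.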

Note the key difference:

rules = [Rule(page_links, callback="parse_item", follow=True)]
# NOTE: for the crawl to loop, the original parse method must be renamed, otherwise the loop fails.
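
Why the rename matters: CrawlSpider defines parse itself and uses it as the entry point that applies the rules. A rough sketch of the internals, simplified from older Scrapy releases (newer versions route through an internal _parse wrapper, but the name clash plays out the same way):

class CrawlSpider(Spider):
    def parse(self, response):
        # Runs the configured callback AND extracts/follows links per self.rules
        return self._parse_response(response, self.parse_start_url,
                                    cb_kwargs={}, follow=True)

If a subclass defines its own parse, it shadows this method: the custom callback still yields items, but the rule-driven link extraction in _parse_response never runs, so the crawl stops after start_urls. Naming the callback parse_item leaves CrawlSpider's parse intact and the crawl keeps looping.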

 
