1. Create a project
Press Win+R, type cmd, then `cd Desktop` and run `scrapy startproject tutorial` — this step creates a project folder named `tutorial` on the desktop.
2. Define an item
Open the items.py file and add the following code:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DmozItem(scrapy.Item):
    """Container for one scraped DMOZ directory entry."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # site title text
    link = scrapy.Field()   # href of the listed site
    desc = scrapy.Field()   # short description text
3. Write a spider
Create a new file named dmoz_spider.py and add the following code:
import scrapy


class DmozSpider(scrapy.Spider):
    """First version: save each start page's raw HTML to a local file."""

    name = "dmoz"
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/'
    ]

    def parse(self, response):
        """Write the raw page body to a file named after the URL's last path segment."""
        filename = response.url.split("/")[-2]
        # 'wb' because response.body is bytes; the with-block closes the file.
        with open(filename, 'wb') as f:
            f.write(response.body)
import scrapy


class DmozSpider(scrapy.Spider):
    """Second version: extract title/link/desc from each listing and print them."""

    name = "dmoz"
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/'
    ]

    def parse(self, response):
        """Print (title, link, desc) for every directory entry on the page."""
        sel = scrapy.selector.Selector(response)
        # Each DMOZ listing lives in a div with this class.
        sites = sel.xpath('//div[@class="title-and-desc"]')
        for site in sites:
            title = site.xpath('a/div[@class="site-title"]/text()').extract()
            link = site.xpath('a/@href').extract()
            # NOTE: the trailing space in "site-descr " matches the page's HTML.
            desc = site.xpath('div[@class="site-descr "]/text()').extract()
            print(title, link, desc)
import scrapy


class DmozSpider(scrapy.Spider):
    """Second version: extract title/link/desc from each listing and print them."""

    name = "dmoz"
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/'
    ]

    def parse(self, response):
        """Print (title, link, desc) for every directory entry on the page."""
        sel = scrapy.selector.Selector(response)
        # Each DMOZ listing lives in a div with this class.
        sites = sel.xpath('//div[@class="title-and-desc"]')
        for site in sites:
            title = site.xpath('a/div[@class="site-title"]/text()').extract()
            link = site.xpath('a/@href').extract()
            # NOTE: the trailing space in "site-descr " matches the page's HTML.
            desc = site.xpath('div[@class="site-descr "]/text()').extract()
            print(title, link, desc)
4. Run the spider from cmd (inside the project directory): `scrapy crawl dmoz`
import scrapy

from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    """Final version: return populated DmozItem objects instead of printing."""

    name = "dmoz"
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/'
    ]

    def parse(self, response):
        """Collect one DmozItem per directory listing and return them all."""
        sel = scrapy.selector.Selector(response)
        sites = sel.xpath('//div[@class="title-and-desc"]')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('a/div[@class="site-title"]/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            # NOTE: the trailing space in "site-descr " matches the page's HTML.
            item['desc'] = site.xpath('div[@class="site-descr "]/text()').extract()
            items.append(item)
        return items
7. Run the spider again from cmd to collect the items: `scrapy crawl dmoz`