Scrapy Framework: Scraping Products from a Website (by Category)

I. Result Screenshot

[Screenshot: scraped product images and name files saved into per-category folders]

 

II. Example Code

1. items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class HongxingItem(scrapy.Item):
    # Define the fields for your item here:
    catname = scrapy.Field()  # product category name
    name = scrapy.Field()     # product name
    ico = scrapy.Field()      # product image URL
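
A HongxingItem behaves like a dict with a fixed set of keys. As a quick sketch of how the spider below fills one in (the field values here are illustrative, not taken from the site):

item = HongxingItem()
item['catname'] = '端子系列'                      # category name, set in parse()
item['name'] = 'example product name'             # product title, set in url_parse()
item['ico'] = 'http://www.hxdy.cn/example.jpg'    # absolute image URL, set in url_parse()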

2. pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests


class HongxingPipeline(object):
    count = 0      # total number of items collected
    cat_dict = {}  # per-category item counters

    def process_item(self, item, spider):
        self.count += 1

        # Create the category folder if it does not exist yet
        if not os.path.exists(item["catname"]):
            os.mkdir(item["catname"])

        # Work out this item's sequence number within its category
        cat_product_count = self.cat_dict.get(item["catname"])
        if cat_product_count is None:
            count = 1
        else:
            count = int(cat_product_count) + 1
        self.cat_dict[item["catname"]] = count

        # Download the product image and save it locally as <category>/<n>.jpg
        response = requests.get(item['ico'])
        file = item["catname"] + '/' + str(count) + ".jpg"
        with open(file, 'wb') as f:
            f.write(response.content)

        # Save the product name next to the image as <category>/<n>.txt
        with open(item["catname"] + '/' + str(count) + '.txt', 'w', encoding='utf-8') as f:
            f.write(item['name'])
        return item

    def close_spider(self, spider):
        print("Total items collected: {0}".format(self.count))
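
As the boilerplate comment above notes, the pipeline only runs once it is registered in ITEM_PIPELINES. A minimal settings.py entry might look like the following, assuming the Scrapy project package is named hongxing (inferred from the class names, not stated in the source):

# settings.py
ITEM_PIPELINES = {
    # The "hongxing" package name is an assumption; use your project's actual module path
    'hongxing.pipelines.HongxingPipeline': 300,
}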

 

3. product.py

# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree

from ..items import HongxingItem


class ProductSpider(scrapy.Spider):
    name = 'product'
    allowed_domains = ['hxdy.cn']
    host = "http://www.hxdy.cn"

    # One entry per product category: a display name plus a paginated
    # list-URL template. start_requests() below iterates over these,
    # so Scrapy's default handling of start_urls is never used.
    start_urls = [
        {"name": '条形连接器', "url": host + '/products.asp?Small_Class=2&page={0}'},
        {"name": '贴片式连接器', "url": host + '/products.asp?Small_Class=3&page={0}'},
        {"name": '车用连接器', "url": host + '/products.asp?Small_Class=4&page={0}'},
        {"name": '洗衣机连接器', "url": host + '/products.asp?Small_Class=5&page={0}'},
        {"name": '空调冰箱插件', "url": host + '/products.asp?Small_Class=6&page={0}'},
        {"name": '保险丝管连接器', "url": host + '/products.asp?Small_Class=7&page={0}'},
        {"name": '电源骨架系列', "url": host + '/products.asp?Small_Class=8&page={0}'},
        {"name": '微波炉连接器', "url": host + '/products.asp?Small_Class=9&page={0}'},
        {"name": '硬护套系列', "url": host + '/products.asp?Small_Class=10&page={0}'},
        {"name": '软护套系列', "url": host + '/products.asp?Small_Class=11&page={0}'},
        {"name": '端子系列', "url": host + '/products.asp?Small_Class=12&page={0}'},
        {"name": '特种连接器', "url": host + '/products.asp?Small_Class=13&page={0}'},
        {"name": '机械手粉碎机', "url": host + '/products.asp?Small_Class=16&page={0}'},
    ]

    # Fetch the first page of a category and read its pagination bar
    # to determine the total number of pages.
    def get_all_page(self, url):
        response = requests.get(url)
        html = etree.HTML(response.content, parser=etree.HTMLParser())
        res = html.xpath('//ul[@class="pagination"]')
        if len(res) > 0:
            # The last <li> links to the final page, e.g. "...&page=7"
            u = res[0].xpath("./li[last()]//a/@href")[0]
            return int(u.split('page=')[1])
        return 1

    def start_requests(self):
        for item in self.start_urls:
            url = item.get('url')
            # Find out how many pages this category has, then request each one
            total_page = self.get_all_page(url.format('1'))
            for page in range(1, total_page + 1):
                link = url.format(str(page))
                yield scrapy.Request(link, callback=self.parse,
                                     meta={"url": link, "name": item.get('name')})

    def parse(self, response):
        meta = response.meta
        print("Currently scraping: {0}".format(meta['url']))

        # Each <li> in the product list links to a product detail page
        for each in response.xpath('//div[@class="product_list wow fadeInUp"]//ul//li'):
            url = each.xpath("./a/@href").extract()[0]
            item = HongxingItem()
            item['catname'] = meta['name']
            yield scrapy.Request(self.host + '/' + url, callback=self.url_parse,
                                 meta={"item": item})

    def url_parse(self, response):
        # Fill in the product name and absolute image URL from the detail page
        item = response.meta['item']
        item['name'] = response.xpath("//div[@class='product_t']//h3//text()").extract()[0]
        item['ico'] = self.host + response.xpath("//div[@id='product_show_01']//img/@src").extract()[0]
        yield item
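
With the three files in place, the spider can be run from the project root using Scrapy's crawl command and the name attribute defined above:

scrapy crawl product

Each category then gets its own folder in the working directory, containing numbered .jpg images and matching .txt files holding the product names.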

 


