Here are hands-on steps for scraping job postings in bulk with the Scrapy crawler framework. First, create a new project and switch into its directory:
scrapy startproject recruitment_spider
cd recruitment_spider
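For reference, startproject generates the standard Scrapy skeleton (exact files can vary slightly between Scrapy versions); the files edited in the following steps live here:

recruitment_spider/
    scrapy.cfg
    recruitment_spider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py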
Next, define the data fields to collect in recruitment_spider/items.py:

import scrapy


class RecruitmentItem(scrapy.Item):
    # One field per attribute of a job posting
    job_title = scrapy.Field()
    company_name = scrapy.Field()
    location = scrapy.Field()
    salary = scrapy.Field()
    job_description = scrapy.Field()
Then create the spider in recruitment_spider/spiders/job_spider.py. It extracts every listing on the page, then follows the "next page" link until no pages remain. The CSS selectors (.job-listing, .job-title, and so on) are placeholders; adjust them to match the markup of the target job board:

import scrapy

from recruitment_spider.items import RecruitmentItem


class JobSpider(scrapy.Spider):
    name = "job_spider"
    start_urls = [
        "https://your-job-board-url.com",
    ]

    def parse(self, response):
        # Locate the container element of each job listing
        job_listings = response.css('.job-listing')
        for job in job_listings:
            item = RecruitmentItem()
            item['job_title'] = job.css('.job-title::text').get()
            item['company_name'] = job.css('.company-name::text').get()
            item['location'] = job.css('.location::text').get()
            item['salary'] = job.css('.salary::text').get()
            item['job_description'] = job.css('.job-description::text').get()
            yield item

        # Follow the next-page link and keep crawling
        next_page = response.css('.next-page-link::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
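Before running the full crawl, the selectors can be verified interactively with scrapy shell. The selectors below are the same placeholders used in the spider:

scrapy shell "https://your-job-board-url.com"

# Inside the shell, try the selectors against the live response:
>>> response.css('.job-listing')
>>> response.css('.job-title::text').get()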
In recruitment_spider/settings.py, add a download delay and a user agent so the crawl stays polite and looks like an ordinary browser:

# Wait 2 seconds between requests
DOWNLOAD_DELAY = 2

# Identify the crawler with a browser user-agent string
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
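Optionally, Scrapy's built-in AutoThrottle extension can adapt the delay to server load instead of using a fixed value; a minimal sketch for settings.py (the delay values are illustrative):

# Respect robots.txt (enabled by default in new projects)
ROBOTSTXT_OBEY = True

# Let Scrapy adjust the request delay dynamically
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10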
Run the spider from the project root:

scrapy crawl job_spider
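If a quick export is all that is needed, Scrapy's built-in feed exports can write the results directly, with no custom pipeline:

scrapy crawl job_spider -o jobs.json

(In recent Scrapy versions, -o appends to an existing file while -O overwrites it.)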
To save the results as JSON Lines yourself, add a pipeline to recruitment_spider/pipelines.py:

import json


class JsonWriterPipeline:
    def open_spider(self, spider):
        # Open the output file once, when the crawl starts
        self.file = open('jobs.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese text readable in the output
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
Finally, enable the pipeline in recruitment_spider/settings.py:

ITEM_PIPELINES = {
    'recruitment_spider.pipelines.JsonWriterPipeline': 300,
}
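As an optional extension beyond the original steps, a small validation pipeline could drop listings that are missing a title before they reach the JSON writer. A minimal sketch, assuming it lives in the same pipelines.py; ValidationPipeline is a hypothetical name:

from scrapy.exceptions import DropItem


class ValidationPipeline:
    def process_item(self, item, spider):
        # Discard listings without a job title; tidy whitespace otherwise
        if not item.get('job_title'):
            raise DropItem("Missing job_title")
        item['job_title'] = item['job_title'].strip()
        return item

To run it before the writer, register it in ITEM_PIPELINES with a priority number below 300, e.g. 'recruitment_spider.pipelines.ValidationPipeline': 200.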