import requests
from lxml import etree
import logging
import time
import json
import os
# Configure the root logger: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

# Sample detail page: https://movie.douban.com/subject/1292052/
BASE_URL = "https://movie.douban.com/"
TOTAL_PAGE = 2            # number of list pages to crawl (25 movies per page)
RESULTS_DIR = "豆瓣TOP250"  # output directory for per-movie JSON files

# exist_ok=True avoids the check-then-create race of the
# `if not os.path.exists(...): os.makedirs(...)` pattern.
os.makedirs(RESULTS_DIR, exist_ok=True)

headers = {
    # fill in your request headers here
}
def scrape_page(url):
    """Fetch *url* and return its HTML text, or None on failure.

    Failures (non-200 status codes, network errors) are logged rather
    than raised, so callers must be prepared for a None return.
    """
    logging.info(f"scraping {url}...")
    try:
        # A timeout keeps a dead or stalled server from hanging the crawl
        # forever; requests.get has no timeout by default.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        # Previously a non-200 response was silently dropped.
        logging.error(f'invalid status code {response.status_code} while scraping {url}')
    except requests.RequestException:
        logging.error(f'error occurred while scraping {url}...')
def scrape_index(page):
    """Return the HTML of list page *page* (1-based, 25 movies per page)."""
    offset = 25 * (page - 1)
    return scrape_page(f"{BASE_URL}top250?start={offset}")
def parse_index(index_data):
    """Extract the detail-page URLs from a list page's HTML."""
    tree = etree.HTML(index_data)
    return tree.xpath('//div[@class="item"]/div/a/@href')
def scrape_detail(url):
    """Return the HTML of a movie detail page (thin wrapper over scrape_page)."""
    return scrape_page(url)
def parse_detail(data_html):
    """Parse a movie detail page's HTML into a dict.

    Returns a dict with 'name', 'score' and 'time' (runtime) keys.
    A value is None when the corresponding element is missing from the
    page, instead of raising IndexError and aborting the whole crawl.
    """
    detail_html = etree.HTML(data_html)

    def _first(path):
        # xpath() returns a list; take the first hit or None rather than
        # bare [0] indexing, which raised on pages missing the field.
        results = detail_html.xpath(path)
        return results[0] if results else None

    name = _first('//h1/span[1]/text()')
    score = _first('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
    # local renamed from `time` to avoid shadowing the imported time module
    runtime = _first('//span[@property="v:runtime"]/text()')
    return {
        'name': name,
        'score': score,
        'time': runtime
    }
def save_data(data):
    """Write *data* to RESULTS_DIR/<name>.json as pretty-printed UTF-8 JSON.

    *data* is the dict produced by parse_detail; its 'name' value becomes
    the filename.
    """
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    # `with` guarantees the file handle is flushed and closed; the original
    # passed an anonymous open() into json.dump and leaked the handle.
    with open(data_path, mode='w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)
def main():
    """Crawl TOTAL_PAGE list pages and save each movie's details as JSON."""
    for page_number in range(1, TOTAL_PAGE + 1):
        detail_urls = parse_index(scrape_index(page_number))
        logging.info(f"scraping {detail_urls}...")
        time.sleep(2)  # throttle between list-page fetches
        for detail_url in detail_urls:
            detail_data = parse_detail(scrape_detail(detail_url))
            time.sleep(2)  # throttle between detail-page fetches
            save_data(detail_data)
            logging.info(f"{detail_url} data saved successfully")


if __name__ == '__main__':
    main()