直接上代码
1 from pyquery import PyQuery as pq
2 from requests import request
3 import re
4 import pymongo
5
6
class SpiderDouBan2:
    """Scrape the Douban movie Top 250 list pages into a MongoDB collection."""

    def __init__(self):
        """Connect to the local MongoDB server and select the target collection."""
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client['spider_db']
        self.collection = db['douban_movie_top250_2']

    def get_doc(self, url):
        """Load one list page as a PyQuery document.

        :param url: address of the page to load
        :return: the PyQuery document built from ``url``
        """
        return pq(url)

    @staticmethod
    def _build_movie_info(rank, name, info_text, rate, comment_num, short_comment):
        """Assemble the MongoDB record for one movie from its raw text fields.

        :param rank: 1-based position of the movie in the whole list
        :param name: movie title
        :param info_text: raw text of the details paragraph; its first line
            is stored as ``movie_actor`` and its second as ``movie_type``
        :param rate: rating text
        :param comment_num: text of the rating-count element
        :param short_comment: one-line quote; its last character is dropped
        :return: dict ready to be inserted into the collection
        """
        # Same cleanup pattern as before: removes non-breaking spaces and the
        # literal three-character sequence `" "`.
        # NOTE(review): a plain space may have been intended -- confirm.
        lines = re.sub('\xa0|" "', '', info_text).split('\n')
        return {
            'order': f'No.{rank}',
            'movie_name': name,
            # Strips trailing backslashes and slashes: the same character set
            # as before, written without an invalid escape sequence.
            'movie_actor': lines[0].rstrip('\\/'),
            # A single-line details paragraph yields an empty type instead of
            # raising IndexError.
            'movie_type': lines[1] if len(lines) > 1 else '',
            'movie_rate': f'{rate}分',
            'comment_num': comment_num,
            'short_comment': short_comment[:-1],
        }

    def get_one_page(self, doc, order):
        """Extract every movie on one page and insert each one into MongoDB.

        Fields are read per movie entry rather than from page-wide parallel
        lists, so a movie that lacks an optional element (for example the
        one-line quote) can neither shift the data of the movies after it nor
        raise IndexError.

        :param doc: PyQuery document of one list page
        :param order: number of movies preceding this page; ranks on this
            page start at ``order + 1``
        :return: None
        """
        rank = order
        # NOTE(review): assumes each `.info` element wraps exactly one movie's
        # title, details paragraph, rating and quote -- confirm against the
        # page markup.
        for info in doc('.info').items():
            # Title elements containing "/" are skipped, as before; the first
            # remaining one is the movie name.
            titles = (title.text() for title in info('.title').items())
            name = next((text for text in titles if '/' not in text), None)
            if name is None:
                continue
            rank += 1
            print(f'正在爬取第{rank}条数据...')
            # `or ''` guards against a missing element whose text is not a str.
            movie_info = self._build_movie_info(
                rank,
                name,
                info('.bd')('p:first-child').text() or '',
                info('.rating_num').text() or '',
                info('.star')('span:last-child').text() or '',
                info('.inq').text() or '',
            )
            self.collection.insert_one(movie_info)

    def main(self, url, order):
        """Download one list page and store all of its movies.

        :param url: address of the list page
        :param order: number of movies preceding this page
        :return: None
        """
        doc = self.get_doc(url)
        self.get_one_page(doc, order)
59
60
if __name__ == '__main__':
    # One spider, and therefore one MongoDB client, serves every page;
    # constructing it inside the loop opened a new client on each iteration.
    spider = SpiderDouBan2()
    # Offsets 0, 25, ..., 225 are passed both as the `start` query parameter
    # and as the count of movies preceding the page.
    for offset in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={offset}'
        spider.main(url, offset)
运行结果