0
点赞
收藏
分享

微信扫一扫

利用beautifulsoup爬取豆瓣电影top250,存储在mongodb

上善若水的道 2022-07-12 阅读 33

不多说了,上代码:

1 from requests import request
2 from bs4 import BeautifulSoup
3 import re
4 import pymongo
5
6
7
class SpiderDouBan:
    """Scrape the Douban Movie Top 250 list and store one document per movie in MongoDB."""

    def __init__(self):
        # One MongoClient per spider; documents go to spider_db.douban_movie_top250.
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client['spider_db']
        self.collection = db['douban_movie_top250']

    def get_html(self, url):
        '''
        Fetch one listing page and parse it.
        :param url: page URL
        :return: BeautifulSoup document for the page
        '''
        html = request('get', url).text
        return BeautifulSoup(html, 'lxml')

    def get_one_page(self, soup, order):
        '''
        Extract every movie on one page and insert each into MongoDB.
        :param soup: parsed page (BeautifulSoup instance)
        :param order: zero-based rank offset of the first movie on this page
        :return: None
        '''
        # Keep only the Chinese title: alias <span class="title"> entries start with '/'.
        movie_names = [span.string
                       for span in soup.find_all(name='span', attrs={'class': 'title'})
                       if not re.search(r'/', span.string)]
        # The classless <p> holds "director / actors / year / region / genre";
        # strip layout whitespace and split on '/'.
        movie_actors = [re.sub(r'\n|\xa0', '', p.get_text().strip('" " |\n | \xa0')).split('/')
                        for p in soup.find_all(name='p', attrs={'class': ''})]
        movie_rates = [span.string
                       for span in soup.find_all(name='span', attrs={'class': 'rating_num'})]
        # One-line quote ("inq"); some movies have none, so guard by length below.
        short_comments = [re.sub('。', '', span.string) for span in soup.find_all(class_='inq')]
        for index, name in enumerate(movie_names):
            print(f'正在爬取第{order + index + 1}条数据...')
            movie_info = {
                'order': f'No.{order + index + 1}',
                'movie_name': name,
                'movie_type': f'{re.findall("[0-9]+", movie_actors[index][-3])[0]}年/{movie_actors[index][-2]}/{movie_actors[index][-1]}',
                # Full rating, e.g. '9.7分' — indexing [0] would truncate it to '9分'.
                'movie_rate': f'{movie_rates[index]}分',
                # NOTE(review): quotes are matched by list position; a movie without a
                # quote shifts later entries — the guard only prevents IndexError.
                'short_comment': short_comments[index] if index < len(short_comments) else ''
            }
            self.collection.insert_one(movie_info)

    def main(self, url, order):
        '''
        Crawl one page: fetch, parse, and persist its movies.
        :param url: page URL
        :param order: zero-based rank offset of this page
        :return: None
        '''
        soup = self.get_html(url)
        self.get_one_page(soup, order)
61
62
63
64
if __name__ == '__main__':
    # Reuse one spider — and therefore one MongoClient — across all 10 pages,
    # instead of constructing a new client per iteration.
    spider = SpiderDouBan()
    for offset in range(0, 250, 25):
        # The page offset doubles as the zero-based rank of its first movie.
        url = f'https://movie.douban.com/top250?start={offset}'
        spider.main(url, offset)

运行结果:

利用beautifulsoup爬取豆瓣电影top250,存储在mongodb_html

MongoDB存储效果:

利用beautifulsoup爬取豆瓣电影top250,存储在mongodb_html_02

 


举报

相关推荐

0 条评论