import parsel
import requests

# Shared request headers: a desktop-browser User-Agent so Douban does not
# reject the request as an automated client.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
    )
}


def _fetch(url):
    """Download *url* and return a parsel Selector over the response body.

    A timeout is set so a stalled connection cannot hang the whole crawl.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    return parsel.Selector(response.text)


def _first_text(selector, *xpaths):
    """Return the first non-empty, stripped text matched by any of *xpaths*.

    Returns "" when nothing matches.  This replaces the original
    ``.get().strip() or ...`` chain, which crashed with AttributeError
    whenever the first XPath matched nothing (``.get()`` returns None).
    """
    for xp in xpaths:
        text = selector.xpath(xp).get()
        if text and text.strip():
            return text.strip()
    return ""


def scrape_detail(url):
    """Scrape one movie detail page and return its fields as a dict.

    Only a subset of the page is extracted (title, director, up to three
    genres, runtime, synopsis); add further XPath expressions here for
    additional fields.
    """
    selector = _fetch(url)
    title = selector.xpath('.//div[@id = "content"]/h1/span[1]/text()').get()
    director = selector.xpath('.//span[@class = "attrs"]/a/text()').get()
    # Up to three genre tags; pages with fewer genres yield None entries,
    # matching the original (type, type2, type3) tuple behaviour.
    genres = tuple(
        selector.xpath(f'.//span[@property = "v:genre"][{i}]/text()').get()
        for i in range(1, 4)
    )
    running_time = selector.xpath('.//span[@property = "v:runtime"]/text()').get()
    # Synopsis: prefer the short blurb, fall back to the full indent block.
    synopsis = _first_text(
        selector,
        './/span[@class = "short"]/span/text()',
        './/div[@class = "indent"]/span/text()',
    )
    return {
        "title": title,
        "director": director,
        "genres": genres,
        "running_time": running_time,
        "synopsis": synopsis,
    }


def main():
    """Crawl all 10 Top-250 list pages and print each movie's synopsis."""
    # Douban paginates 25 movies per page; start=0,25,...,225 covers all 10 pages.
    for page in range(0, 226, 25):
        list_url = f"https://movie.douban.com/top250?start={page}&filter="
        selector = _fetch(list_url)
        # Each <li> is one movie; the "pic" anchor links to its detail page.
        for li in selector.xpath('//ol[@class = "grid_view"]/li'):
            for link in li.xpath('.//div[@class = "pic"]/a/@href').extract():
                movie = scrape_detail(link)
                print(movie["synopsis"])


if __name__ == "__main__":
    main()
整体思路是：先获取 250 部电影详情页的链接地址，再逐个请求这些地址来抓取页面里的详细信息。这里抓取的内容并不完整，只保留了一个示例方法，其余字段需要自行编写 XPath 表达式来获取。
至于数据的保存，用 CSV 或 MySQL 会比较方便，也容易实现。
略略略....................