0
点赞
收藏
分享

微信扫一扫

爬取豆瓣电影top250

暮晨夜雪 2022-01-13 阅读 75

1. 用 requests 请求失败,换了好几个 headers 都不行,一直返回 418,所以改用 selenium。
2. 多线程导致写入的结果是乱序的,这里没有再做排序。

from lxml import etree
from selenium import webdriver
import time
import threading

def dis(itable):
    """Print each element of ``itable`` on its own line."""
    for line in map(str, itable):
        print(line)
# Base address of the list; the paging query string is appended to it later.
url = 'https://movie.douban.com/top250'
# cal() is run from several threads at once; this lock keeps each page's
# records together in the shared output file.
_WRITE_LOCK = threading.Lock()


def cal(url, wait_seconds=5, output_path='data3.txt'):
    """Scrape one list page with a real browser and append its movies to a file.

    The page is rendered in Edge through Selenium (the author's notes say
    plain HTTP requests were answered with status 418), parsed with lxml,
    and one record per movie is appended to ``output_path`` as::

        title<TAB>rating<TAB>comment-count
        description
        <blank line>

    Args:
        url: Address of the list page to load.
        wait_seconds: Seconds to wait after navigation so the page can
            finish loading; tune to the network speed.
        output_path: Text file (UTF-8) that the records are appended to.

    Returns:
        None. The only result is the text appended to ``output_path``.
    """
    browser = webdriver.Edge()
    try:
        browser.get(url)
        time.sleep(wait_seconds)  # wait for the page to finish loading
        page_source = browser.page_source
    finally:
        # quit() ends the whole driver session even when loading failed,
        # so no browser/driver process is left behind.
        browser.quit()

    tree = etree.HTML(page_source)

    # Title spans whose text contains '/' are dropped, as in the original
    # filter; only the remaining titles are kept.
    titles = [
        text
        for text in tree.xpath('//span[@class="title"]/text()')
        if '/' not in text
    ]

    # Each description arrives as two consecutive text nodes; join each pair.
    # Pairing by slices (instead of a fixed count of 25) avoids an IndexError
    # when the page holds fewer entries than expected.
    raw_values = tree.xpath('//p[@class=""]/text()')
    values = [
        first + second
        for first, second in zip(raw_values[0::2], raw_values[1::2])
    ]

    # Ratings.
    stars = tree.xpath('//span[@class="rating_num"]/text()')

    # This XPath yields two text nodes per movie; the comment count is the
    # second of each pair, so keep the odd-indexed ones.
    comments = tree.xpath('//div[@class="star"]/span/text()')[1::2]

    # zip() stops at the shortest list, so a partially loaded page yields
    # fewer records rather than an exception.
    text = ''.join(
        title + '\t' + star + '\t' + comment + '\n' + value + '\n\n'
        for title, star, comment, value in zip(titles, stars, comments, values)
    )

    # One write under the lock, so concurrent workers cannot interleave.
    with _WRITE_LOCK:
        with open(output_path, 'a', encoding='utf-8') as f:
            f.write(text)

# Query-string template; the page offset is substituted for ``{}``.
para = '?start={}&filter='

# Truncate the output file up front, because every worker appends to it.
with open('data3.txt', mode='w', encoding='utf-8') as f:
    pass

# Ten pages at offsets 0, 25, ..., 225, fetched in two waves of five threads.
thread = []
for i in range(10):
    start = 25 * i
    url_now = url + para.format(str(start))
    thread.append(threading.Thread(target=cal, args=(url_now,)))

    # Keep collecting workers until a full wave of five is queued.
    if (i + 1) % 5 != 0:
        continue

    for it in thread:
        time.sleep(2)  # stagger the starts; going too fast gets the client blocked
        it.start()
    for it in thread:
        it.join()
    thread.clear()
    

举报

相关推荐

0 条评论