0
点赞
收藏
分享

微信扫一扫

selenium 爬豆瓣帖子

谁知我新 2023-01-13 阅读 100


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 30 19:17:12 2021

@author: ledi
"""

import time
import parsel
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

import requests
from lxml import etree
import datetime

from bs4 import BeautifulSoup
import pandas as pd

# Douban login page; the session cookie obtained here is reused by the
# scraping loop below (group discussion pages require a logged-in session).
url = 'https://accounts.douban.com/passport/login?source=group'

browser = webdriver.Chrome()

# Explicit-wait handle with a 50 s timeout, kept for later element waits.
wait = WebDriverWait(browser, 50)

browser.get(url)

# Block for 30 s so the user can finish the login by hand in the opened
# Chrome window (QR scan / captcha cannot be automated here).
time.sleep(30)

# Accumulates one row per post:
# [post_url, anchor title attribute, column 2 of the listing table,
#  post timestamp text(s)].
# NOTE(review): the original (indentation-mangled) code carried the
# second-to-last whitespace token of the raw <a> tag as the middle field;
# the anchor's `title` attribute is the robust equivalent — confirm against
# downstream consumers of `data`.
data = []
for page_no in range(10000):
    # Douban group discussion lists paginate 25 rows per page.
    offset = 25 * page_no
    url = 'https://www.douban.com/group/707669/discussion?start=' + str(offset)

    browser.get(url)
    page = browser.page_source
    print(page)

    soup = BeautifulSoup(page, "lxml")
    # Each discussion row keeps its post link inside <td class="title"><a ...>.
    title_cells = soup.find_all('td', class_='title')

    # pandas parses the same listing page; table index 1 is the discussion
    # table, and column 2 is carried along with each row (presumably the
    # reply count or author — TODO confirm against the live page layout).
    table_values = pd.read_html(page)[1].values
    time.sleep(3)

    for k, cell in enumerate(title_cells):
        anchor = cell.a
        if anchor is None or not anchor.get('href'):
            # Defensive skip: a title cell without a link cannot be scraped.
            continue
        print(cell)
        # Read attributes directly instead of whitespace-splitting the raw
        # tag string: the original `str(tag).split()` parsing broke whenever
        # attribute order changed or the title contained spaces.
        post_url = anchor.get('href')
        post_title = anchor.get('title', '')
        temp = [post_url, post_title, table_values[:, 2][k]]

        # Open the post itself and pull the publish-time span from its header.
        browser.get(post_url)
        post_tree = etree.HTML(browser.page_source)
        time_spans = post_tree.xpath('//*[@id="topic-content"]/div[2]/h3/span[2]')
        post_time = [span.text.strip() for span in time_spans]

        row = temp + post_time
        data.append(row)
        print(row)
        # Throttle per-post requests to stay polite to the server.
        time.sleep(0.2)


举报

相关推荐

0 条评论