0
点赞
收藏
分享

微信扫一扫

爬豆瓣小组

唯米天空 2023-01-13 阅读 60


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 30 17:41:40 2021

@author: ledi
"""

import requests
from lxml import etree
import datetime

from bs4 import BeautifulSoup
# Browser-like User-Agent so Douban does not reject the request outright.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}

# Set the save path (save to a specific folder; paste the path here).
# path = input("请输入保存路径:")

# Output directory for any downloads; currently the working directory.
path='./'
# Wall-clock start, kept for timing the whole scrape.
start_time = datetime.datetime.now()


# Douban group front page; each discussion row is a <td class="title">.
url = 'https://www.douban.com/group/707669/'

# Paginated variant (offset in steps of 25/50):
# url='https://www.douban.com/group/707669/discussion?start=50'


# Session payload copied from a logged-in browser session.
# NOTE(review): sending a body with a GET request is unusual and most
# likely ignored by the server -- confirm whether a Cookie header is what
# is actually required for authenticated access.
data = {'channel': 'notification', 'user': 171179645,
        'auth': '171179645_1630319952:4c75d6c238952c300674a7ac028cf7ad2ba527ce'}

# Fetch the page once. (The original issued the identical request twice,
# back to back, and threw away the first response.)
html = requests.get(url, headers=headers, data=data).text

soup = BeautifulSoup(html, "lxml")

# Every discussion row: <td class="title"><a href="..." title="...">...</a></td>
div_list = soup.find_all('td', class_='title')

# //*[@id="topic-content"]/div[2]/h3/span[2]
#topic-content > div.topic-doc > h3 > span.create-time.color-green


import time

# One row per topic: [topic_url, topic_title, create_time].
# NOTE(review): this list reuses (and clobbers) the module-level name
# `data` that held the request payload above; the name is kept so any
# later code reading `data` still works.
data = []
for cell in div_list:
    link = cell.a
    if link is None:
        # Defensive: skip malformed rows instead of crashing on `.a`.
        continue

    # Read the anchor's attributes directly. The original converted the
    # tag to a string, split on whitespace, and indexed into the tokens
    # (c[2].split('=')[1].split('"')[1]), which breaks as soon as the
    # attribute order or the title wording changes.
    topic_url = link.get('href')
    topic_title = link.get('title') or link.get_text(strip=True)
    temp = [topic_url, topic_title]

    # Fetch the topic page and pull its creation timestamp
    # (//*[@id="topic-content"]/div[2]/h3/span[2], the green
    # "create-time" span on Douban topic pages).
    temp_html = requests.get(topic_url, headers=headers).text
    et_html = etree.HTML(temp_html)
    urls = et_html.xpath("""//*[@id="topic-content"]/div[2]/h3/span[2]""")
    this_time = [each.text.strip() for each in urls]

    kkp = temp + this_time
    data.append(kkp)
    print(kkp)

    # Be polite to the server: at most one topic request per second.
    time.sleep(1)








# soup = BeautifulSoup(html, 'html.parser')
# map_node = soup.find_all("tbody")
# div_list = soup.find_all('tr', class_='title')


# for row in soup.select('tbody tr'):
# row_text = [x.text for x in row.find_all('td')]
# print(', '.join(row_text)) # You can save or print this string however you want.

# html = etree.HTML(rep)
# # 获取电影封面图 电影名称 xpath定位提取 得到的是列表
# src = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/img/@src')
# name = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/img/@alt')
# # 保存到本地
# for src, name in zip(src, name):
# file_name = name + ".jpg"
# img = requests.get(src, headers=headers).content
# with open(path + "/" + file_name, "wb") as f:
# f.write(img)


# if __name__ == "__main__":
# # 列表推导式得到url列表 10页的电影信息 Top250
# url_list = ["https://movie.douban.com/top250?start={}&filter=".format(x * 25) for x in range(10)]
# for url in url_list:
# get_pic(url)
# delta = (datetime.datetime.now() - start_time).total_seconds()
# print("抓取250张电影封面图用时:{}s".format(delta))


举报

相关推荐

0 条评论