0
点赞
收藏
分享

微信扫一扫

Selenium+xpath爬取简书

from selenium import webdriver
 import time
 from lxml import etree
 import pymysqldriver = webdriver.Chrome()
 driver.get('https://www.jianshu.com/')#加载更多
 def load_mord(num):
     #通过观察发现,打开页面需要鼠标滑动大概5次左右才能出现阅读更多按钮
     for x in range(5):
         js = "var q=document.documentElement.scrollTop=100000"
         driver.execute_script(js)
         time.sleep(2)
     if num==0:
         time.sleep(2)
     #定位并点击加载更多
     load_more = driver.find_element_by_class_name("load-more")
     load_more.click()
 #获取内容源码
 def get_html():
     note_list=driver.find_element_by_class_name("note-list")
     html=note_list.get_attribute('innerHTML')
     return html#传入内容网页源码,使用xpath提取信息标题、简介、发布昵称
 def extract_data(content_html):
     html = etree.HTML(content_html)
     title_list =html.xpath('//li//a[@class="title"]/text()')
     abstract_list=html.xpath('//li//p[@class="abstract"]/text()')
     nickname_list=html.xpath('//li//a[@class="nickname"]/text()')
     data_list=[]
     for index,x in enumerate(title_list):
         item={}
         item["title"]=title_list[index]
         item["abstract"] = abstract_list[index]
         item["nickname"] = nickname_list[index]
         data_list.append(item)
     return data_list#保存到mysql数据库
 def insert_data(sql):
     db = pymysql.connect("127.0.0.1", "root", "123456lyl", "xs_db", charset="utf8")
     try:
         cursor = db.cursor()
         return cursor.execute(sql)
     except Exception as ex:
         print(ex)
     finally:
         db.commit()
         db.close()#模拟点击10次阅读更多按钮
 for x in range(2):
     print("模拟点击加载更多第 {} 次".format(str(x)))
     load_mord(x)
     time.sleep(1)resuts=extract_data(get_html())
 for item in resuts:
     sql="insert into tb_test(title,abstract,nickname) values('%s','%s','%s')" \
         ""%(item["title"],item["abstract"],item["nickname"])
     insert_data(sql)

举报

相关推荐

0 条评论