Scraping Google Scholar results and downloading the PDFs with multiple threads

三维控件研究 · 2022-03-11

search_download.py is as follows:

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time,queue,threading
import xlwt,os
from time import sleep
from tqdm import tqdm
from Download import Download

TotalNum=0
class Article(object):
    """Holds the fields scraped for one search result."""
    def __init__(self):
        self.title = "New Paper"
        self.article_link = ""
        self.journal = ""
        self.authors = ""
        self.authors_link = ""
        self.abstract = ""

def save_xls(sheet, paper):
    # write one paper's fields into the next free row of the sheet
    global TotalNum
    sheet.write(TotalNum, 0, TotalNum)
    sheet.write(TotalNum, 1, paper.title)
    sheet.write(TotalNum, 2, paper.article_link)
    sheet.write(TotalNum, 3, paper.journal)
    sheet.write(TotalNum, 4, paper.authors_link)
    sheet.write(TotalNum, 5, paper.abstract)
    TotalNum += 1

head = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        }  # updated 2021-06-07 to avoid HTTP 403 errors

article_titles = []
article_links = []

def GetInfo(sheet,html):
  soup = BeautifulSoup(html, "html.parser")
  #print("\n"+soup)
  articles = soup.find_all(class_="gs_ri")
  for article in articles:
      paper =Article()
      try:
          title = article.find('h3')
          paper.title = title.text
          try:
              paper.article_link = title.a.get('href')
              article_titles.append(paper.title)
              article_links.append(paper.article_link)
          except:
              continue
          #print("\n"+paper.title)
          #print("\n"+paper.article_link)
          journal = article.find(class_="gs_a")
          paper.journal =journal.text
          #print("\n"+paper.authors)
          authors_addrs = journal.find_all('a')
          for authors_addr in authors_addrs:
              #print("\n"+authors_addr.get('href'))
              paper.authors_link=paper.authors_link +(authors_addr.get('href'))+"\n"

          abstract = article.find(class_="gs_rs")
          paper.abstract = abstract.text
          #print("\n"+paper.abstract)
      except:
          continue
      save_xls(sheet,paper)
  return

exitFlag = 0
class myThread(threading.Thread):
    def __init__(self, queueLock, queue):
        threading.Thread.__init__(self)
        self.queueLock = queueLock
        self.queue = queue

    def run(self):
        euDownload(self.queueLock, self.queue)

def euDownload(queueLock, queue):
    # Worker loop: keep taking (url, path) pairs off the queue and downloading
    # them until the main thread sets exitFlag.
    while not exitFlag:
        queueLock.acquire()
        if not queue.empty():
            url, path = queue.get()
            queueLock.release()
            try:
                Download.getPDF(url, path)
            except:
                pass  # keep going; the 3-second pacing below still applies
        else:
            queueLock.release()
        time.sleep(3)

if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'PaperInfo', True)
    column = ['No.', 'Title', 'Article Link', 'Journal', 'Author Links', 'Abstract']
    for i in range(0, len(column)):
        sheet1.write(TotalNum, i, column[i])
    TotalNum += 1

    keyword = input("Enter keywords:\n")
    # e.g. diabetes and conjunctiva and (microcirculation or microvasculature)
    # e.g. symfony and ((high myopia) or (long axis))
    key = keyword.replace(" ", "+")
    if not os.path.exists(".\\Info"):
        os.makedirs(".\\Info")  # the workbook is saved here, so make sure the folder exists
    info = ".\\Info\\" + keyword + "_PaperInfo.xls"
    ## Search
    print("\n" + "Searching…")
    if os.path.exists(info):
        print("\n" + "PaperInfo already exists!")
    else:
        desired_capabilities = DesiredCapabilities.CHROME
        desired_capabilities["pageLoadStrategy"] = "none"  # these two lines save a lot of page-load time
        option = webdriver.ChromeOptions()
        chrome_dir = os.getcwd() + '/Chrome/'
        option.add_argument("--user-data-dir=" + chrome_dir)
        browser = webdriver.Chrome(options=option, executable_path='./chromedriver.exe')

        start = 0
        for i in tqdm(range(10)):  # first 10 result pages, 10 hits each
            url = 'https://xs.dailyheadlines.cc/scholar?start=' + str(start) + '&q=' + key + '&hl=zh-CN&as_sdt=0,5'
            start = start + 10
            browser.get(url)
            time.sleep(3)  # sleep 3 s so the page can finish loading
            html = browser.page_source
            GetInfo(sheet1, html)
            myxls.save(info)
            sleep(0.5)
        browser.close()
    print("\n" + "Search finished")

    ## Download
    print("\n" + "Downloading…")
    if len(article_titles) == 0:
        # the search step was skipped, so reload titles and links from the saved workbook
        import xlrd
        data = xlrd.open_workbook(info)
        table = data.sheet_by_index(0)
        article_titles = table.col_values(1)[1:]
        article_links = table.col_values(2)[1:]
    # output folder for the PDFs
    dir = ".\\Articles\\" + keyword + "\\"
    if not os.path.exists(dir):
        os.makedirs(dir)
    queueLock = threading.Lock()
    article_num = len(article_titles)
    workQueue = queue.Queue(article_num)
    threads = []
    # start the worker threads
    for i in range(25):
        thread = myThread(queueLock, workQueue)
        thread.start()
        threads.append(thread)
    # fill the work queue while holding the lock
    queueLock.acquire()
    for k in range(article_num):
        article_title = "{0}".format(article_titles[k].replace(':', ' ')).replace('.', '')
        path = dir + article_title + ".pdf"
        workQueue.put((article_links[k], path))
    queueLock.release()
    # wait for the queue to drain
    while not workQueue.empty():
        pass
    # tell the worker threads to exit
    exitFlag = 1
    # wait for all threads to finish
    for t in threads:
        t.join()
    print("\n" + "Download finished")

Download.py is as follows:

import os.path
import re
import requests
from bs4 import BeautifulSoup
class Download:

    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        }  # updated 2021-06-07 to avoid HTTP 403 errors

    def pdf_hub(url,path):
        try:
            pdf = requests.get(url, headers=Download.head)
            with open(path, "wb") as f:
                f.write(pdf.content)
            print("\n"+"pdf found directly!")
        except:
            print("\n"+"failed to download pdf directly!\n" +url)
            Download.err_log(url)
    def sci_hub(path,doi):
        doi = str(doi).split("https://doi.org/")[1]
        url = "https://www.sci-hub.ren/doi:" + doi + "#"
        r = requests.get(url, headers=Download.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        download_url = soup.iframe.attrs["src"]
        try:
            download_r = requests.get(download_url, headers=Download.head)
            download_r.raise_for_status()
            with open(path, "wb+") as temp:
                temp.write(download_r.content)
                print("\n"+"Article downloaded by doi!")
        except:
            print("\n"+"failed to download pdf by doi!\n" +url)
            Download.err_log(url)

    def err_log(url):
        with open("download_err.txt", "a+", encoding="utf-8") as error:
            error.write("PDF not found,download link may be: \n"+url +"\n")

    def getSoup(url):
        r = requests.get(url, headers=Download.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        return soup

    def getPDF(url,path):
        if os.path.exists(path):
            print("\n" + "Article already exists")
        else:
            if (len(re.findall('pdf', url)) != 0):
                print ("\n"+'pdf link already!')
                Download.pdf_hub(url,path)
            elif re.match("https://www.sci-hub.ren/",url):
                print("\n" + 'sci_hub link!')
                url = str(url).replace("https://www.sci-hub.ren/","https://doi.org/")
                Download.sci_hub(path,url)
            #if pdf can be easily found!
            elif re.match("https://academic.oup.com/", url):
                soup = Download.getSoup(url)
                pdf_link ="https://academic.oup.com"+soup.find(class_="al-link pdf article-pdfLink").get('href')
                #print("\n"+pdf_link)
                Download.pdf_hub(pdf_link,path)
                '''
                doi = soup.select('div[class="ww-citation-primary"]')[0].a.get('href')
                #print("\n"+doi)
                Download.sci_hub(path,doi)
                '''
            elif re.match("https://content.iospress.com/", url):
                soup = Download.getSoup(url)
                pdf_link = soup.find(class_="btn btn-download btn-right get-pdf").get('href')
                # print("\n"+pdf_link)
                Download.pdf_hub(pdf_link, path)
            elif re.match("https://wwwnature.53yu.com/", url):
                soup = Download.getSoup(url)
                pdf_link = soup.find(class_="c-pdf-download__link").get('href')
                #print("\n"+pdf_link)
                Download.pdf_hub(pdf_link, path)
            elif re.match("https://bjo.bmj.com/", url):
                soup = Download.getSoup(url)
                pdf_link = soup.find(class_="article-pdf-download").get('href')
                pdf_link = "https://bjo.bmj.com" + pdf_link
                #print("\n"+pdf_link)
                Download.pdf_hub(pdf_link,path)
            elif re.match("https://jamanetwork.com/", url):
                soup = Download.getSoup(url)
                pdf_link = soup.find(class_="toolbar-tool toolbar-pdf al-link pdfaccess").get('data-article-url')
                pdf_link = "https://jamanetwork.com" + pdf_link
                #print("\n"+pdf_link)
                Download.pdf_hub(pdf_link, path)

            # if pdf can't be easily found,but doi can!
            elif re.match("https://sciencedirect.53yu.com/", url):
                soup = Download.getSoup(url)
                doi = soup.find(class_="doi").get('href')
                Download.sci_hub(path, doi)
            elif re.match("https://diabetes.diabetesjournals.org/", url):
                soup = Download.getSoup(url)
                doi = soup.select('.citation-doi')[0].a.get('href')
                Download.sci_hub(path, doi)
            elif re.match("https://journals.lww.com/", url):
                soup = Download.getSoup(url)
                doi = "https://doi.org/" + str(soup.find(id="ej-journal-doi").text).split("doi: ")[1]
                Download.sci_hub(path, doi)
            else:
                '''
                https://europepmc.org/
                https://iovs.arvojournals.org/
                https://linkspringer.53yu.com/
                '''
                print("\n"+"To be prettified!Download link may be: " +"\n" +url)
                Download.err_log(url)

if __name__ == '__main__':

    url = "https://www.nature.com/articles/s41598-021-87315-7.pdf"
    url1 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    url2 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    Download.getPDF(url,"test.pdf")
    Download.getPDF(url1,"test1.pdf")
    Download.getPDF(url2,"test2.pdf")
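
One caveat worth noting: every requests.get call in Download.py runs without a timeout, so a single unresponsive mirror can hang a download thread indefinitely. Below is a small sketch of a retrying fetch helper that pdf_hub, sci_hub and getSoup could call instead; the 10-second timeout and the three attempts are arbitrary choices, not values from the original code.

import time
import requests

def fetch(url, head, retries=3, timeout=10):
    # Retry wrapper around requests.get with an explicit timeout, so a
    # stalled connection cannot block a download thread forever.
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=head, timeout=timeout)
            r.raise_for_status()
            return r
        except requests.RequestException as e:
            print("attempt %d/%d failed for %s: %s" % (attempt, retries, url, e))
            time.sleep(2)
    raise RuntimeError("giving up on " + url)

Swapping requests.get(url, headers=Download.head) for fetch(url, Download.head) in those three methods adds the timeout and retries without touching anything else in the class.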
