nature爬虫-CFANZ编程社区

nature爬虫
c一段旅程c 2022-03-30 阅读 23
就是备份一下
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests
import re
from io import StringIO
from io import open
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
 
#hang=0
# Numeric suffixes of the Nature article IDs to process
# (nature10006 .. nature10010).
ARTICLE_NUMBERS = range(10006, 10011)

# Article page that is scraped for the abstract; %d is the numeric suffix.
ARTICLE_URL_TEMPLATE = 'https://www.nature.com/articles/nature%d#accession-codes.html'

# Everything from the last occurrence of this word onwards is dropped from
# the text extracted from the PDF.
END_OF_BODY_MARKER = 'Received'

# Seconds to wait for nature.com before giving up instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 30

# "."-separated fragments of this length or shorter never count as sentences.
MIN_SENTENCE_LENGTH = 20


def _extract_pdf_text(pdf):
    """Return the text pdfminer extracts from the open binary PDF file *pdf*."""
    resource_manager = PDFResourceManager()
    with StringIO() as buffer:
        device = TextConverter(resource_manager, buffer, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, pdf)
        finally:
            # Release the converter even when pdfminer fails on a broken PDF.
            device.close()
        return buffer.getvalue()


def _body_text(pdf_text):
    """Cut *pdf_text* at the last end-of-body marker and flatten newlines.

    When the marker does not occur at all the whole text is kept; the
    original code raised IndexError in that case.
    """
    cut = pdf_text.rfind(END_OF_BODY_MARKER)
    if cut != -1:
        pdf_text = pdf_text[:cut]
    return pdf_text.replace("\n", " ")


def _fetch_abstract(article_number):
    """Download the article page and return its abstract, one sentence per line.

    The abstract is the ``div`` with class ``c-article-section__content`` and
    id ``Abs2-content``.  Tags are stripped from the string form of the
    search result (so the surrounding list brackets remain in the text) and
    a newline is inserted after every ".", "?" and "!".
    """
    response = requests.get(ARTICLE_URL_TEMPLATE % article_number,
                            timeout=REQUEST_TIMEOUT_SECONDS)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    sections = soup.find_all("div", class_="c-article-section__content",
                             id="Abs2-content")
    abstract = re.sub(r"<.*?>", "", str(sections))
    for mark in ".?!":
        abstract = abstract.replace(mark, mark + "\n")
    return abstract


def _drop_leading_sentences(body, sentence_count):
    """Return *body* without its first *sentence_count* sentences.

    *body* is split on "."; a fragment counts as a sentence when it is
    longer than MIN_SENTENCE_LENGTH and neither of its first two characters
    is a digit.  Every fragment up to and including the *sentence_count*-th
    sentence is discarded.  After that each sentence starts a new line,
    while the other fragments are appended to the current line (presumably
    so that numbers broken apart by the split stay together -- confirm).
    Each kept fragment gets its "." back.
    """
    pieces = []
    remaining = sentence_count
    for fragment in body.split("."):
        is_sentence = (len(fragment) > MIN_SENTENCE_LENGTH
                       and not fragment[0].isdigit()
                       and not fragment[1].isdigit())
        if is_sentence:
            remaining -= 1
            if remaining < 0:
                pieces.append("\n")
        if remaining < 0:
            # Full-width marks become ASCII plus a line break.  The original
            # turned the full-width question mark into ".", losing the mark.
            fragment = fragment.replace("！", "!\n").replace("？", "?\n")
            pieces.append(fragment)
            pieces.append(".")
    return "".join(pieces)


def read_pdf(pdf, article_number=ARTICLE_NUMBERS[-1]):
    """Write ``<article_number>.txt`` from an article PDF and its web abstract.

    The PDF text is cut at the last "Received", the abstract is scraped from
    the article page, and as many leading sentences as the abstract has
    lines are dropped from the PDF text (assumed to be the abstract repeated
    in the PDF -- confirm).  The remaining text is written to the output
    file, followed by a "highlight:" section holding the abstract.  The
    abstract and its line count are also printed.

    Args:
        pdf: PDF file object opened in binary mode.
        article_number: numeric suffix of the article ID (``10006`` for
            ``nature10006``).  Defaults to the last entry of
            ARTICLE_NUMBERS so one-argument calls keep working; callers
            should pass it explicitly.
    """
    body = _body_text(_extract_pdf_text(pdf))

    abstract = _fetch_abstract(article_number)
    sentence_count = abstract.count("\n")
    print(abstract)
    print(sentence_count)

    remainder = _drop_leading_sentences(body, sentence_count)
    # errors='ignore' silently drops characters gb18030 cannot encode.
    with open("%d.txt" % article_number, 'w', encoding='gb18030',
              errors='ignore') as out:
        out.write(remainder)
        out.write("\n\n\nhighlight:\n")
        out.write(abstract)


def _main(article_numbers=ARTICLE_NUMBERS):
    """Process ``nature<n>.pdf`` from the working directory for every number."""
    for article_number in article_numbers:
        with open('nature%d.pdf' % article_number, "rb") as my_pdf:
            read_pdf(my_pdf, article_number)


if __name__ == '__main__':
    _main()
0 条评论