0
点赞
收藏
分享

微信扫一扫

Python-1: ISBN豆瓣提取书籍信息

_karen 2022-03-12 阅读 84
python

在这里插入图片描述

通过 ISBN 从豆瓣提取书籍信息,基于 Python 3.8

#python 3.8
import requests
import re 


def find3(txt, str1, str2, str3, str4):
    """Extract a field with ``find2`` and trim it between two delimiters.

    The raw field is obtained from ``find2(txt, str1, str2)``. The result is
    then narrowed to the text that follows the first ``str3`` and precedes
    the next ``str4``.

    Args:
        txt: Text to search.
        str1: Start marker handed to ``find2``.
        str2: End marker handed to ``find2``.
        str3: Opening delimiter; everything up to and including it is
            removed. If it is absent the field is left untouched.
        str4: Closing delimiter; it and everything after it are removed.
            If it is absent the remaining text is kept whole.

    Returns:
        The trimmed field as a string.
    """
    a = find2(txt, str1, str2)

    k1 = a.find(str3)
    if k1 != -1:
        # Skip the whole delimiter, not just one character, so multi-character
        # delimiters work as well as the single-character ones used today.
        a = a[k1 + len(str3):]

    k1 = a.find(str4)
    if k1 != -1:
        # Slicing with the -1 that find() returns on a miss would silently
        # chop the last character, so only cut when the delimiter exists.
        a = a[:k1]
    return a


def find2(txt, str1, str2, offset=11, limit=200):
    """Return the text that follows the first match of ``str1`` up to ``str2``.

    A window of ``txt`` is taken relative to the start of the first match of
    ``str1``: it begins ``offset`` characters after the match start and ends
    ``limit`` characters after the match start. The window is then cut at the
    first occurrence of ``str2``.

    Args:
        txt: Text to search.
        str1: Start marker. It is used as a regular expression pattern, so
            regex metacharacters in it are interpreted, not matched literally.
        str2: End marker, matched literally inside the window.
        offset: Number of characters to skip from the start of the match
            before the window begins (default 11, the original fixed value).
        limit: Distance from the start of the match at which the window ends
            (default 200, the original fixed value).

    Returns:
        The extracted text. An empty string if ``str1`` does not occur in
        ``txt``. The whole window if ``str2`` does not occur inside it.
    """
    match = re.search(str1, txt)
    if match is None:
        # No marker on the page: report "nothing found" instead of failing
        # with an IndexError on an empty match list.
        return ""

    start = match.start()
    window = txt[start + offset:start + limit]

    end = window.find(str2)
    if end == -1:
        # find() returns -1 on a miss; slicing with it would drop the last
        # character of the window, so keep the window intact instead.
        return window
    return window[:end]

def ifind(somestr,sub):
    """Return the start index of every non-overlapping match of ``sub``.

    ``sub`` is handed to ``re.finditer``, so it is interpreted as a regular
    expression rather than a literal substring. The indices are returned in
    the order the matches occur; the list is empty when nothing matches.
    """
    positions = []
    for hit in re.finditer(sub, somestr):
        positions.append(hit.start())
    return positions

def jwrite(out_file, str0):
    """Write ``str0`` to two files: one UTF-8 encoded, one GBK encoded.

    Args:
        out_file: Path prefix. ``<out_file>.txt`` receives the UTF-8 text and
            ``<out_file>_gbk.txt`` receives the GBK bytes.
        str0: Text to write. Characters that cannot be represented in GBK
            are dropped from the GBK copy.
    """
    # Context managers guarantee both handles are closed even if a write fails.
    with open(out_file + '.txt', 'w', encoding='utf-8') as text_file:
        text_file.write(str0)

    # The GBK copy is written in binary mode so the bytes land on disk exactly
    # as encoded, without any newline translation.
    with open(out_file + '_gbk.txt', 'wb') as text_file:
        text_file.write(str0.encode('gbk', 'ignore'))
    
def jfind(str0,str1,str2):
    """Return the slice of ``str0`` from the first ``str1`` to the first ``str2``.

    The slice starts at the position where ``str1`` begins (so ``str1`` itself
    is included) and stops just before ``str2``. Both positions come straight
    from ``str.find``; a marker that is not found yields -1, which is used
    as-is as a slice index.
    """
    begin = str0.find(str1)
    stop = str0.find(str2)
    return str0[begin:stop]

# Read the ISBN to look up from the first line of ./ISBN.txt.
# readline() keeps the trailing newline, which must not end up in the URL,
# so surrounding whitespace is stripped.
with open("./ISBN.txt") as f:
    isbn = f.readline().strip()
# isbn = '9787111608974'   # example: hard-code an ISBN instead of reading the file
html = "https://douban.com/isbn/" + isbn
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67'
}
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops early with a clear HTTP error instead of trying to
# parse an error page.
h = requests.get(html, headers=headers, timeout=10)
h.raise_for_status()
h.encoding = 'utf-8'

txt = h.text

# jwrite('out',txt)   # debugging aid: dump the raw page to disk


# Each field is located by a marker in the page source, printed, and written
# to its own pair of text files via jwrite().

# Title: taken from the text around the 'og:title' marker, between quotes.
a = find3(txt, 'og:title', 'og:description', '"', '"')
print('书名:'+a)
jwrite('title', a)

# Author: text between '>' and '<' following the author label.
a = find3(txt, '作者</span>', '<br', '>', '<')
print('作者:'+a)
jwrite('author', a)


# Publisher: text between '>' and '<' following the publisher label.
a = find3(txt, '出版社:</span>', '<br', '>', '<')
print('出版社:'+a)
jwrite('publisher', a)

# Publication year: plain text after the label, up to the next '<br'.
a = find2(txt, '出版年:</span>', '<br')
print('出版年:'+a)
jwrite('publish_year', a)

# Price: plain text after the label, up to the next '<br'.
a = find2(txt, '定价:</span>', '<br')
jwrite('price', a)
print('价格:'+a)
举报

相关推荐

0 条评论