查询URL:
https://myanimelist.net/search/all?q=
功能:
仅爬取动漫名对应的图片链接
未解决bug:
- 机器人检测:网站会不定时触发机器人检测,触发后约1-3分钟内禁止访问(不封IP);目前采用time.sleep等待后重试的方式规避
- 重试后仍未获取到结果的条目不会继续轮询查找,而是直接以空字符串填充并跳过;因为数据量较小,以后再改进
代码
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
import requests
from bs4 import BeautifulSoup
import csv
import time
# Request headers shared by every HTTP call in this script; the User-Agent
# is a desktop Chrome string so requests look like they come from a browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.107 Safari/537.36'
    ),
}
def getAnimeName():
    """Read the anime names to look up from ``./name_empty.csv``.

    Each line of the file is treated as one name; surrounding whitespace
    (including the trailing newline) is stripped. Blank lines are kept as
    empty strings so that positions stay aligned with the input file.

    Returns:
        list[str]: the stripped lines, in file order.
    """
    with open("./name_empty.csv", 'r') as source:
        # Iterating the file object yields the same lines as readlines().
        return [raw_line.strip() for raw_line in source]
def _fetchSearchPage(query, timeout=30):
    """Download the MyAnimeList search-results page for *query*.

    The query is passed through ``params`` so that requests URL-encodes it;
    characters such as ``&``, ``#`` or ``+`` in a name would otherwise
    corrupt the query string.

    Returns:
        bytes: the raw response body, or ``b""`` when the request fails
        (connection error, timeout, ...), which the parser treats as
        "no result".
    """
    try:
        response = requests.get(
            'https://myanimelist.net/search/all',
            params={'q': query},
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException:
        return b""
    return response.content


def getLink(retry_wait=60, max_retries=1):
    """Look up the cover-image link for every name from ``getAnimeName()``.

    For each name the search page is downloaded and handed to
    ``getAnimeInfo``. When that yields no link, the function waits
    ``retry_wait`` seconds and downloads the search page *again* before
    re-parsing, up to ``max_retries`` times. Re-downloading matters: if the
    first response was a block page, re-parsing it can never succeed.

    Args:
        retry_wait: seconds to sleep before each retry (default 60).
        max_retries: number of retries after the first attempt (default 1).

    Returns:
        tuple[list[str], list[str]]: ``(nameList, img_links_list)`` with
        matching indices; a name whose link could not be found gets ``""``.
    """
    nameList = getAnimeName()
    # Image links, index-aligned with nameList.
    img_links_list = []
    for i, name in enumerate(nameList):
        if not name:
            # A blank input line has nothing to search for; keep the slot
            # empty instead of searching for the placeholder text alone.
            res = ""
        else:
            # The site rejects very short keywords ("Your keyword is too
            # short"), so pad short names with placeholder characters.
            query = (name + "---") if len(name) <= 2 else name
            res = getAnimeInfo(_fetchSearchPage(query))
            for _ in range(max_retries):
                if res != "":
                    break
                # Announce the pause before sleeping so the user knows why
                # the script appears to hang.
                print("休息一会------------")
                time.sleep(retry_wait)
                res = getAnimeInfo(_fetchSearchPage(query))
        print(i, " ", res)
        img_links_list.append(res)
    return nameList, img_links_list
def getAnimeInfo(html):
    """Extract the cover-image link of the first search result in *html*.

    Steps:
      1. Parse the search-results page and take the first element whose
         class is ``information di-tc va-t pl8``.
      2. Follow the first ``<a>`` inside it to the anime's detail page.
      3. On the detail page, take the first ``<img>`` inside the first
         ``<td class="borderClass">`` and return its ``data-src`` attribute.

    Args:
        html: raw markup (str or bytes) of a search-results page.

    Returns:
        str: the image URL, or ``""`` when any expected element or attribute
        is missing or the detail-page request fails. Other errors (for
        example a missing ``lxml`` parser) are deliberately not swallowed,
        so that a broken setup is not mistaken for "no result".
    """
    soup = BeautifulSoup(html, 'lxml')
    result_cell = soup.find(attrs={'class': 'information di-tc va-t pl8'})
    if result_cell is None:
        return ""
    anchor = result_cell.find('a')
    if anchor is None:
        return ""
    detail_url = anchor.get('href')
    if not detail_url:
        return ""

    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        detail_html = requests.get(detail_url, headers=headers, timeout=30).content
    except requests.RequestException:
        return ""

    detail_soup = BeautifulSoup(detail_html, 'lxml')
    side_cell = detail_soup.find('td', attrs={'class': 'borderClass'})
    if side_cell is None:
        return ""
    image_tag = side_cell.find('img')
    if image_tag is None:
        return ""
    # The image URL is read from data-src, not src.
    return image_tag.get('data-src', "")
def animeOutPut():
    """Fetch all image links and append ``name,link`` rows to ``./animeLink1.csv``.

    Rows are written with ``csv.writer`` so that names containing commas or
    double quotes are quoted correctly instead of producing extra columns.
    The file is opened in append mode, so repeated runs add to existing
    content.
    """
    nameList, img_links_list = getLink()
    with open('./animeLink1.csv', 'a') as wrifile:
        # lineterminator='\n' together with the default text-mode newline
        # handling keeps line endings the same as the previous plain
        # write(... + '\n') calls. Fields are stripped single-line names and
        # URLs, so no embedded newlines can occur.
        writer = csv.writer(wrifile, lineterminator='\n')
        writer.writerows(zip(nameList, img_links_list))
# Script entry point: scrape the image links and write the CSV only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    animeOutPut()