MyAnimeList爬取图片链接-CFANZ编程社区

查询URL：

https://myanimelist.net/search/all?q=

功能：

仅爬取动漫名对应的图片链接

未解决bug：

机器人检测：网站会不定时触发机器人检测，触发后约 1-3 分钟内禁止访问（不封锁 IP）；目前采用 time.sleep 等待一段时间后再重试的方式规避
未采用轮询方式反复查询失败的条目：查询不到时直接填充空值（null）并跳过。因为数据量较小，留待以后再改进

代码

# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
import requests
from bs4 import BeautifulSoup
import csv
import time
# Request headers passed to every requests.get() call in this script; the
# User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.107 Safari/537.36'
    ),
}

def getAnimeName():
    """Read the anime titles to look up from ``./name_empty.csv``.

    Each line of the file is treated as one title.

    Returns:
        A list with one string per line, in file order, with leading and
        trailing whitespace removed. Blank lines are kept as empty strings.
    """
    with open("./name_empty.csv", 'r') as source:
        # strip() already removes the trailing newline of each line.
        return [rawLine.strip() for rawLine in source]

def _fetchImageLink(name, timeout):
    """Run one MyAnimeList search for *name* and return its image link.

    Returns "" when the search request fails or getAnimeInfo() finds no link.
    """
    try:
        # Passing the keyword through ``params`` lets requests percent-encode
        # it, so titles containing '&', '#', '+' or '?' reach the server intact
        # instead of being cut off or misread as URL syntax.
        response = requests.get(
            'https://myanimelist.net/search/all',
            params={'q': name},
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException:
        # A single network failure must not abort the whole run: results are
        # only written to disk after every title has been processed.
        return ""
    return getAnimeInfo(response.content)


def getLink(retry_delay=60, max_retries=1, timeout=30):
    """Look up the image link for every title returned by getAnimeName().

    When a lookup yields no link, the function waits ``retry_delay`` seconds
    and performs the whole lookup again, up to ``max_retries`` times. The
    search page is fetched afresh on every retry, so a response that was
    blocked the first time is not simply parsed again.

    Args:
        retry_delay: Seconds to sleep before each retry.
        max_retries: Maximum number of retries per title after the first try.
        timeout: Timeout in seconds for the search request.

    Returns:
        A tuple ``(nameList, img_links_list)``: the titles as read from the
        input file and, at the same index, the image link found for each one
        ("" when none was found).
    """
    nameList = getAnimeName()
    # Image links, index-aligned with nameList.
    img_links_list = []

    for i, name in enumerate(nameList):
        # The site answers "Your keyword is too short." for very short
        # keywords, so pad them with placeholder characters. Only the query is
        # padded; the title stored in nameList is left untouched.
        if len(name) <= 2:
            name += "---"

        res = _fetchImageLink(name, timeout)
        for _ in range(max_retries):
            if res != "":
                break
            print("休息一会------------")
            time.sleep(retry_delay)
            res = _fetchImageLink(name, timeout)
        print(i, "   ", res)

        img_links_list.append(res)

    return nameList, img_links_list

def getAnimeInfo(html, timeout=30):
    """Extract an image link starting from a MyAnimeList search-result page.

    The function takes the first element whose class attribute is
    ``information di-tc va-t pl8`` (presumably the first search hit -- verify
    against the live page), follows the first link inside it, and on that page
    reads the ``data-src`` attribute of the first ``<img>`` inside the first
    ``<td class="borderClass">``.

    Args:
        html: Markup of the search-result page (as accepted by BeautifulSoup).
        timeout: Timeout in seconds for the request that fetches the linked
            page; without one a stalled connection would block forever.

    Returns:
        The image link as a string, or "" when any expected element or
        attribute is missing or the linked page cannot be fetched.
    """
    soup = BeautifulSoup(html, 'lxml')
    resultBox = soup.find(attrs={'class': 'information di-tc va-t pl8'})
    if resultBox is None:
        return ""

    detailAnchor = resultBox.find('a')
    if detailAnchor is None:
        return ""
    detailUrl = detailAnchor.get('href')
    if not detailUrl:
        return ""

    try:
        detailHtml = requests.get(detailUrl, headers=headers, timeout=timeout).content
    except requests.RequestException:
        # Covers connection errors, timeouts and malformed/relative URLs;
        # treated the same as "nothing found" so the caller can retry later.
        return ""

    detailSoup = BeautifulSoup(detailHtml, 'lxml')
    imageCell = detailSoup.find('td', attrs={'class': 'borderClass'})
    if imageCell is None:
        return ""
    imageTag = imageCell.find('img')
    if imageTag is None:
        return ""

    # Only the data-src attribute is read; an <img> without it counts as
    # "no link".
    return imageTag.get('data-src', "")
def animeOutPut():
    """Collect all image links via getLink() and append them to a CSV file.

    One ``title,link`` row per title is appended to ``./animeLink1.csv``.
    Rows are written with the csv module so that a title containing a comma
    or a double quote is quoted and still occupies a single column.
    """
    nameList, img_links_list = getLink()
    with open('./animeLink1.csv', 'a') as wrifile:
        # lineterminator='\n' together with the default text-mode newline
        # translation keeps the same line endings the file had before, so
        # appending to an existing output file does not mix line endings.
        writer = csv.writer(wrifile, lineterminator='\n')
        writer.writerows(zip(nameList, img_links_list))
            
# Script entry point: run the scrape-and-export routine only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    animeOutPut()