- 上代码
- 实现逻辑:利用 pyautogui 模拟鼠标点击,自动完成 PDF 下载
- 代码:
#从https://onlinelibrary.wiley.com/toc/15214095/2019/31/42 自动下载文章
# author : ytouch
# date :2019.10.17
import time
from urllib.parse import urljoin

import pyautogui
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# URL definitions used by the script
DOWNLOAD_URL = 'https://onlinelibrary.wiley.com/toc/15214095/2019/31/42'# table-of-contents page that is fetched and scanned for PDF links
SPICE_URL = 'https://onlinelibrary.wiley.com' # site prefix that the scraped hrefs are joined onto
TEST_URL = 'https://onlinelibrary.wiley.com/doi/epdf/10.1002/adma.201970296' # sample ePDF URL for testing the download; not referenced by the code visible here
def downloadFile(name, url):
    """Stream the resource at ``url`` into the local file ``name``.

    While downloading, a progress line is printed whenever more than two
    seconds have passed since the previous report. The line shows the
    percentage completed when the server sent a usable ``Content-Length``
    header, otherwise the number of megabytes received so far, followed by
    the average speed over the reporting interval.

    :param name: path of the file to write (opened in binary mode, truncated)
    :param url: download link
    :return: None
    :raises requests.HTTPError: if the server answers with an error status,
        so an error page is never saved as if it were the requested file
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # Both the response and the file are context-managed so the connection
    # and the file handle are released even if the transfer fails midway.
    with requests.get(url, stream=True, headers=headers) as r:
        r.raise_for_status()
        # Content-Length is optional (e.g. chunked transfer) and may be
        # malformed; 0 is used to mean "total size unknown".
        try:
            length = float(r.headers.get('content-length', 0))
        except ValueError:
            length = 0.0
        count = 0
        count_tmp = 0
        time1 = time.time()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)
                elapsed = time.time() - time1
                if elapsed > 2:
                    # Divide by the real elapsed time rather than a fixed 2 s,
                    # since a slow chunk can stretch the interval.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    count_tmp = count
                    if length > 0:
                        progress = formatFloat(count / length * 100) + '%'
                    else:
                        progress = formatFloat(count / 1024 / 1024) + 'M'
                    print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                    time1 = time.time()
def formatFloat(num):
    """Return ``num`` rendered as a string with exactly two decimal places."""
    return format(num, '.2f')
def autoDownload(url, driver_path=r'E:\chromedriver.exe', click_pos=(1857, 132), wait_seconds=10):
    """Open ``url`` in Chrome and trigger the PDF download by a simulated click.

    The download is started by double-clicking a fixed screen position with
    pyautogui, so the coordinates depend on the screen resolution and window
    layout of the machine running the script; adjust ``click_pos`` to match.
    The original author notes that the PDF is saved to the browser's
    download directory.

    The browser window is intentionally left open: closing it right after
    the click could interrupt the download, so it is closed by hand.

    :param url: link of the PDF page to open
    :param driver_path: filesystem path of the chromedriver executable
    :param click_pos: ``(x, y)`` screen coordinates of the download button
    :param wait_seconds: seconds to wait after navigation before clicking
    :return: None
    """
    # NOTE(review): ``executable_path`` is the Selenium 3 style of passing the
    # driver location; confirm against the installed Selenium version.
    chrome_browser = webdriver.Chrome(executable_path=driver_path)
    chrome_browser.maximize_window()  # maximised so the click position is stable
    chrome_browser.get(url)
    time.sleep(wait_seconds)  # wait before clicking so the page can finish loading
    click_x, click_y = click_pos
    pyautogui.click(click_x, click_y, clicks=2, interval=0, button='left')
# Fetch the table-of-contents page, collect every anchor titled "PDF" and
# open each ePDF link in the browser so the download can be triggered.
r = requests.get(DOWNLOAD_URL)
# Fail loudly on an HTTP error instead of parsing an error page and
# silently finding no links.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')
pdf_list_a = soup.find_all('a', title='PDF')
for pdf_url in pdf_list_a:
    pdf_str = pdf_url.get('href')
    # Anchors without an href yield None; only ePDF viewer links are wanted.
    if not pdf_str or 'epdf' not in pdf_str:
        continue
    # urljoin copes with both site-relative and absolute hrefs.
    pdf_download_url = urljoin(SPICE_URL, pdf_str)
    autoDownload(pdf_download_url)  # open the page and trigger the download