A complete 1688 crawler framework, with source code

萨科潘 · 2022-04-04
python

This is the Alibaba (1688) crawler I built a while ago. Given a search keyword, it uses selenium to scrape a chosen number of result pages of product information: company name, the five individual scores, the composite score, the price, every gallery image of the listing, and the product specifications (sizes are not handled yet). Known weakness: 1688's captcha is brutal; you either switch IP and refresh the page or switch accounts, and that part is still manual. IP switching is already implemented but commented out because I have not found a suitable IP pool; if you need it, uncomment the relevant code and edit ip.txt. Proxies connect directly over http/https as address + port. Under heavy scraping a captcha typically appears about once every ten products; the current workaround is to switch IP on the machine by hand and refresh the page, which is tolerable for small volumes.
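For reference, the idea behind the commented-out IP-switching code is simply to pick a host:port line from ip.txt and hand it to Chrome as a proxy before the driver starts. The sketch below is only a minimal illustration of that approach, not the original implementation; the pick_proxy / create_proxied_driver names and the one-entry-per-line ip.txt format are assumptions.

import random
from seleniumwire import webdriver

def pick_proxy(path="ip.txt"):
    # Assumption: each line of ip.txt holds one "host:port" entry
    with open(path, "r", encoding="utf-8") as fp:
        proxies = [line.strip() for line in fp if line.strip()]
    return random.choice(proxies)

def create_proxied_driver():
    proxy = pick_proxy()
    chrome_options = webdriver.ChromeOptions()
    # Route http/https traffic through the chosen proxy
    chrome_options.add_argument(f"--proxy-server=http://{proxy}")
    return webdriver.Chrome(options=chrome_options)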

I am uploading this to give something back to the community~

""" 

项目概述: 爬取1688关键词名称和对应的产品数量,单页60个产品的数据。 对于每个产品: A.标题(以及红标题)和链接 B.五项评分评分和综合评分 C.复购率 D.成交额 E.价格 F.企业名称 G.页面链接 """ """ author:wes; updatetime:2022.04.22 第三版更新说明: 优化了程序,使之能完整的运行 未来可以继续优化的步骤:效率高于扫码登录的更优方式,ip验证问题(遇到ip验证需要人力解决)

""" """ 第二版更新说明: 修改了退换体验分数为空时存表为-1的错误 修改了成交额显示错误的问题 美化了下代码

"""

1. Project name: keyword product-information crawler for 1688.com

2. Requirements analysis

A. Parse the ajax requests behind the product listing page and store the results in {keyword}_{sort_type}.csv (the crawler script regenerates these files automatically on every run)

3. Main code

4. Other notes: A. Simply run the main file

5. Testing: cookies expire easily, so rotating through multiple accounts is a future option; for now logging in is still manual (a rough cookie-reuse sketch follows)
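Since the code below already saves cookies to cookies.txt (see save_cookies / get_cookies), one way to reuse a session instead of logging in every time is to replay those cookies into a fresh driver. This is only a sketch under the assumption that the saved session is still valid; the load_cookies helper is not part of the original code, and it stops working as soon as 1688 invalidates the cookies.

import json

def load_cookies(driver, path="cookies.txt"):
    # A 1688 page must be open before add_cookie will accept cookies for that domain
    driver.get("https://www.1688.com/")
    with open(path, "r") as fp:
        for cookie in json.load(fp):
            # Some Selenium/Chrome versions reject the saved 'expiry' field, so drop it
            cookie.pop("expiry", None)
            driver.add_cookie(cookie)
    driver.refresh()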

The code below is the search-page crawler; the detail-page crawler is not included here yet — contact me if you need it.

# -*- coding: utf-8 -*-
"""
@Time : 2022/3/21 18:28
@Auth : wes
@File :1688_goods_spider.py
@IDE :PyCharm
@email:2934218525@qq.com
"""
import time
import json
import requests
import re
from utility import generate_headers
import urllib.parse
import csv
from jsonsearch import JsonSearch
from seleniumwire import webdriver
from tkinter import messagebox
from browsermobproxy import Server
from requests.cookies import RequestsCookieJar
from selenium.webdriver import ActionChains
def save_cookies():
    # Grab the current browser cookies and dump them to disk
    cookies = driver.get_cookies()
    with open("cookies.txt", "w") as fp:
        json.dump(cookies, fp)
def create_driver(show: bool = True):
    """
    Create the browser driver.
    :param show: whether to open a visible browser window
    :return: the webdriver instance
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2
        },
        'profile.password_manager_enabled': False,
        'credentials_enable_service': False
    }
    chrome_options.add_experimental_option('prefs', prefs)
    # chrome_options.binary_location = "C:\Program Files\Google\Chrome\Application\chrome.exe"
    if not show:
        chrome_options.add_argument("--headless")
    # Developer-mode style tweaks so the automation is harder to detect
    # Reference: https://blog.csdn.net/dslkfajoaijfdoj/article/details/109146051
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option('w3c', False)
    caps = {
        'browserName': 'chrome',
        'loggingPrefs': {
            'browser': 'ALL',
            'driver': 'ALL',
            'performance': 'ALL',
        },
        'goog:chromeOptions': {
            'perfLoggingPrefs': {
                'enableNetwork': True,
            },
            'w3c': False,
        },
    }
    driver = webdriver.Chrome(desired_capabilities=caps, options=chrome_options)
    # Run a CDP command so navigator.webdriver is undefined before any page script executes
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
    })
    return driver
def find_element_exists(xpath_, flag=0, error_warning=""):
    """
    flag == 0 (default): keep retrying until the element exists
    flag == 1: check once whether the element exists
    :param xpath_: XPath of the element
    :param flag: int
    :return: the element, or False when flag == 1 and it is not found
    """
    if flag == 0:
        while True:
            try:
                ele = driver.find_element_by_xpath(xpath_)
                return ele
            except Exception as exc:
                print(exc)
                print(error_warning)
                time.sleep(1)
    else:
        try:
            ele = driver.find_element_by_xpath(xpath_)
            return ele
        except Exception:
            return False
def get_goods_json(url):
    while True:
        try:
            driver.get(url)
            # Wait for the search box to appear; nothing else runs until the page has loaded
            find_element_exists('//*[@id="alisearch-input"]', error_warning="请手动跳过验证码进行下一步操作")
            # Scroll to the bottom with JS to mimic a human and trigger the ajax requests
            js = "var q=document.documentElement.scrollTop=10000"
            time.sleep(1)
            driver.execute_script(js)
            time.sleep(2)
            driver.execute_script(js)
            # Wait for the ajax calls to finish, otherwise reading the log raises errors
            time.sleep(5)
            # This part is rendered with the page itself (no ajax) and holds the first 20 offers
            content = '{"code":200,"data":{"asyncConfig":{"async":false,"asyncCount":20,"asyncUrl"' + driver.page_source.split(
                'window.data.offerresultData = successDataCheck({"code":200,"data":{"asyncConfig":{"async":false,"asyncCount":20,"asyncUrl"')[
                -1]
            check_num = re.findall('"ok","time":(.*?)}', content)[0]
            content = content.split('"ok","time":' + str(check_num) + '});')[0] + '"ok","time":' + str(check_num) + '}'
            # Collect the two scroll-triggered ajax responses from selenium's built-in performance (network) log
            two_ajax = []
            excluded = []
            # Walk every HTTP entry in the performance log and pick out the offer-list ajax requests
            for entry in driver.get_log('performance'):
                si = json.loads(entry['message'])
                si = JsonSearch(object=si, mode='j')
                try:
                    request_id = si.search_all_value(key='requestId')[-1]
                except Exception:
                    continue
                try:
                    # Skip log entries that do not carry a URL
                    log_url = si.search_all_value(key='url')[-1]
                except Exception:
                    continue
                # Only the offer-list ajax requests are of interest
                if "asyncCount=20&pageSize=60" not in log_url:
                    continue
                try:
                    # Fetch the response body for this requestId through CDP
                    session_infor = driver.execute("executeCdpCommand",
                                                   {'cmd': 'Network.getResponseBody',
                                                    'params': {'requestId': request_id}})['value']
                    true = True
                    false = False
                    ajax = JsonSearch(object=eval(str(session_infor['body'])), mode='j')
                    ori_ajax = str(session_infor['body'])
                    index = ajax.search_all_value(key="startIndex")[0]
                    # Deduplicate by startIndex: the same request shows up several times in the log
                    if index not in excluded:
                        excluded.append(index)
                        two_ajax.append(ori_ajax)
                    else:
                        continue
                    print(index)
                except Exception as exc:
                    print(exc)
            return [content, two_ajax[0], two_ajax[1]]
        except Exception:
            input("请手动跳过验证码之后按回车键进行下一步操作")
            # Reload the page when reading the ajax from the log fails, usually when a captcha showed up
            continue
def login(choice):
    global driver, headers
    driver.get('https://login.taobao.com/?https://login.taobao.com/?redirect_url=https%3A%2F%2Flogin.1688.com%2Fmember%2Fjump.htm%3Ftarget%3Dhttps%253A%252F%252Flogin.1688.com%252Fmember%252FmarketSigninJump.htm%253FDone%253D%25252F%25252Fdetail.1688.com%25252Foffer%25252F564418418427.html&style=tao_custom&from=1688web')
    if choice.strip() == "1":
        # Type in the account name
        user = input("请输入用户名")
        find_element_exists('/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form/div[1]/div[2]/input').send_keys(user)
        time.sleep(1)
        # Type in the password
        password = input("请输入密码")
        find_element_exists('/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form/div[2]/div[2]/input').send_keys(password)
        # Click the login button
        find_element_exists('/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form/div[4]/button').click()
    else:
        input("请在扫码后按回车键继续")
    # Abandoned for now: the 1688 slider captcha cannot be beaten here, so switching IPs is the workaround
    # if "请按住滑块,拖动到最右边" in driver.page_source:
    #     # image element used as the anchor for the offset click
    #     ele = find_element_exists('//*[@id="bg-img"]', flag=1)
    #     if ele:
    #         ActionChains(driver).move_to_element_with_offset(ele, -46, -332).context_click().perform()
    #         input('check001')
    # Cookie persistence, not used yet but kept as a fallback
    save_cookies()
def get_cookies():
    cookies_dict = dict()
    with open("cookies.txt", "r") as fp:
        cookies = json.load(fp)
        for cookie in cookies:
            cookies_dict[cookie['name']] = cookie['value']
    return cookies_dict
choice = input("请输入登录方式:\n\t1.账号密码自动登录\n\t2.自助扫码登录\n\n")
driver = create_driver()
login(choice)
for sort_type in ["综合搜索", "成交量搜索"]:
    keyword = "自行车配件"
    # Open the output file up front so rows can be written as soon as they are parsed
    file = open(f"{keyword}_{sort_type}.csv", "w", newline="", encoding="utf-8")
    csv_writer = csv.writer(file)
    head = ['搜索关键词', 'id', '价格', '标题', '加红标题', '成交额', '复购率', '搜索页面链接', '商品链接', '企业名称', 'consultantscores', 'returnscore', 'quality_experiences', 'disputescores', 'expressscores', '综合服务星级']
    csv_writer.writerow(head)
    uncode_keyword = urllib.parse.quote(keyword.encode('gb2312'))
    for page_num in [1, 2]:
        if sort_type == "综合搜索":
            url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={uncode_keyword}&beginPage={page_num}#sm-filtbar"
        else:
            # Offer list sorted by transaction volume
            url = f"https://s.1688.com/selloffer/offer_search.htm?&keywords={uncode_keyword}&beginPage={page_num}&sortType=va_rmdarkgmv30&descendOrder=true&uniqfield=userid&n=y#sm-filtbar"
        print(f"page: {page_num}")
        # Three raw JSON strings come back: the first is embedded in the page, the other two were fetched via ajax
        res_3 = get_goods_json(url)
        # Backtracking index
        num1 = -1
        # Backtracking flag, 0 by default; 1 means the batch has to be re-fetched
        num1_flag = 0
        for res_ in res_3:
            num1 += 1
            false = False
            true = True
            null = None
            res_json = eval(res_)
            # The 60 offers of a page arrive in three batches of 20
            jsondata = JsonSearch(object=res_json, mode='j')
            # Pull the offer list (20 items per batch) out of the JSON
            while True:
                try:
                    # Extract the part of the JSON that is actually needed
                    all = jsondata.search_all_value(key='offerList')[-1]
                    break
                except Exception:
                    # A captcha appeared while requesting the next link; solve it by hand, then press Enter
                    driver.refresh()
                    input("请手动通过验证码成功后按回车")
                    res_3 = get_goods_json(url)
                    num1 -= 1
                    num1_flag = 1
                    break
            if num1_flag:
                continue
            # Company names
            companies = jsondata.search_all_value(key='hoverName')
            # Prices
            prices = jsondata.search_all_value(key='price')
            # Offer titles
            titles = jsondata.search_all_value(key='simpleSubject')
            # Raw offer titles that still contain the red highlight markup
            red_titles_origins = jsondata.search_all_value(key='subject')
            # Consultation score
            consultantscores = jsondata.search_all_value(key='consultationScore')
            # Logistics score
            expressscores = jsondata.search_all_value(key='logisticsScore')
            # Quality score
            quality_experiences = jsondata.search_all_value(key='goodsScore')
            # Dispute score
            disputescores = jsondata.search_all_value(key='disputeScore')
            # Composite score
            totalscores = jsondata.search_all_value(key='compositeNewScore')
            # Flag telling whether the repurchase rate is shown
            ifres = jsondata.search_all_value(key='showShopRepurchaseRate')
            red_titles = []
            # Walk the raw titles and pull the text out of the red <font> tags
            for i in red_titles_origins:
                reds = re.findall("<font color=red>(.*?)</font>", i)
                if len(reds) != 0:
                    red_titles.append(" ".join(reds))
                else:
                    red_titles.append(' ')
            for k in range(len(titles)):
                # Some values are missing (e.g. for in-page ads) and are marked as empty
                rePurchaseRate = all[k]['company']["shopRepurchaseRate"]
                try:
                    regCapitalUnit = all[k]["tradeQuantity"]["gmvValueFuzzify"]["integer"]
                except Exception:
                    regCapitalUnit = "无"
                try:
                    id = all[k]["id"]
                except Exception:
                    id = "无"
                if id != "无":
                    self_url = f"https://detail.1688.com/offer/{id}.html"
                else:
                    self_url = "无"
                try:
                    returnscore = all[k]["tradeService"]["returnScore"]
                except Exception:
                    returnscore = "无"
                if str(rePurchaseRate) == "0.0":
                    rePurchaseRate = "无"
                if str(returnscore) == "-1.0":
                    returnscore = "无"
                if sort_type == "综合搜索":
                    dad_url = f"https://s.1688.com/selloffer/offer_search.htm?spm=a26352.13672862.filtbar.9.27ed5860tLwp6H&keywords={uncode_keyword}&beginPage=1&sortType=normal&descendOrder=true&uniqfield=userid&n=y#sm-filtbar"
                else:
                    dad_url = f"https://s.1688.com/selloffer/offer_search.htm?spm=a26352.13672862.filtbar.9.27ed5860tLwp6H&keywords={uncode_keyword}&beginPage=1&sortType=va_rmdarkgmv30&descendOrder=true&uniqfield=userid&n=y#sm-filtbar"
                if regCapitalUnit == "":
                    regCapitalUnit = "无"
                if rePurchaseRate == "":
                    rePurchaseRate = "无"
                # Deliberate index accesses: fail fast here if any of the lists is shorter than expected
                prices[k]
                titles[k]
                red_titles[k]
                companies[k]
                consultantscores[k]
                quality_experiences[k]
                disputescores[k]
                expressscores[k]
                totalscores[k]
                print(ifres[k])
                # Row that goes into the csv
                row = [keyword, id, prices[k], titles[k], red_titles[k], regCapitalUnit, rePurchaseRate, dad_url, self_url, companies[k], consultantscores[k], returnscore, quality_experiences[k], disputescores[k], expressscores[k], totalscores[k]]
                print(row)
                # Write the row; the loop is a simple guard against the file being opened in another program
                while True:
                    try:
                        csv_writer.writerow(row)
                        break
                    except Exception:
                        print("程序运行期间不可打开特定写入表格,请及时关闭等待程序运行.")
    # Done with this sort type; close the file before moving on
    file.close()

Configuration: omitted. Dependencies: there is a requirements.txt in the project directory; open a terminal (cmd) in that directory and run pip install -r requirements.txt.
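The requirements.txt itself is not reproduced here. Judging from the imports, it would need to contain at least something like the following (package names are inferred from the import statements, versions left unpinned on purpose), plus the local utility.py module that provides generate_headers:

requests
selenium-wire
jsonsearch
browsermob-proxy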

This covers the search page; the detail-page crawler is also done — message me if you need it.

This code is for personal reference only and for discussing better approaches together.
