0
点赞
收藏
分享

微信扫一扫

day26学习总结与作业

金牛豆豆 2022-03-21 阅读 57
爬虫

day26学习总结与作业

import csv

import requests
from bs4 import BeautifulSoup
import re
# Scrape one page of used-car listings from the Guazi JSON API and save the
# title, license date, mileage, down payment and full price to a CSV file.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
url = 'https://mapi.guazi.com/car-source/carList/pcList?minor=benz&sourceType=&ec_buy_car_list_ab=&location_city=&district_id=&tag=-1&license_date=&auto_type=&driving_type=&gearbox=&road_haul=&air_displacement=&emission=&car_color=&guobie=&bright_spot_config=&seat=&fuel_type=&order=&priceRange=0,-1&tag_types=&diff_city=&intention_options=&initialPriceRange=&monthlyPriceRange=&transfer_num=&car_year=&carid_qigangshu=&carid_jinqixingshi=&cheliangjibie=&page=1&pageSize=20&city_filter=12&city=12&guazi_city=12&qpres=&versionId=0.0.0.0&osv=Unknown&platfromSource=wap'

# Maps the hex form of a numeric character reference (e.g. '&#57808' ->
# '0xe1d0') to the digit it stands for.
# NOTE(review): presumably these are private-use glyphs of an anti-scraping
# web font, so the mapping may change over time -- confirm before reuse.
table = {
    '0xe1d0': '7', '0xe325': '4', '0xe41d': '1', '0xe52e': '9', '0xe630': '2', '0xe76e': '8',
    '0xe891': '5', '0xe9ce': '0', '0xeaf2': '3', '0xec4c': '6', '0xf88a': '7'
}


def _decode_obfuscated(text):
    """Return *text* with every ``&#NNNNN;`` entity replaced by its digit.

    The text is split on ``;``. A segment starting with ``&#`` is looked up
    in ``table``; a segment starting with ``.&#`` keeps its leading decimal
    point and has the rest looked up; any other segment (such as a trailing
    unit) is kept unchanged.

    Raises:
        KeyError: if an entity's code point is missing from ``table``.
    """
    pieces = []
    for segment in text.split(';'):
        if segment.startswith('&#'):
            pieces.append(table[hex(int(segment[2:]))])
        elif segment.startswith('.&#'):
            pieces.append('.' + table[hex(int(segment[3:]))])
        else:
            pieces.append(segment)
    return ''.join(pieces)


response = requests.get(url, headers=headers)
# The listings are read from the 'postList' array under 'data'.
result0 = response.json()['data']['postList']

red = []
for x in result0:
    title = x['title']
    license_date = x['license_date']
    new_road_haul = _decode_obfuscated(x['road_haul'])
    # An empty down-payment field is replaced by a fixed placeholder text.
    new_first_pay = _decode_obfuscated(x['first_pay']) or '不支持首付'
    new_buy_out_price = _decode_obfuscated(x['price'])
    red.append([title, license_date, new_road_haul, new_first_pay, new_buy_out_price])

# Write under a context manager so the file is flushed and closed on exit.
with open('files/ershouche.csv', 'w', encoding='utf-8', newline='') as csv_file:
    result1 = csv.writer(csv_file)
    result1.writerow(['标题', '年限', '里程', '首付', '一次付'])
    result1.writerows(red)

学习总结

import requests


def get_html(url):
    """Fetch *url* through a fixed HTTPS proxy, print the body and return it.

    A GET request is sent with a desktop-Chrome ``user-agent`` header. The
    response text is written to stdout and also returned to the caller.
    """
    # Proxies are supplied through the ``proxies`` argument in the form
    # {'https': 'ip:port'}.
    # Option 1 (active): a fixed proxy address.
    proxy_map = {'https': '36.25.226.139:4513'}
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    reply = requests.get(url, headers=browser_headers, proxies=proxy_map)

    # Option 2 (disabled): request a fresh proxy address from the provider's
    # endpoint first, then use it for the real request.
    # ip = requests.get('http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=&city=0&yys=0&port=11&time=4&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=').text.strip()
    # reply = requests.get(url, headers=browser_headers, proxies={'https': ip})
    # Without any proxy:
    # reply = requests.get(url, headers=browser_headers)

    body = reply.text
    print(body)
    return body


if __name__ == "__main__":
    # Fetch the Douban Top 250 page once when run as a script.
    url = "https://movie.douban.com/top250"
    get_html(url)
    # Looping variant, left disabled:
    # while True:
    #     url = 'https://movie.douban.com/top250'
    #     get_html(url)
from selenium.webdriver import Chrome, ChromeOptions

# Open the Douban Top 250 page in Chrome with traffic routed through a proxy.
options = ChromeOptions()

# Experimental options: drop the 'enable-automation' switch and set the image
# content-setting preference to 2.
# NOTE(review): presumably value 2 blocks image loading -- confirm.
for option_name, option_value in (
    ('excludeSwitches', ['enable-automation']),
    ("prefs", {"profile.managed_default_content_settings.images": 2}),
):
    options.add_experimental_option(option_name, option_value)

# 1. Use a proxy.
# Format: --proxy-server=http://IP:port   (the IP/port of an https proxy)
proxy_argument = '--proxy-server=http://122.6.202.214:4510'
options.add_argument(proxy_argument)

b = Chrome(options=options)
b.get('https://movie.douban.com/top250')
import requests

# Steps for "automatic login" with requests (re-using a browser session):
#   1. Open the site in Chrome, log in manually, then refresh the page.
#   2. Open DevTools on that page; under Network > All, find the request for
#      the current page and copy the cookie value from its Request Headers.
#   3. When sending the request with requests, add that value to ``headers``
#      under the 'cookie' key.
#
# NOTE(security): the cookie below is a session credential hard-coded in
# source. Do not commit real cookies; consider loading the value from an
# environment variable or a local file kept out of version control.
headers = {
    # The cookie is written as two adjacent string literals, which Python
    # joins into one value; a single-quoted literal cannot span two lines.
    'cookie': (
        '_zap=b7912ec1-8e24-4d72-81cd-8fa2a2b5f78f; d_c0="AECfz77blxSPTv9pswdcr3xSHE0eCEMj0Dc=|1646568427"; _xsrf=qy8EtuCtpOGgIUmA3g0qSi2edFqOUIBw; __snaker__id=kvuiOSuliPGkosHY; _9755xjdesxxd_=32; YD00517437729195%3AWM_TID=pJYUp8Detk5AUUUUFFc6upRtNshfmnQX; q_c1=51a6e1898c4d46a594865b3db1dd3e95|1647227093000|1647227093000; NOT_UNREGISTER_WAITING=1; gdxidpyhxdE=q6eDxuI%5CS11auZ9%2Be%5C%2BxIx%2F2cg96ULLZchHGSiL8EQvgYs9OmbeiyhhCXoa%5C%2BblJQfD%5CayzzA8oosyB%2FhPPMm7%2Fkd8W5prCdSYuppYUL5qdoyPdjsHcgA8pZgBquceXtX6di5Mu46C7dEKcHakVA7mxxjMZh%2Bre5j%2F4AQta4bulZ39y%5C%3A1647834647437; YD00517437729195%3AWM_NI=oe9bRvHOQTEjCNug5CHPzre%2BCdBGZr6dru1M9KaaTCyY5hZsiM2d%2FSXjfLKsl91VYoTN6x%2Fvc%2FWYtZWgHtA%2BkNpQdcQgt86C%2F1Vffl9dc8gqk08aF1%2Bp0LL%2BSZzsecFXbUI%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee99e96f86e9ada5f97cbaac8aa3c84f878a9a85aa7afbeeac8cc97e8c989f8bb32af0fea7c3b92aadbbf783d17e8c919fb5c8628192bb88b8498c90b693e82181bc8ea8d97af6bcbe96b447fb95af82d749bb87fb89d979ab93fe85ae599cb6aaadca79b0e7ffb7dc659892fa99ae7ea1f199baf75295b98895c421a992c0b2e76993b9bcd9b549edef9a87e568fbbcadd9e67ba9f0f882c53be9b000adcf219186b694e8698688aeb6cc37e2a3; captcha_session_v2=2|1:0|10:1647833755|18:captcha_session_v2|88:Q0FUYjlIemk4MklmZlA5ZlVOWDczbGlhYjRabzMxME1Fc0N6Szl4aGVTQU5JQkdGUklNWFRBVVZtU0hYQUVvLw==|cf267f149c0b9509d0e424579194d94fc6ec1f57567afd34050dc2a47f14bacc; '
        'captcha_ticket_v2=2|1:0|10:1647833769|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfZ2pLT21aR2FUbHdXWFo2SG9wTUY1OWRZTG0tcG1vcGlfNUtrUkpzUThJSmNlV2VJdVVzdEhWNks4TVNBcVhobEZPUkZMcjUuc3N1QXpmUzBHbnIuSUNIdjlnb04tdjdoMlBydUtGSmdmV2FYUUZ2SE10dUl0RU40TzRFQS14dEhtMXJRLWg3UHdRblRMVkt1VG1kX09oekdab0pzUGxUWGFmUVF1NzRfYjc4di1QUmlQVUJMR0s5TEhvTGpDX2JUMF9jNFllNlM1SVFYSGgtMHAwV1ZMR2IwMWVqMS5FdEg1ZzZHRUZNZWMwLU8uMXk3YnY2dzlHdGNmNEE1a3E2Ry5EOWJSVlN4QjRWWmZodU1Gdk54N2lvNkZNUjJCdmw4Q254R1NUeUhtLXI1Yl9xeEZPS00wLnJONjkweW14cTQ5MHNjUzBKeHBwUE9lY05BSDZyNV9Vb3ZxZkFkR191aEQwOGc5ZUkwR3k5X005QVkub3gxOS5QdE1GanQwc1dtNERVbVZwQ3ZJV3R4V21nUS5Nd0tVTk92U3N1ZFlmcXJGMmhIWjUyTXZZUzFLc1RQaGlTNTRQWDRETElCdXQuNmU0ZUpzUFUyUzBWeUJZUXE5WFZSOEUwYzE5MEVYX2x5cEJ2MVdManBzVXJlV0t3d19GcGhTLnguVnlGMyJ9|22db975942e5d8c1799fa7c70120378eed6a34bb8bdd00afa3c5eccf92436296; z_c0=2|1:0|10:1647833781|4:z_c0|92:Mi4xaW5CWUdRQUFBQUFBUUpfUHZ0dVhGQ1lBQUFCZ0FsVk50VUFsWXdEWFYyXzVOZVRzbGJmbVg4Mk1GNF95bUYtLWd3|9d5f40b82ca91b89a903d5d50df74ed66850c2b3ecf03d86510300f5eb729d96; tst=r; SESSIONID=hbQ6zfAEKL3cjwOoewSAbkpnJKR4GtTVNCDhhjrorQ2; KLBRSID=0a401b23e8a71b70de2f4b37f5b4e379|1647833814|1647833734'
    ),
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/', headers=headers)

print(response.text)
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

import ast

from selenium.webdriver.common.by import By

# Replay a previously saved Taobao login: load the cookies stored in
# files/taobao.txt into a new browser session, then run a search.

# 1. Create the browser object and open the site.
b = Chrome()
b.get('https://www.taobao.com/')

# 2. Add the locally saved cookies.
# The file holds the text form of a Python list of cookie dicts. It is parsed
# with ast.literal_eval instead of eval, so the file contents can only yield
# literal values and are never executed as code. The file is read under a
# context manager so its handle is closed.
with open('files/taobao.txt', encoding='utf-8') as cookie_file:
    cookie_list = ast.literal_eval(cookie_file.read())
for cookie in cookie_list:
    b.add_cookie(cookie)

# 3. Open the page again, now with the cookies attached to the session.
b.get('https://www.taobao.com/')

# 4. Carry on with further actions: type a query into the element with id 'q'
#    and press Enter.
# find_element(By.ID, ...) replaces find_element_by_id, which recent Selenium
# releases no longer provide.
search = b.find_element(By.ID, 'q')
search.send_keys('雪糕')
search.send_keys(Keys.ENTER)
from selenium.webdriver import Chrome

# Capture the cookies of a manually logged-in Taobao session and save them to
# files/taobao.txt.

# 1. Create the browser object and open the site that needs the login.
b = Chrome()
b.get('https://www.taobao.com/')

# 2. Wait while the user logs in by hand in the opened browser window; the
#    script continues once Enter is pressed at the prompt.
input('是否完成:')

# 3. Read the session cookies and save them to a local file.
cookies = b.get_cookies()
# The cookies are stored as the str() text of the returned value. Writing
# under a context manager makes sure the data is flushed and the file closed.
with open('files/taobao.txt', 'w', encoding='utf-8') as cookie_file:
    cookie_file.write(str(cookies))
举报

相关推荐

0 条评论