Scraping 12306 ticket data with Python (for learning purposes only)

Prerequisite knowledge review:

1. The browser sends data (the request URL, the request method GET/POST, and the request data).

2. The server responds (a response status code and response data: binary? a file? JSON?). A minimal request/response sketch follows the list below.

The browser DevTools (F12) panels:

(1) Elements (the page's HTML, CSS and JS)

(2) Console (the JavaScript console)

(3) Sources (a folder-like view of the page's JS, CSS and other files)

(4) Network (every request the page sends shows up here)
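
To make the request/response cycle concrete, here is a minimal sketch (assuming only that the requests library is installed; httpbin.org is just a convenient echo service, not part of the original article) that sends a GET request, checks the status code, and uses the Content-Type header to decide whether the body is JSON, text, or binary:

import requests

resp = requests.get('https://httpbin.org/get')
print(resp.status_code)  # response status code, e.g. 200
content_type = resp.headers.get('Content-Type', '')
if 'application/json' in content_type:
    print(resp.json())                 # JSON data
elif 'text' in content_type:
    print(resp.text[:200])             # text / HTML data
else:
    print(len(resp.content), 'bytes')  # binary / file data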


Key question: how do we actually scrape the data?

1. Find the target site and issue the request: simulate it manually first and inspect the traffic with F12.

2. Analyze how the URL changes and extract the useful URL: which query parameters matter?

3. Extract the useful data.

4. Store the data.

Important: the Cookie must be included in the request headers (see the script below).


import requests
import json
import re


def send_request():
    """Query 12306 for remaining tickets and return the response."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/86.0.4240.198 Safari/537.36",
        'Cookie': '_uab_collina=166887116645595690734937; JSESSIONID=244D130E5A9F3E12C192DC46312A312C; BIGipServerotn=1173357066.50210.0000; fo=olipdttx3716b1sp-iI4aQsXQmaXos6pc5ihpawbqOhzCRy_L7RvfmA8KNy7lcKH_PlEz7UrUO9DhVskmtHN1Q42xDwJNoNaahHm5oXPzpv9tGOlEbsEnov8AfovMijDL4tlcXDB-4gBDZSYW7d_3tOd_q-1fGFUN1w51GdLErdSa4-r7lXGyy9dn84; RAIL_EXPIRATION=1669158604701; RAIL_DEVICEID=K0Sq-xEkOg-M-mR-_1I2M-W8Bo1YIjwbwMdLMWr8mQAI4h0o2_AR6bf0FRkqAprVnFDKHWinsjRdk2ZL3KOBPgWqsrEs_rG7fYeMIvur23pwloRb1RDtBVvJ2gO_eVD1bXsCYYBFk3iI4huLl7bTqvtO0rF6cSDM; BIGipServerpool_passport=149160458.50215.0000; guidesStatus=off; highContrastMode=defaltMode; cursorStatus=off; route=6f50b51faa11b987e576cdb301e545c4; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2022-11-22; _jc_save_toDate=2022-11-19; _jc_save_wfdc_flag=dc'
    }
    url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2022-11-22&leftTicketDTO.from_station=BJP&leftTicketDTO.to_station=TJP&purpose_codes=ADULT'
    resp = requests.get(url=url, headers=headers)
    resp.encoding = resp.apparent_encoding  # let requests guess the correct text encoding
    # print(resp.text)
    return resp
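
Step 2 of the checklist ("analyze how the URL changes") becomes easier if the query string is built from a params dict rather than hard-coded. Below is a sketch of the same request with the varying parts exposed as arguments (the headers dict, including the Cookie, is the one from the function above; the parameter names are copied from the URL, and the function name is my own):

def send_request_params(headers, train_date='2022-11-22',
                        from_station='BJP', to_station='TJP'):
    params = {
        'leftTicketDTO.train_date': train_date,      # travel date
        'leftTicketDTO.from_station': from_station,  # departure station code
        'leftTicketDTO.to_station': to_station,      # arrival station code
        'purpose_codes': 'ADULT',
    }
    resp = requests.get('https://kyfw.12306.cn/otn/leftTicket/query',
                        params=params, headers=headers)
    resp.raise_for_status()  # fail fast on a non-2xx status code
    resp.encoding = resp.apparent_encoding
    return resp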


# Extract the data
def parse_json(resp, city):
    json_ticket = resp.json()  # parse the response body as JSON
    data_list = json_ticket['data']['result']  # one '|'-separated string per train
    # Within each record: index 3 is the train number, 6 the departure station code,
    # 7 the arrival station code, 31 first-class seats, 30 second-class seats,
    # and 13 the travel date.
    lst = []  # collect one row per train
    for item in data_list:
        d = item.split('|')
        # Map the station codes back to Chinese names via the city dict
        lst.append([d[3], city[d[6]], city[d[7]], d[31], d[30], d[13]])
    return lst
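
The field indices (3, 6, 7, 31, 30, 13) were worked out by inspecting one record; a quick way to re-verify them is a throwaway snippet like the one below, dropped temporarily into parse_json, which prints every field of the first record with its index:

# Print each '|'-separated field of the first record with its index,
# to confirm which positions hold the train number, stations, seats and date.
d = data_list[0].split('|')
for i, field in enumerate(d):
    print(i, repr(field))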

def start():
    """Entry point: request, parse, filter and print."""
    lst = parse_json(send_request(), get_city())
    # Filter the data: drop trains whose first-class field is '无' (none) or empty
    for i in lst:
        if i[3] != '无' and i[3] != '':
            print(i)

# Get the station (city) information
def get_city():
    url = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9242'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/86.0.4240.198 Safari/537.36",
    }
    resp = requests.get(url=url, headers=headers)
    resp.encoding = 'utf-8'
    # Each match is a (Chinese station name, upper-case station code) pair
    stations = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', resp.text)
    # Turn the (name, code) tuples into a dict, then invert it to {code: name}
    stations_data = dict(stations)
    stations_d = {code: name for name, code in stations_data.items()}
    return stations_d
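
For reference, a record in station_name.js looks roughly like @bjb|北京北|VAP|beijingbei|bjb|0 (the exact field order here is my assumption for illustration, not taken from the article). The regex simply grabs every adjacent pair of a Chinese name followed by an upper-case code:

import re

# sample is a hypothetical two-station excerpt in the assumed format
sample = '@bjb|北京北|VAP|beijingbei|bjb|0@bjn|北京南|VNP|beijingnan|bjn|1'
pairs = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', sample)
print(pairs)  # [('北京北', 'VAP'), ('北京南', 'VNP')]
print({code: name for name, code in pairs})  # {'VAP': '北京北', 'VNP': '北京南'}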


if __name__ == '__main__':
    start()
    # get_city()


"D:\Program Files\Python38\python.exe" E:/project/project01/05-python爬取12306网站数据.py
['C2201', '北京南', '武清', '1', '有', '20221122']
['G2551', '北京南', '天津南', '5', '有', '20221122']
['C2557', '北京南', '武清', '2', '有', '20221122']
['C2557', '北京南', '天津', '1', '有', '20221122']
['G321', '北京南', '天津南', '10', '有', '20221122']
['G119', '北京南', '天津南', '有', '有', '20221122']
['C2569', '北京南', '武清', '3', '有', '20221122']
['C2569', '北京南', '天津', '2', '有', '20221122']
['G125', '北京南', '天津南', '13', '有', '20221122']
['G11', '北京南', '天津南', '有', '有', '20221122']
['G135', '北京南', '天津南', '13', '有', '20221122']
['G183', '北京南', '天津南', '有', '有', '20221122']
['C2051', '北京南', '天津', '3', '有', '20221122']
['G191', '北京南', '天津南', '有', '有', '20221122']
['C2095', '北京南', '天津', '3', '有', '20221122']
['C2225', '北京南', '武清', '1', '有', '20221122']
['C2063', '北京南', '天津', '4', '有', '20221122']
['C2227', '北京南', '武清', '3', '有', '20221122']
['C2227', '北京南', '天津', '2', '有', '20221122']
['G1049', '北京南', '天津南', '10', '有', '20221122']
['C2597', '北京南', '天津', '2', '有', '20221122']

Process finished with exit code 0


The four classic steps of any crawler:

1. Send the request

2. Get the data

3. Parse the data

4. Save the data (a sketch of this last step follows below)
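
The script above stops after step 3: it only prints the filtered rows. Here is a minimal sketch of step 4, writing the result list to a CSV file with the standard csv module (the function name, file name and column headers are my own choice, not part of the original script):

import csv

def save_data(lst, filename='tickets.csv'):
    # utf-8-sig writes a BOM so Excel renders the Chinese station names correctly
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['train', 'from', 'to', 'first_class', 'second_class', 'date'])
        writer.writerows(lst)

Calling save_data(lst) at the end of start() would persist the filtered rows.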



