0
点赞
收藏
分享

微信扫一扫

【正则表达式练习】爬取城市编码

捡历史的小木板 2022-01-24 阅读 76

爬取城市编码

import requests
from bs4 import BeautifulSoup
import re 
import os

# Base site and the province listing page used as the crawl entry point.
base_url = 'http://www.weather.com.cn'
city_referer_url = 'http://www.weather.com.cn/textFC/hb.shtml'

# Extracts the city code from URLs like
#   http://www.weather.com.cn/weather/101010100.shtml  ->  '101010100'
# Raw string + escaped dot: the original '.shtml' matched ANY character
# before 'shtml' (e.g. '...100Xshtml'), not just a literal dot.
code_regex = re.compile(r'^.*?weather/(.*?)\.shtml$', re.S)  # regex for city code
# Output file, written into the current working directory.
save_file_name = os.path.join(os.getcwd(), 'city_codes.txt')
# Accumulates 'city_name:city_code' strings across all region pages.
city_code_list = []

def fetch_city_url_list():
    """Fetch the navigation links to every region forecast page.

    Scrapes the 'lqcontentBoxheader' div of the Hebei text-forecast
    page, which links to all region pages of the site.

    Returns:
        list[str]: absolute URLs of region forecast pages (empty if
        the expected div is missing from the page).
    """
    city_url_list = []
    # timeout: requests has no default timeout and would hang forever
    # on a stalled connection.
    resp = requests.get(city_referer_url, timeout=10)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'lxml')
    content = bs.find('div', attrs={'class': 'lqcontentBoxheader'})
    if content is not None:
        # find_all() always returns a list (possibly empty), so the
        # original 'is not None' check on it was dead code.
        for a in content.find_all('a'):
            href = a.get('href')
            if href is not None:  # skip anchors without an href attribute
                city_url_list.append(base_url + href)
    return city_url_list

def fetch_city_weather_url_list(url):
    """Collect 'city_name:city_code' entries from one region page.

    Parses every anchor inside 'div.conMidtab', skips the navigation
    links ('详情' / '返回顶部'), and appends 'name:code' to the
    module-level city_code_list for each matching weather-detail link.

    Args:
        url: absolute URL of a region text-forecast page.
    """
    # timeout: bound each request so the whole crawl cannot hang.
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'lxml')
    for a in bs.select('div.conMidtab a'):
        href = a.get("href")  # hoisted: original called a.get twice per anchor
        if href is None or a.text == '详情' or a.text == '返回顶部':
            continue
        result = code_regex.match(href)
        if result is not None:
            city_code_list.append(a.text + ":" + result.group(1))

def write_list_to_file(data, path=None):
    """Write each item of *data* on its own line, UTF-8 encoded.

    Args:
        data: iterable of strings to persist.
        path: destination file; defaults to the module-level
            save_file_name when omitted (backward compatible with the
            original one-argument signature).
    """
    target = save_file_name if path is None else path
    try:
        # 'w' suffices: the file is only written, never read back,
        # so the original 'w+' mode was unnecessary.
        with open(target, "w", encoding='utf-8') as f:
            for content in data:
                f.write(content + "\n")
    except OSError as reason:
        # Best-effort: report the failure and continue rather than
        # abort after the whole crawl has already run.
        print(str(reason))

if __name__ == '__main__':
    # Crawl every region page, then persist all collected city codes.
    for region_url in fetch_city_url_list():
        print("解析:", region_url)
        fetch_city_weather_url_list(region_url)
    write_list_to_file(city_code_list)


举报

相关推荐

0 条评论