Scraping 天风证券 (Tianfeng Securities) Fund Products

一叶轻舟okok · 2022-01-17 · 61 views
python crawler
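The script below walks the paginated fund-product list at https://www.tfzq.com/business/fund.html, pulls each product's code, name, NAV date, unit NAV, cumulative NAV, minimum first-purchase amount, and fund company out of the listing markup with a regex, and writes the collected rows to 天风.csv via a pandas DataFrame.
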
# -*- coding: utf-8 -*-
import re
import random
import time

import requests
import pandas as pd  # builds the output table
from bs4 import BeautifulSoup
# pool of User-Agent strings, rotated per request so the crawler looks less uniform
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
def get_page(url):
    """Fetch one listing page; return the decoded HTML, or None on failure."""
    headers = {'User-Agent': random.choice(my_headers)}  # pick a fresh UA per request
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content.decode('utf-8')  # decode explicitly to avoid mojibake
    return None  # let the caller skip pages that failed to download

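# If you want connection reuse across the listing pages, a requests.Session works
# as a drop-in for get_page. This is only a sketch (the helper name is ours); it
# uses just the standard requests APIs: Session(), session.get(), session.headers.
def get_page_session(session, url):
    session.headers['User-Agent'] = random.choice(my_headers)  # rotate UA per call
    response = session.get(url, timeout=10)
    return response.content.decode('utf-8') if response.status_code == 200 else None
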
def parse_html(html_content):
    # Row layout inside 'search_result_ul': a <li title="name"> cell, then six plain
    # <li> cells (code, NAV date, unit NAV, cumulative NAV, min purchase, company).
    pattern = re.compile(
        r'.*?search_result_ul.*?<li title="(.*?)".*?<li>(.*?)</li>.*?<li>(.*?)</li>'
        r'.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li.*?>(.*?)</li>',
        re.S)
    return re.findall(pattern, html_content)
def parse_html_list(html_content):
    # Helper (not used below): grab text between square brackets. The brackets must
    # be escaped, otherwise [...] is parsed as a character class, not literal brackets.
    pattern = re.compile(r'\[(.*?)\]', re.S)
    return re.findall(pattern, html_content)
def parse_html1(html):
    # Pull the total page count from the 'allPages' field in the page source.
    pattern = re.compile(r'.*?allPages.*?(\d+)', re.S)
    return re.findall(pattern, html)
base_url = 'https://www.tfzq.com/business/fund.html?page='
shuju = pd.DataFrame([], columns=['产品名称', '净值日期', '单位净值', '累计净值', '首次申购最低金额', '基金公司'])

# Read the total page count from the first page instead of looping blindly.
first_page = get_page(base_url + '1')
counts = parse_html1(first_page) if first_page else []
total_pages = int(counts[0]) if counts else 1000  # fall back to the original hard-coded bound

for i in range(1, total_pages + 1):
    html_content = get_page(base_url + str(i))
    if html_content is None:
        continue  # skip pages that failed to download
    soup = BeautifulSoup(html_content, 'lxml')
    container = soup.find(class_='items')  # the listing block that holds the product rows
    if container is None:
        continue
    rows = parse_html(str(container))
    print('page %d: %d rows' % (i, len(rows)))
    for item in rows:
        # item = (name, code, NAV date, unit NAV, cumulative NAV, min purchase, company)
        row_key = '代码:' + item[1]  # index rows by product code to de-duplicate
        shuju.loc[row_key, '产品名称'] = item[0]
        shuju.loc[row_key, '净值日期'] = item[2]
        shuju.loc[row_key, '单位净值'] = item[3]
        shuju.loc[row_key, '累计净值'] = item[4]
        shuju.loc[row_key, '首次申购最低金额'] = item[5]
        shuju.loc[row_key, '基金公司'] = item[6]
    time.sleep(random.uniform(1, 2))  # pause between pages to be polite to the server

shuju.to_csv('天风.csv', encoding='utf-8-sig')  # BOM so Excel displays the Chinese headers
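
The one long regex in parse_html is brittle: any change to the listing markup silently breaks all seven capture groups at once. A minimal alternative sketch, assuming the same row structure the regex already assumes (a <li title="..."> name cell followed by six plain <li> cells inside the class="items" container; the helper name parse_rows_bs is ours), lets BeautifulSoup walk the cells directly:

def parse_rows_bs(container):
    """Sketch: extract the same seven fields per row with BeautifulSoup."""
    rows = []
    for li in container.find_all('li', attrs={'title': True}):
        cells = [li.get('title').strip()]
        sibling = li.find_next_sibling('li')
        while sibling is not None and len(cells) < 7:  # six data cells follow the name cell
            cells.append(sibling.get_text(strip=True))
            sibling = sibling.find_next_sibling('li')
        if len(cells) == 7:
            rows.append(tuple(cells))
    return rows

The tuples come back in the same field order as parse_html, so parse_rows_bs(container) can stand in for parse_html(str(container)) in the loop above; if the real markup nests the cells differently, it is the sibling walk that needs adjusting rather than a regex.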
