Simple Python crawler development (urllib2 / requests + BeautifulSoup)

yellowone 2022-11-20


# Part 1: Downloading web page content
# 1. urllib (Python 2's urllib2 was merged into urllib in Python 3) -- download method 1: the simplest approach
from urllib import request

# Issue the request directly
with request.urlopen('http://www.baidu.com') as f:  # in Python 2: urllib2.urlopen
    data = f.read()
    print('Status:', f.status, f.reason)  # status code and reason phrase
    for k, v in f.getheaders():  # all response headers
        print('%s : %s' % (k, v))
    print(data.decode('utf-8'))  # without decode() the output is raw bytes littered with \r\n
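# urlopen() raises an exception when the request fails; a minimal guard
# (a sketch -- the timeout value is an arbitrary choice, not from the original):
from urllib import error

try:
    with request.urlopen('http://www.baidu.com', timeout=10) as f:
        data = f.read()
except error.URLError as e:  # HTTPError is a subclass, so this catches both
    print('request failed:', e.reason)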



# 2. urllib POST request: send POST data and add headers to spoof the client (the example below fakes an iPhone User-Agent)
from urllib import request, parse

req = request.Request('http://localhost/index.php?zz=1')
post_data = parse.urlencode([
    ('aa', '11'),
    ('bb', '22'),
]).encode('utf-8')
req.add_header('User-Agent', 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1')
with request.urlopen(req, post_data) as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s : %s' % (k, v))
    print(data.decode('utf-8'))
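# The same POST with the third-party requests library named in the title
# (a sketch; assumes `pip install requests` -- requests form-encodes the data
# dict and merges the headers dict for you):
import requests

resp = requests.post('http://localhost/index.php?zz=1',
                     data={'aa': '11', 'bb': '22'},
                     headers={'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'})
print('Status:', resp.status_code, resp.reason)
print(resp.text)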




# 3. urllib handlers: HTTPCookieProcessor (remember cookies), ProxyHandler (access through a proxy), HTTPSHandler (HTTPS access), HTTPRedirectHandler (follow redirects)
# Taking HTTPCookieProcessor as the example (a ProxyHandler sketch follows below):
import urllib.request
import urllib.parse
import http.cookiejar

url = "http://c.highpin.cn/Users/CLogin"
postdata = urllib.parse.urlencode({
    "Logon_Password": "sunmin",
    "Logon_PostCode": "fghc",
    "Logon_RememberMe": "false",
    "Logon_UserEmail": "sun121@qq.com"
}).encode('utf-8')
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "utf-8",
    "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
    "Host": "c.highpin.cn",
    "Referer": "http://c.highpin.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0"
}
req = urllib.request.Request(url, postdata, header)
##print(urllib.request.urlopen(req).read().decode('utf-8'))

# Cookie handling
cj = http.cookiejar.CookieJar()  # create the cookie container
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))  # build an opener that stores cookies
r = opener.open(req)  # open the page; cookies the server sets are kept in cj
print(r.read().decode('utf-8'))
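# The other handlers listed above plug into build_opener() the same way.
# A minimal ProxyHandler sketch (the proxy address 127.0.0.1:8080 is a
# placeholder, not a real server):
proxy_opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
)
# r = proxy_opener.open('http://www.baidu.com')  # traffic goes through the proxy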


# Part 2: HTML parsers
# Options: regular expressions, html.parser, Beautiful Soup (on top of html.parser or lxml), lxml
# Beautiful Soup:
# Workflow: HTML page --> create a BeautifulSoup object --> search nodes with find (first node that matches, by tag name, attributes, or node text) / find_all (every node that matches, same criteria)
#       --> access the node's name, attributes, and text


# Step 1: create the BeautifulSoup object; the node we will search for looks like <a href="123.html" class="article_link">Python</a>
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(
    'HTML document string',  # the HTML you downloaded, i.e. the static page content
    'html.parser',           # the parser to use (lxml is an alternative)
    from_encoding='utf-8'    # document encoding; if it differs from the actual HTML, parsing produces mojibake
)

# Step 2: search nodes with find / find_all(name, attrs, string); both take the same arguments
# Find every <a> tag
soup.find_all('a')
# Find every <a> tag whose link has the form /view/123.htm
soup.find_all('a', href='/view/123.htm')
soup.find_all('a', href=re.compile(r'/view/\d+\.htm$'))  # a regex can be passed in
# Find every <div> whose class is "abc" and whose text is "Python"
soup.find_all('div', class_='abc', string='Python')  # class is a Python keyword, hence the trailing "_"; the first argument is the tag name, string matches the node text, and any other attribute can be added as a keyword argument
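# find() takes the same arguments but returns only the first match (or None
# when nothing matches), which is convenient when you expect a single node:
node = soup.find('a', href=re.compile(r'/view/\d+\.htm$'))
if node is not None:
    print(node.name, node['href'], node.get_text())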


# A real example:
from urllib import request
from bs4 import BeautifulSoup

with request.urlopen('http://home.eaglive.com') as f:
    data = f.read()
    print('%s : %s' % (f.status, f.reason))

soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
all_a = soup.find_all('a')
for a in all_a:
    print(a.name, a.get('href'), a.get_text())  # tag name a.name, attribute a.get('name'), text a.get_text()
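# The same crawl with requests + BeautifulSoup, matching the pairing in the
# title (a sketch; assumes requests is installed):
import requests
from bs4 import BeautifulSoup

resp = requests.get('http://home.eaglive.com')
print('%s : %s' % (resp.status_code, resp.reason))
soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='utf-8')
for a in soup.find_all('a'):
    print(a.name, a.get('href'), a.get_text())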


