Simple Python crawler development (urllib2 / requests + BeautifulSoup)

yellowone 2022-11-20


# Part 1: Downloading web page content
# 1. urllib (Python 2's urllib2 was merged into urllib in Python 3) -- download method 1: the simplest approach
from urllib import request

# Issue the request directly
with request.urlopen('http://www.baidu.com') as f:  # in Python 2: urllib2.urlopen
    data = f.read()
    print('Status:', f.status, f.reason)  # status code and reason phrase
    for k, v in f.getheaders():  # all response headers
        print('%s : %s' % (k, v))
    print(data.decode('utf-8'))  # without decode() the output is raw bytes littered with \r\n
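# urlopen() raises an exception when the request fails; a minimal guard
# (a sketch -- the timeout value is an arbitrary choice, not from the original):
from urllib import error

try:
    with request.urlopen('http://www.baidu.com', timeout=10) as f:
        data = f.read()
except error.URLError as e:  # HTTPError is a subclass, so this catches both
    print('request failed:', e.reason)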



# 2. urllib POST request: send POST data and add headers to spoof the client (the example below fakes an iPhone User-Agent)
from urllib import request, parse

req = request.Request('http://localhost/index.php?zz=1')
post_data = parse.urlencode([
    ('aa', '11'),
    ('bb', '22'),
]).encode('utf-8')
req.add_header('User-Agent', 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1')
with request.urlopen(req, post_data) as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s : %s' % (k, v))
    print(data.decode('utf-8'))
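# The same POST with the third-party requests library named in the title
# (a sketch; assumes `pip install requests` -- requests form-encodes the data
# dict and merges the headers dict for you):
import requests

resp = requests.post('http://localhost/index.php?zz=1',
                     data={'aa': '11', 'bb': '22'},
                     headers={'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'})
print('Status:', resp.status_code, resp.reason)
print(resp.text)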




# 3. urllib handlers: HTTPCookieProcessor (remember cookies), ProxyHandler (access through a proxy), HTTPSHandler (HTTPS access), HTTPRedirectHandler (follow redirects)
# Taking HTTPCookieProcessor as the example (a ProxyHandler sketch follows below):
import urllib.request
import urllib.parse
import http.cookiejar

url = "http://c.highpin.cn/Users/CLogin"
postdata = urllib.parse.urlencode({
    "Logon_Password": "sunmin",
    "Logon_PostCode": "fghc",
    "Logon_RememberMe": "false",
    "Logon_UserEmail": "sun121@qq.com"
}).encode('utf-8')
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "utf-8",
    "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
    "Host": "c.highpin.cn",
    "Referer": "http://c.highpin.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0"
}
req = urllib.request.Request(url, postdata, header)
##print(urllib.request.urlopen(req).read().decode('utf-8'))

# Cookie handling
cj = http.cookiejar.CookieJar()  # create the cookie container
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))  # build an opener that stores cookies
r = opener.open(req)  # open the page; cookies the server sets are kept in cj
print(r.read().decode('utf-8'))
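# The other handlers listed above plug into build_opener() the same way.
# A minimal ProxyHandler sketch (the proxy address 127.0.0.1:8080 is a
# placeholder, not a real server):
proxy_opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
)
# r = proxy_opener.open('http://www.baidu.com')  # traffic goes through the proxy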


# Part 2: HTML parsers
# Options: regular expressions, html.parser, Beautiful Soup (on top of html.parser or lxml), lxml
# Beautiful Soup:
# Workflow: HTML page --> create a BeautifulSoup object --> search nodes with find (first node that matches, by tag name, attributes, or node text) / find_all (every node that matches, same criteria)
#       --> access the node's name, attributes, and text


# Step 1: create the BeautifulSoup object; the node we will search for looks like <a href="123.html" class="article_link">Python</a>
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(
    'HTML document string',  # the HTML you downloaded, i.e. the static page content
    'html.parser',           # the parser to use (lxml is an alternative)
    from_encoding='utf-8'    # document encoding; if it differs from the actual HTML, parsing produces mojibake
)

# Step 2: search nodes with find / find_all(name, attrs, string); both take the same arguments
# Find every <a> tag
soup.find_all('a')
# Find every <a> tag whose link has the form /view/123.htm
soup.find_all('a', href='/view/123.htm')
soup.find_all('a', href=re.compile(r'/view/\d+\.htm$'))  # a regex can be passed in
# Find every <div> whose class is "abc" and whose text is "Python"
soup.find_all('div', class_='abc', string='Python')  # class is a Python keyword, hence the trailing "_"; the first argument is the tag name, string matches the node text, and any other attribute can be added as a keyword argument
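# find() takes the same arguments but returns only the first match (or None
# when nothing matches), which is convenient when you expect a single node:
node = soup.find('a', href=re.compile(r'/view/\d+\.htm$'))
if node is not None:
    print(node.name, node['href'], node.get_text())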


# A real example:
from urllib import request
from bs4 import BeautifulSoup

with request.urlopen('http://home.eaglive.com') as f:
    data = f.read()
    print('%s : %s' % (f.status, f.reason))

soup = BeautifulSoup(data, 'html.parser', from_encoding='utf-8')
all_a = soup.find_all('a')
for a in all_a:
    print(a.name, a.get('href'), a.get_text())  # tag name a.name, attribute a.get('name'), text a.get_text()
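# The same crawl with requests + BeautifulSoup, matching the pairing in the
# title (a sketch; assumes requests is installed):
import requests
from bs4 import BeautifulSoup

resp = requests.get('http://home.eaglive.com')
print('%s : %s' % (resp.status_code, resp.reason))
soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='utf-8')
for a in soup.find_all('a'):
    print(a.name, a.get('href'), a.get_text())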


