BeautifulSoup框架常用方法-CFANZ编程社区

BeautifulSoup框架常用方法
#__author__ = 'DouYunQian'
#coding=utf-8
#
# Walkthrough of common BeautifulSoup lookups against a small sample
# document. Each query is printed; the accompanying comment shows the
# output the original author recorded for it.

import re

from bs4 import BeautifulSoup

# Sample markup. The whitespace inside the literal (leading spaces, blank
# lines) is part of the parsed document, so it is kept exactly as written.
html = """

 <html><head><title>The Dormouse's story</title></head>

 <body>

 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sisters; and their names were

 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

 and they lived at the bottom of a well.</p>

 <p class="story2">...</p>

 """

soup = BeautifulSoup(html, "html.parser")

# --- Navigating by tag name ------------------------------------------------
print(soup.title)  # <title>The Dormouse's story</title>
print(soup.title.string)  # The Dormouse's story
print(soup.title.parent)  # <head><title>The Dormouse's story</title></head>

# Attribute-style access returns the first tag with that name:
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
print(soup.p)
# <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
print(soup.a)
print(soup.p['class'])  # ['title']

# --- Searching with find_all() / find() ------------------------------------
print(soup.find_all("a"))  # a list holding every <a> tag

# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(soup.find(id="link2"))
# Lacie -- note that .string does not work well once other tags are nested
# inside the element; get_text() below does not have that limitation.
print(soup.find(id="link2").string)
print(soup.find(id="link2").get_text())  # Lacie

# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
print(soup.find("p", class_="title"))
print(soup.find("p", {"class": "story2"}))  # <p class="story2">...</p>
# get_text() collects the text between the tags no matter how many child
# tags the element contains.
print(soup.find("p", {"class": "story"}).get_text())

print("===================")
# A compiled pattern passed as the name filter is matched against tag names;
# in this document "^b" selects <body> and <b>.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

print("=============")
# Collect every tag whose href attribute matches the pattern.
# NOTE(review): the result is stored in all_href but never printed.
all_href = soup.find_all(href=re.compile("http://example.com/.+"))
0 条评论