python 网页数据解析
文章目录
XPath解析
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-0 item-1"><a href="link5.html">attributeMultiValueMatching</a></li>
<li class="item-0 item-1" name ='multiple-attributes'><a href="link5.html">multiple-attributes</a></li>
</ul>
</div>
from cgitb import reset
from lxml import etree
def etreeParse():
text = '''
<div>
<u1>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a</li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</u1>
</div>
'''
html = etree.HTML(text)
result = etree.tostring(html)
print(result.decode('utf-8'))
def etreeParseFromFile():
html = etree.parse('test.html',etree.HTMLParser())
result = html.xpath('//*')
result2 = html.xpath('//li')
result3 = html.xpath('//li/a')
result4 = html.xpath('//li//a')
print(result4)
html = etree.parse('./test.html',etree.HTMLParser())
def etreeParseParentNode():
result = html.xpath('//a[@href="link4.html"]/../@class')
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
result = html.xpath('//li[@class="item-0"]')
print(result)
def etreeParseTextNode():
result = html.xpath('//li[@class="item-0"]/a/text()')
result = html.xpath('//li[@class="item-0"]//text()')
print(result)
def etreeParseNodeProperties():
result = html.xpath('//li/@class')
print(result)
def attributeMultiValueMatching():
result = html.xpath('//li[contains(@class,"item-0")]/a/text()')
print(result)
def multipleAttributes():
result = html.xpath('//li[contains(@class,"item-0") and @name = "multiple-attributes"]/a/text()')
print(result)
def selectInOrder():
result = html.xpath('//li[1]/a/text()')
print(result)
result = html.xpath('//li[last()]/a/text()')
print(result)
result = html.xpath('//li[position()<3]/a/text()')
print(result)
result = html.xpath('//li[last()-2]/a/text()')
print(result)
def nodeAxisSelection():
result = html.xpath('//li[1]/ancestor::')
print(result)
result = html.xpath('//li[1]/ancestor::div')
print(result)
result = html.xpath('//li[1]/attribute::')
print(result)
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
result = html.xpath('//li[1]/descendant::span')
print(result)
result = html.xpath('//li[1]/following::[2]')
print(result)
result = html.xpath('//li[1]/following-sibling::*')
print(result)
etreeParseFromFile()
BeautifulSoup解析
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
def BeautifulSoupInit():
print(soup.prettify())
print(soup.title.string)
def nodeSelector():
print(soup.title)
print(type(soup.title))
print(soup.title.string)
print(soup.head)
print(soup.p)
print(soup.title.name)
print(soup.p.attrs)
print(soup.p.attrs['name'])
print(soup.p['name'])
print(soup.p['class'])
def relatedSelection():
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents)
print(soup.p.children)
for i,child in enumerate(soup.p.children):
print(i,child)
for i,child in enumerate(soup.p.descendants):
print(i,child)
def queryTheFatherNode():
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.parents)))
def querySiblingNodes():
html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
"""
soup = BeautifulSoup(html, 'lxml')
print('Next Sibling', soup.a.next_sibling)
print('Prev Sibling', soup.a.previous_sibling)
print('Next Siblings', list(enumerate(soup.a.next_siblings)))
print('Prev Siblings', list(enumerate(soup.a.previous_siblings)))
def extractInfo():
html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Bob</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
"""
soup = BeautifulSoup(html, 'lxml')
print('Next Sibling:')
print(type(soup.a.next_sibling))
print(soup.a.next_sibling)
print(soup.a.next_sibling.string)
print('Parent:')
print(type(soup.a.parents))
print(list(soup.a.parents)[0])
print(list(soup.a.parents)[0].attrs['class'])
def methodSelector():
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name='ul'))
print(type(soup.find_all(name='ul')[0]))
for ul in soup.find_all(name='ul'):
for li in ul.find_all(name='li'):
print(li.string)
def methodSelectorByAttrs():
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name = "elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
print(soup.find_all(id='list-1'))
def methodSelectorByString():
html = '''
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(string=re.compile('link')))
def methodSelectorFind():
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find(name='ul'))
print(soup.find(attrs={'class': 'list'}))
def cssSelector():
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading'))
print(soup.select('#list-1 .element'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
print(type(soup.select('ul')[0]))
def nestedSelector():
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
print(ul.select('li'))
for ul in soup.select('ul'):
print(ul['id'])
print(ul.attrs['id'])
nestedSelector()
PyQuery 解析
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
def init():
doc = pq(html)
print(doc('li'))
def init_url():
url = 'http://cuiqingcai.com'
doc = pq(url=url)
print(doc('title'))
def basic_selector():
doc = pq(html)
print(doc('#container .list li'))
print(type(doc('#container .list li')))
def find_child():
doc = pq(html)
items = doc('.list')
print(items)
print(type(items))
lis = items.children()
print(lis)
print(type(lis))
lis = items.children('.active')
print(lis)
def find_parent():
doc = pq(html)
item = doc('.list')
container = item.parent()
print(container)
print(type(container))
def find_grand_parent():
doc = pq(html)
item = doc('.list')
container = item.parents()
print(container)
print(type(container))
def find_sibling():
doc = pq(html)
item = doc('.list .item-0.active')
siblings = item.siblings()
print(siblings)
siblings = item.siblings('.active')
print(siblings)
def return_multiple_nodes():
doc = pq(html)
items = doc('.list .item-0').items()
for item in items:
print(str(item))
def get_attr():
doc = pq(html)
item = doc('.item-0.active a')
print(item,type(item))
print(item.attr('href'))
print(item.attr.href)
a = doc('a')
print(a)
for i in a.items():
print(i.attr.href)
def get_text():
doc = pq(html)
item = doc('.item-0.active a')
print(item.text())
li = doc('li')
print(li.text())
print(li.html())
def node_operation():
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.addClass('new-class')
print(li)
li.removeClass('active')
print(li)
def other_node_operation():
html = '''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link')
print(li)
li.text('changed item')
print(li)
li.html('<span>changed item</span>')
print(li)
def pseudo_selector():
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
li = doc('li:first-child')
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)')
print(li)
li = doc('li:gt(2)')
print(li)
li = doc('li:nth-child(2n)')
print(li)
li = doc('li:contains(second)')
print(li)
other_node_operation()