from lxml import etree
wb_data = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""
# Parse the string into an HTML element tree. The HTML parser completes the
# fragment with the missing <html>/<body> wrappers, which is why the absolute
# XPath below can start at /html/body.
html = etree.HTML(wb_data)

# Goal: extract the text of every <a> element.
# Approach 1: select the <a> elements, then read each element's .text attribute.
data1 = html.xpath('/html/body/div/ul/li/a')
for anchor in data1:
    print(anchor.text)

# Approach 2: use the XPath text() node test, which yields the strings directly.
data2 = html.xpath('//a/text()')
print(data2)
# Parse an HTML *file*. etree.parse() uses the XML parser by default, so an
# explicit HTML parser is supplied instead.
parser = etree.HTMLParser(encoding='utf-8')
# Parse the HTML file into an element tree.
html2 = etree.parse('123.htm', parser=parser)

# Optional: serialize the tree back to bytes, decode, and print it.
# html_data = etree.tostring(html2, pretty_print=True)
# res = html_data.decode('utf-8')
# print(res)

# Optional: match every element and print its text.
# data3 = html2.xpath('//*')
# for node in data3:
#     if hasattr(node, 'text'):
#         print(node.text)

# Print the movie names: the [@title] predicate selects only the <div>
# elements that carry a title attribute.
data4 = html2.xpath('//div[@title]')
for div in data4:
    # "." is the current <div>; fetch the value(s) of its title attribute.
    title = div.xpath('./@title')
    # Fetch the text nodes that are direct children of the current <div>.
    desc = div.xpath('./text()')
    # Both xpath() calls return lists, so lists are what gets printed.
    print(title, "\t\t\t\t", desc)