"""lxml XPath walkthrough: parse an HTML string and an HTML file, then query them.

The script prints the text of every <a> element found in an inline HTML
snippet, then parses the local file ``123.htm`` and prints the ``title``
attribute and direct text of every <div> that carries a ``title`` attribute.
"""
from lxml import etree

# Inline sample markup. The last <li> has no closing tag; the HTML parser
# used below is expected to repair that when it builds the tree.
# The pieces are joined by implicit string concatenation, so the value is a
# single string with one space between consecutive fragments.
wb_data = (
    ' <div> <ul>'
    ' <li class="item-0"><a href="link1.html">first item</a></li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-inactive"><a href="link3.html">third item</a></li>'
    ' <li class="item-1"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a>'
    ' </ul> </div> '
)

# Parse the string into an HTML element tree. etree.HTML() fills in the
# missing <html> and <body> wrappers, which is why the absolute XPath below
# can start at /html/body.
html = etree.HTML(wb_data)

# Extract the text of the <a> tags.
# Approach 1: select the elements, then read each element's .text attribute.
data1 = html.xpath('/html/body/div/ul/li/a')
for anchor in data1:
    print(anchor.text)

# Approach 2: let XPath's text() node test return the strings directly.
data2 = html.xpath('//a/text()')
print(data2)

# Parse a file. etree.parse() uses the XML parser by default, so an explicit
# HTML parser is supplied for the HTML document.
parser = etree.HTMLParser(encoding='utf-8')

# Parse the HTML file into an element tree. The path is relative to the
# current working directory; the call raises if the file is missing.
html2 = etree.parse('123.htm', parser=parser)

# Optional: serialize the tree back to bytes, decode, and print it.
# html_data = etree.tostring(html2, pretty_print=True)
# res = html_data.decode('utf-8')
# print(res)

# Optional: match every element in the document and print its text.
# data3 = html2.xpath('//*')
# for i in data3:
#     if hasattr(i, 'text'):
#         print(i.text)

# Print movie names. The [@title] predicate keeps only the <div> elements
# that have a title attribute.
# NOTE(review): assumes 123.htm is a movie listing page - confirm.
data4 = html2.xpath('//div[@title]')
for div in data4:
    # "." is the current <div>; @title selects its title attribute value.
    # xpath() returns a list, so title and desc are printed as lists.
    title = div.xpath('./@title')
    # Text nodes that are direct children of the current <div>.
    desc = div.xpath('./text()')
    print(title, "\t\t\t\t", desc)