Delphi解析html取想要的内容
抓取到网页后,有时想快速取得其中需要的内容。在Delphi中怎么实现呢?一般的做法是找到关键位置,复制出这段区间,再替换掉不想要的内容,一点点地取,这种方式比较麻烦。这里找到一个可用于Delphi解析html的库,能帮我们加快速度
Delphi Dom HTML Parser and Converter
https://sourceforge.net/projects/htmlp/
下面我们来看看怎么使用
效果
样本HTML
test.html
<html>
<head>
</head>
<body>
<div class="ina_silde_right12">
<div class="ina_icon ina_bt"><h3>召回公报</h3><a href="/recall/anvanced_search.html" target="_blank">召回公报搜索</a></div>
<div class="ina_zhgb"><span class="ina_zh">召回公报</span><span class="ina_sz">所在总成</span><span class="ina_date">发布时间</span></div>
<ul>
<li><span class="ina_zh"><a href="/recall/20151225/recall_9607E8_1068.html" target="_blank">A股份有限公司召回部分奔奔汽车</a></span><span class="ina_sz">转向系</span><span>2015-12-25</span></li>
<li><span class="ina_zh"><a href="/recall/20151224/recall_8C4EF1_1067.html" target="_blank">B召回部分进口劳恩斯酷派汽车</a></span><span class="ina_sz">车身</span><span>2015-12-24</span></li>
<li><span class="ina_zh"><a href="/recall/20151224/recall_042E43_1066.html" target="_blank">C国际贸易有限公司召回部分进口斯伯特野玛汽车</a></span><span class="ina_sz">发动机</span><span>2015-12-24</span></li>
<li><span class="ina_zh"><a href="/recall/20151222/recall_DFFD64_1065.html" target="_blank">D有限公司召回部分进口迷你汽车</a></span><span class="ina_sz">转向系</span><span>2015-12-22</span></li>
<li><span class="ina_zh"><a href="/recall/20151222/recall_E28E97_1064.html" target="_blank">E易有限公司召回部分进口劳斯莱斯幻影汽车</a></span><span class="ina_sz">气囊和安全带</span><span>2015-12-22</span></li>
</ul>
</div>
</body>
</html>
实现源码
先选择文件,再打开文件读取内容(按UTF-8解码),调用THtmlParser进行解析,把取得的内容放入TListView进行显示。可以看到,这样很容易就能得到想要的内容
unit HtmlParserTestMain;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics,
Controls, Forms, Dialogs, StdCtrls,
HTMLParser, Grids, ComCtrls;
type
{ Main form of the HTML parser demo: the user picks a file, then the form
  parses it and lists the extracted values in a list view. }
TForm16 = class(TForm)
// Button whose OnClick handler (btn1Click) parses the selected file.
btn1: TButton;
// Report-style list that receives one row per extracted link.
lv1: TListView;
// Holds the path of the HTML file to load; read by LoadHtmlFile.
FileNameEdit: TEdit;
// Button whose OnClick handler (Button1Click) opens the file dialog.
Button1: TButton;
// File chooser used by Button1Click to fill FileNameEdit.
OpenDialog1: TOpenDialog;
// Parses the file named in FileNameEdit and fills lv1.
procedure btn1Click(Sender: TObject);
// Lets the user choose a file and stores its name in FileNameEdit.
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
// Parser instance; created and freed inside btn1Click on every click.
HTMLParser: THtmlParser;
// Reads the file named in FileNameEdit.Text and returns it decoded from UTF-8.
function LoadHtmlFile: WideString;
public
{ Public declarations }
end;
var
Form16: TForm16;
implementation
uses
DomCore, Formatter;
{$R *.dfm}
{ Reads the file named in FileNameEdit.Text and returns its contents decoded
  from UTF-8 as a WideString.

  Returns '' for an empty file. A leading UTF-8 byte order mark, if present,
  is removed from the result.

  Raises the stream's own exception (for example EFOpenError) when the file
  cannot be opened, and EReadError when it cannot be read completely. }
function TForm16.LoadHtmlFile: WideString;
var
  F: TFileStream;
  S: AnsiString;
  Size: Integer;
begin
  Result := '';
  // Allow other readers while we hold the file; only writers are blocked.
  F := TFileStream.Create(FileNameEdit.Text, fmOpenRead or fmShareDenyWrite);
  try
    Size := F.Size;
    // S[1] must not be touched for an empty file: there is no first byte.
    if Size > 0 then
    begin
      SetLength(S, Size);
      // ReadBuffer raises instead of silently returning a short read.
      F.ReadBuffer(S[1], Size);
      Result := UTF8Decode(S);
      // Drop the BOM so it is not handed to the parser as document text.
      if (Length(Result) > 0) and (Result[1] = WideChar($FEFF)) then
        Delete(Result, 1, 1);
    end;
  finally
    F.Free;
  end;
end;
{ Shows the open-file dialog and, when the user confirms a choice, copies
  the chosen file name into FileNameEdit. Does nothing if the dialog is
  cancelled. }
procedure TForm16.Button1Click(Sender: TObject);
begin
  if not OpenDialog1.Execute then
    Exit;
  FileNameEdit.Text := OpenDialog1.FileName;
end;
{ Parses the HTML file named in FileNameEdit and fills lv1 with one row per
  <span class="ina_zh"> element whose first child is an <a> element.

  Columns written per row:
    caption    - value of the link's href attribute
    sub-item 1 - text of the link
    sub-item 2 - text of the third child of the span's parent node

  Missing attributes or child nodes produce empty cells instead of an access
  violation. Errors raised while loading or parsing the file are no longer
  swallowed; they propagate to the caller. The list is cleared first so that
  repeated clicks do not append duplicate rows. }
procedure TForm16.btn1Click(Sender: TObject);

  { Returns the value of Node's first child node, or '' when Node is nil or
    has no children. Attribute and element text are read this way here. }
  function FirstChildValue(Node: TNode): WideString;
  begin
    Result := '';
    if Assigned(Node) and (Node.childNodes.length > 0) then
      Result := Node.childNodes.Item(0).NodeValue;
  end;

var
  HtmlDoc: TDocument;
  list: TNodeList;
  I: Integer;
  span: TNode;
  href: TNode;
  dateNode: TNode;
  Item: TListItem;
begin
  // Initialise everything the finally block frees, so a failure part-way
  // through never frees an uninitialised reference.
  HtmlDoc := nil;
  list := nil;
  HTMLParser := THtmlParser.Create;
  try
    HtmlDoc := HTMLParser.parseString(LoadHtmlFile);
    if not Assigned(HtmlDoc) then
      Exit;

    list := HtmlDoc.getElementsByTagName('span');
    if not Assigned(list) then
      Exit;

    lv1.Items.BeginUpdate;
    try
      lv1.Items.Clear;
      for I := 0 to list.length - 1 do
      begin
        span := list.Item(I);

        // Only spans marked with class "ina_zh" carry a notice link.
        if FirstChildValue(span.attributes.getNamedItem('class')) <> 'ina_zh' then
          Continue;

        // The header row uses the same class but contains plain text only.
        if span.childNodes.length = 0 then
          Continue;
        href := span.childNodes.Item(0);
        if href.nodeName <> 'a' then
          Continue;

        Item := lv1.Items.Add;
        Item.Caption := FirstChildValue(href.attributes.getNamedItem('href'));
        Item.SubItems.Add(FirstChildValue(href));

        // The date is the third child of the enclosing node, when present.
        dateNode := nil;
        if Assigned(span.parentNode) and
          (span.parentNode.childNodes.length > 2) then
          dateNode := span.parentNode.childNodes.Item(2);
        Item.SubItems.Add(FirstChildValue(dateNode));
      end;
    finally
      lv1.Items.EndUpdate;
    end;
  finally
    // Same release order as before: node list, parser, then the document.
    list.Free;
    // Reset the field so it never points at a destroyed parser.
    FreeAndNil(HTMLParser);
    HtmlDoc.Free;
  end;
end;
end.
窗口UI布局
object Form16: TForm16
Left = 311
Top = 238
Caption = 'Html Parser Test'
ClientHeight = 431
ClientWidth = 834
Color = clBtnFace
Font.Charset = DEFAULT_CHARSET
Font.Color = clWindowText
Font.Height = -11
Font.Name = 'Tahoma'
Font.Style = []
OldCreateOrder = False
DesignSize = (
834
431)
PixelsPerInch = 96
TextHeight = 13
object btn1: TButton
Left = 751
Top = 8
Width = 75
Height = 41
Anchors = [akTop, akRight]
Caption = #35299#26512
TabOrder = 0
OnClick = btn1Click
end
object lv1: TListView
Left = 8
Top = 55
Width = 818
Height = 368
Anchors = [akLeft, akTop, akRight, akBottom]
Columns = <
item
Caption = 'href'
Width = 240
end
item
Caption = 'Content'
Width = 240
end
item
Caption = 'Date'
Width = 120
end>
ReadOnly = True
TabOrder = 1
ViewStyle = vsReport
end
object FileNameEdit: TEdit
Left = 8
Top = 16
Width = 649
Height = 33
Anchors = [akLeft, akTop, akRight]
TabOrder = 2
Text = 'FileNameEdit'
end
object Button1: TButton
Left = 663
Top = 8
Width = 75
Height = 41
Anchors = [akTop, akRight]
Caption = #36873#25321
TabOrder = 3
OnClick = Button1Click
end
object OpenDialog1: TOpenDialog
Left = 424
Top = 144
end
end
看是不是很简单呀