0
点赞
收藏
分享

微信扫一扫

Delphi解析html取想要的内容

TiaNa_na 2023-08-19 阅读 49

Delphi解析html取想要的内容

      抓取到网页后,有时想快速取得其中想要的内容,用Delphi该怎么实现呢?一般的做法是找到关键位置,复制该区间,再替换掉不想要的内容,一点点地取,这种方式比较麻烦。这里找到一个可用于Delphi解析html的库,能帮助我们加快速度。

Delphi Dom HTML Parser and Converter

https://sourceforge.net/projects/htmlp/

下面我们来看看怎么使用

效果

Delphi解析html取想要的内容_Text

样本HTML

test.html

<html>
<head> 
</head>
<body> 
<div class="ina_silde_right12">
<div class="ina_icon ina_bt"><h3>召回公报</h3><a href="/recall/anvanced_search.html" target="_blank">召回公报搜索</a></div>
<div class="ina_zhgb"><span class="ina_zh">召回公报</span><span class="ina_sz">所在总成</span><span class="ina_date">发布时间</span></div>
<ul>
<li><span class="ina_zh"><a href="/recall/20151225/recall_9607E8_1068.html" target="_blank">A股份有限公司召回部分奔奔汽车</a></span><span class="ina_sz">转向系</span><span>2015-12-25</span></li>
<li><span class="ina_zh"><a href="/recall/20151224/recall_8C4EF1_1067.html" target="_blank">B召回部分进口劳恩斯酷派汽车</a></span><span class="ina_sz">车身</span><span>2015-12-24</span></li>
<li><span class="ina_zh"><a href="/recall/20151224/recall_042E43_1066.html" target="_blank">C国际贸易有限公司召回部分进口斯伯特野玛汽车</a></span><span class="ina_sz">发动机</span><span>2015-12-24</span></li>
<li><span class="ina_zh"><a href="/recall/20151222/recall_DFFD64_1065.html" target="_blank">D有限公司召回部分进口迷你汽车</a></span><span class="ina_sz">转向系</span><span>2015-12-22</span></li>
<li><span class="ina_zh"><a href="/recall/20151222/recall_E28E97_1064.html" target="_blank">E易有限公司召回部分进口劳斯莱斯幻影汽车</a></span><span class="ina_sz">气囊和安全带</span><span>2015-12-22</span></li>
</ul>
</div> 
</body>
</html>

实现源码

先选择文件,再打开文件读取内容,调用THtmlParser进行解析,把取得的内容放入TListView中显示。可以看到,这样很容易就能得到想要的内容。

unit HtmlParserTestMain;

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics,
  Controls, Forms, Dialogs, StdCtrls,
  HTMLParser, Grids, ComCtrls;

type
  { Main form of the HTML parser demo: the user picks a file, the file is
    parsed, and the extracted values are listed in lv1. The component fields
    and event handlers below are bound by name to the form's .dfm resource. }
  TForm16 = class(TForm)
    btn1: TButton;              // "parse" button; its OnClick is btn1Click
    lv1: TListView;             // receives one row per extracted link
    FileNameEdit: TEdit;        // path of the HTML file to load
    Button1: TButton;           // "select" button; its OnClick is Button1Click
    OpenDialog1: TOpenDialog;   // file picker shown by Button1Click
    // Loads and parses the file named in FileNameEdit, then fills lv1.
    procedure btn1Click(Sender: TObject);
    // Shows OpenDialog1 and copies the chosen file name into FileNameEdit.
    procedure Button1Click(Sender: TObject);

  private
    { Private declarations }
    // Parser instance used by btn1Click.
    HTMLParser: THtmlParser;
    // Reads the file named in FileNameEdit.Text and returns its decoded text.
    function LoadHtmlFile: WideString;
  public
    { Public declarations }
  end;

var
  Form16: TForm16;

implementation

uses
  DomCore, Formatter;

{$R *.dfm}

{ Reads the file named in FileNameEdit.Text and returns its text.

  The bytes are decoded as UTF-8. A leading byte-order mark is removed so it
  does not end up in the parsed document. If UTF8Decode yields an empty string
  for non-empty input (it does so for invalid UTF-8 on older RTLs), the bytes
  are converted through the default ANSI code page instead of silently
  returning nothing.

  Returns '' for an empty file.
  Raises the TFileStream open/read exceptions (e.g. EFOpenError, EReadError)
  when the file cannot be opened or fully read. }
function TForm16.LoadHtmlFile: WideString;
var
  F: TFileStream;
  S: AnsiString;
begin
  Result := '';
  // Deny only writers so the file can still be open in a viewer or editor.
  F := TFileStream.Create(FileNameEdit.Text, fmOpenRead or fmShareDenyWrite);
  try
    SetLength(S, F.Size);
    // S[1] is not a valid reference on an empty string, so skip the read.
    if Length(S) > 0 then
      // ReadBuffer raises on a short read instead of leaving S partly filled.
      F.ReadBuffer(S[1], Length(S));
  finally
    F.Free;
  end;

  if S = '' then
    Exit;

  Result := UTF8Decode(S);
  if Result = '' then
    // Not valid UTF-8: fall back to the default ANSI conversion.
    Result := WideString(S)
  else if Result[1] = WideChar($FEFF) then
    // Drop the decoded UTF-8 BOM.
    Delete(Result, 1, 1);
end;
{ OnClick handler of the "select" button: shows the open-file dialog and, when
  the user confirms, copies the chosen path into FileNameEdit. A cancelled
  dialog leaves the edit box unchanged. }
procedure TForm16.Button1Click(Sender: TObject);
begin
  if not OpenDialog1.Execute then
    Exit;
  FileNameEdit.Text := OpenDialog1.FileName;
end;
{ Returns the child of Node at Index, or nil when Node is nil or the index is
  outside the child list. }
function ChildAt(Node: TNode; Index: Integer): TNode;
begin
  Result := nil;
  if not Assigned(Node) then
    Exit;
  if (Index >= 0) and (Index < Node.childNodes.length) then
    Result := Node.childNodes.Item(Index);
end;

{ Returns the value of Node's first child (the text of an element or of an
  attribute), or '' when Node is nil or has no children. }
function FirstChildValue(Node: TNode): WideString;
var
  Child: TNode;
begin
  Result := '';
  Child := ChildAt(Node, 0);
  if Assigned(Child) then
    Result := Child.NodeValue;
end;

{ OnClick handler of the "parse" button.

  Loads the file named in FileNameEdit, parses it, and lists every
  <span class="ina_zh"> whose first child is an <a> element. Each row holds:
    caption    - the link's href attribute
    sub-item 1 - the link text
    sub-item 2 - the text of the third child of the span's parent
  Missing pieces are shown as empty strings instead of raising an access
  violation. Existing rows are cleared first so repeated clicks do not
  accumulate duplicates.

  Load and parse errors are no longer swallowed; they propagate to the
  application's default exception handler. }
procedure TForm16.btn1Click(Sender: TObject);
var
  HtmlDoc: TDocument;
  list: TNodeList;
  I: Integer;
  span: TNode;
  href: TNode;
  Item: TListItem;
begin
  // Initialised up front so the finally block can free them unconditionally.
  HtmlDoc := nil;
  list := nil;
  HTMLParser := THtmlParser.Create;
  try
    HtmlDoc := HTMLParser.parseString(LoadHtmlFile);
    if not Assigned(HtmlDoc) then
      Exit;

    list := HtmlDoc.getElementsByTagName('span');

    lv1.Items.BeginUpdate;
    try
      lv1.Items.Clear;
      for I := 0 to list.length - 1 do
      begin
        span := list.Item(I);
        if FirstChildValue(span.attributes.getNamedItem('class')) <> 'ina_zh' then
          Continue;

        // Header cells use the same class but contain plain text, not a link.
        href := ChildAt(span, 0);
        if not Assigned(href) then
          Continue;
        if href.nodeName <> 'a' then
          Continue;

        Item := lv1.Items.Add;
        Item.Caption := FirstChildValue(href.attributes.getNamedItem('href'));
        Item.SubItems.Add(FirstChildValue(href));
        // Third sibling cell of the row (index 2 under the span's parent).
        Item.SubItems.Add(FirstChildValue(ChildAt(span.parentNode, 2)));
      end;
    finally
      lv1.Items.EndUpdate;
    end;
  finally
    // Same release order as before: node list, parser, then the document.
    list.Free;
    // Reset the field so it never points at a destroyed parser.
    FreeAndNil(HTMLParser);
    HtmlDoc.Free;
  end;
end;

end.

窗口UI布局

object Form16: TForm16
  Left = 311
  Top = 238
  Caption = 'Html Parser Test'
  ClientHeight = 431
  ClientWidth = 834
  Color = clBtnFace
  Font.Charset = DEFAULT_CHARSET
  Font.Color = clWindowText
  Font.Height = -11
  Font.Name = 'Tahoma'
  Font.Style = []
  OldCreateOrder = False
  DesignSize = (
    834
    431)
  PixelsPerInch = 96
  TextHeight = 13
  object btn1: TButton
    Left = 751
    Top = 8
    Width = 75
    Height = 41
    Anchors = [akTop, akRight]
    Caption = #35299#26512
    TabOrder = 0
    OnClick = btn1Click
  end
  object lv1: TListView
    Left = 8
    Top = 55
    Width = 818
    Height = 368
    Anchors = [akLeft, akTop, akRight, akBottom]
    Columns = <
      item
        Caption = 'href'
        Width = 240
      end
      item
        Caption = 'Content'
        Width = 240
      end
      item
        Caption = 'Date'
        Width = 120
      end>
    ReadOnly = True
    TabOrder = 1
    ViewStyle = vsReport
  end
  object FileNameEdit: TEdit
    Left = 8
    Top = 16
    Width = 649
    Height = 33
    Anchors = [akLeft, akTop, akRight]
    TabOrder = 2
    Text = 'FileNameEdit'
  end
  object Button1: TButton
    Left = 663
    Top = 8
    Width = 75
    Height = 41
    Anchors = [akTop, akRight]
    Caption = #36873#25321
    TabOrder = 3
    OnClick = Button1Click
  end
  object OpenDialog1: TOpenDialog
    Left = 424
    Top = 144
  end
end

Delphi解析html取想要的内容_Text_02

看是不是很简单呀

举报

相关推荐

0 条评论