话不多说,直接给出实现过程。若需要通过代理访问外网,请先设置代理:
// // 设置代理上外网
// System.getProperties().put("proxySet", "true");
// System.getProperties().put("proxyHost", "172.31.170.14");
// System.getProperties().put("proxyPort", "8080");
View Code
package com.chen.downMessage;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import com.chen.system.util.ChenFile;
import com.chen.system.util.ChenTools;
/**
 * Simple screen scraper that walks a three-level site: an index page that
 * links to category pages, category pages that link to article pages, and
 * the article pages themselves. Every article is written to a text file
 * below {@link #saveRoot}, one sub-directory per category, and one log line
 * per article is appended to a log file in the same root.
 *
 * <p>Pages are located purely by literal marker strings ("tags"); no HTML
 * parser is involved. Instances are not thread-safe: {@link #doMain(String)}
 * temporarily changes {@link #saveRoot} while it runs.
 */
public class DownMessage
{
    /** Encoding used to decode downloaded pages and to write every output file. */
    private static final String CHARSET = "gbk";

    /** Separator appended after each extracted section and substituted for {@code <br>}. */
    private static final String LINE_BREAK = " \r\n";

    /** Characters that must not appear in a single file or directory name. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    // Markers used on an article page (title, author, body).
    private static final String ARTICLE_TITLE_START = "<B>";
    private static final String ARTICLE_TITLE_END = "</B>";
    private static final String ARTICLE_AUTHOR_START = "</B><BR>";
    private static final String ARTICLE_AUTHOR_END = "</a><br>";
    private static final String ARTICLE_CONTENT_START = "<TD CLASS=ART>";
    private static final String ARTICLE_CONTENT_END = "</TD>";

    // Markers used on a category page to find the article links.
    private static final String CATEGORY_AREA_START = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding=\"0\" cellspacing=\"0\" border=\"0\">";
    private static final String CATEGORY_AREA_END = "</TABLE>";
    private static final String CATEGORY_HREF_START = "<a href='";
    private static final String CATEGORY_HREF_END = "' >";
    private static final String CATEGORY_ENTRY_START = "<BR>";
    private static final String CATEGORY_ENTRY_END = "</A>";

    // Markers used on the index page to find the category links.
    private static final String INDEX_AREA_START = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding='0' cellspacing='0'>";
    private static final String INDEX_AREA_END = "</TABLE>";
    private static final String INDEX_HREF_START = "<a href='";
    private static final String INDEX_HREF_END = "'";
    private static final String INDEX_ENTRY_START = "<TR>";
    private static final String INDEX_ENTRY_END = "</TR>";

    /** Directory below which downloaded files (and the log file) are stored. */
    public String saveRoot = "F:" + File.separator + "download" + File.separator;

    /** Base URL that every relative link found on the site is appended to. */
    private String httpUrl0 = "http://www.yi-see.com/";

    /** Creates a downloader with the default save directory and base URL. */
    public DownMessage()
    {
    }

    /**
     * Creates a downloader with a custom save directory and base URL.
     *
     * @param saveRoot directory below which files are stored
     * @param httpUrl0 base URL of the site; {@code null} or an empty string
     *                 keeps the built-in default
     */
    public DownMessage(String saveRoot, String httpUrl0)
    {
        this.saveRoot = saveRoot;
        // The original assigned the field to itself, so the second argument
        // never had any effect. Only override the default for a usable value.
        if (httpUrl0 != null && httpUrl0.length() > 0)
        {
            this.httpUrl0 = httpUrl0;
        }
    }

    /**
     * Downloads a listing page and extracts the links it contains.
     *
     * <p>The listing area is the text between the first {@code areaStartTag}
     * and the next {@code areaEndTag}. That area is cut into entries at every
     * occurrence of {@code saveFileTitleStartTag} (taken literally, not as a
     * regular expression). An entry yields a result when it contains
     * {@code hrefStartTag}, a following {@code hrefEndTag} and
     * {@code saveFileTitleEndTag}.
     *
     * @author chen_weixian
     * @param parentUrl             absolute URL of the listing page
     * @param areaStartTag          marker that precedes the listing area
     * @param areaEndTag            marker that terminates the listing area
     * @param hrefStartTag          marker directly in front of a link target
     * @param hrefEndTag            marker directly behind a link target
     * @param saveFileTitleStartTag marker that separates two entries
     * @param saveFileTitleEndTag   marker that must occur inside an entry
     * @return one {@code String[2]} per entry: index 0 is the link target,
     *         index 1 is the entry text after {@code ChenTools.clearHtml};
     *         empty (never {@code null}) when the listing area is not found
     * @throws Exception if the page cannot be downloaded
     */
    public List<String[]> downLoadMessage(String parentUrl, String areaStartTag, String areaEndTag, String hrefStartTag, String hrefEndTag, String saveFileTitleStartTag, String saveFileTitleEndTag) throws Exception
    {
        List<String[]> resultList = new ArrayList<String[]>();
        String htmlContent = this.getContentByUrl(parentUrl);

        // Locate the listing area; a page without it simply has no links.
        int areaStart = htmlContent.indexOf(areaStartTag);
        if (areaStart < 0)
        {
            System.err.println("listing area start not found in " + parentUrl);
            return resultList;
        }
        areaStart += areaStartTag.length();
        int areaEnd = htmlContent.indexOf(areaEndTag, areaStart);
        if (areaEnd < 0)
        {
            System.err.println("listing area end not found in " + parentUrl);
            return resultList;
        }
        String area = htmlContent.substring(areaStart, areaEnd);

        // String.split takes a regular expression, so the marker is quoted.
        String[] entries = area.split(Pattern.quote(saveFileTitleStartTag));
        for (int i = 0; i < entries.length; i++)
        {
            String entry = entries[i];
            // Kept from the original: an entry is ignored when either end
            // marker is absent or sits at the very beginning of the entry.
            if (entry.indexOf(hrefEndTag) <= 0 || entry.indexOf(saveFileTitleEndTag) <= 0)
            {
                continue;
            }
            int hrefStart = entry.indexOf(hrefStartTag);
            if (hrefStart < 0)
            {
                continue;
            }
            hrefStart += hrefStartTag.length();
            int hrefEnd = entry.indexOf(hrefEndTag, hrefStart);
            if (hrefEnd < 0)
            {
                continue;
            }
            try
            {
                String[] link = new String[2];
                link[0] = entry.substring(hrefStart, hrefEnd);
                link[1] = ChenTools.clearHtml(entry);
                resultList.add(link);
            }
            catch (Exception e)
            {
                // Best effort: one malformed entry must not lose the others.
                System.err.println(i + " skipped entry of " + parentUrl + ": " + e);
            }
        }
        return resultList;
    }

    /**
     * Downloads an article page and extracts its title, author and body.
     *
     * <p>Each section runs from its start tag (inclusive) to the next
     * occurrence of its end tag (exclusive) and is followed by a line break.
     * A section whose start or end tag is empty according to
     * {@code ChenTools.isEmpty} is left out. Literal {@code <br>} is turned
     * into a line break and the result is passed through
     * {@code ChenTools.clearHtml} before it is returned.
     *
     * @author chen_weixian
     * @param urlString       absolute URL of the article page
     * @param titleStartTag   marker at which the title starts
     * @param titleEndTag     marker that ends the title
     * @param authorStartTag  marker at which the author line starts
     * @param authorEndTag    marker that ends the author line
     * @param contentStartTag marker at which the body starts
     * @param contentEndTag   marker that ends the body
     * @return the extracted text; built from an empty string when the page is empty
     * @throws StringIndexOutOfBoundsException if a requested tag is missing from the page
     * @throws Exception if the page cannot be downloaded
     */
    public String downStart(String urlString, String titleStartTag, String titleEndTag, String authorStartTag, String authorEndTag, String contentStartTag, String contentEndTag) throws Exception
    {
        String result = "";
        String htmlContent = this.getContentByUrl(urlString);
        if (htmlContent.length() > 0)
        {
            StringBuilder sections = new StringBuilder(100);
            // Title
            if (!ChenTools.isEmpty(titleStartTag) && !ChenTools.isEmpty(titleEndTag))
            {
                sections.append(sectionFrom(htmlContent, titleStartTag, titleEndTag)).append(LINE_BREAK);
            }
            // Author
            if (!ChenTools.isEmpty(authorStartTag) && !ChenTools.isEmpty(authorEndTag))
            {
                sections.append(sectionFrom(htmlContent, authorStartTag, authorEndTag)).append(LINE_BREAK);
            }
            // Body
            if (!ChenTools.isEmpty(contentStartTag) && !ChenTools.isEmpty(contentEndTag))
            {
                sections.append(sectionFrom(htmlContent, contentStartTag, contentEndTag)).append(LINE_BREAK);
            }
            // Literal replacement; no regular expression is needed here.
            result = sections.toString().replace("<br>", LINE_BREAK);
        }
        // Remove the remaining markup.
        return ChenTools.clearHtml(result);
    }

    /**
     * Returns the part of {@code html} that starts at the first
     * {@code startTag} (the tag itself included) and ends right before the
     * next {@code endTag}.
     *
     * <p>The exception type is the one the unchecked {@code substring} calls
     * of the original code produced, so existing handlers keep working; the
     * message now names the missing tag.
     */
    private static String sectionFrom(String html, String startTag, String endTag)
    {
        int start = html.indexOf(startTag);
        if (start < 0)
        {
            throw new StringIndexOutOfBoundsException("start tag not found: " + startTag);
        }
        int end = html.indexOf(endTag, start);
        if (end < 0)
        {
            throw new StringIndexOutOfBoundsException("end tag not found after start tag: " + endTag);
        }
        return html.substring(start, end);
    }

    /**
     * Downloads a page and returns its text decoded as GBK.
     *
     * <p>Line terminators are dropped: all lines are concatenated without a
     * separator, so markers can be matched regardless of how the server
     * wrapped the markup.
     *
     * @author chen_weixian
     * @param urlString absolute HTTP URL of the page
     * @return the page text without line terminators
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getContentByUrl(String urlString) throws Exception
    {
        StringBuilder htmlContent = new StringBuilder();
        URL urlObj = new URL(urlString);
        HttpURLConnection httpcon = (HttpURLConnection) urlObj.openConnection();
        // Decode the bytes as GBK directly. Decoding with the platform
        // charset first and re-encoding afterwards destroys the text on any
        // platform whose default charset is not byte-transparent (e.g. UTF-8).
        BufferedReader reader = new BufferedReader(new InputStreamReader(httpcon.getInputStream(), CHARSET));
        try
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                htmlContent.append(line);
            }
        }
        finally
        {
            // Release the stream even when reading fails half-way.
            reader.close();
        }
        return htmlContent.toString();
    }

    /**
     * Writes an article to {@code <saveRoot>/<title>.txt} using GBK.
     *
     * <p>Characters that are illegal in a file name (including path
     * separators) are replaced by {@code _}, because the title comes from a
     * downloaded page and must not be able to redirect the output path.
     *
     * @author chen_weixian
     * @param contentString text to write
     * @param title         article title, used as the file name
     * @throws IOException if the file cannot be created or written
     */
    public void saveFile(String contentString, String title) throws IOException
    {
        String savepath = new File(saveRoot, toFileName(title) + ".txt").getPath();
        // presumably creates the file (and its directories) when missing - confirm in ChenFile
        ChenFile.unExitCreate(savepath);
        // the last argument is presumably the "append" flag - confirm in ChenFile
        ChenFile.WriteToFile(savepath, contentString, CHARSET, true);
    }

    /** Replaces every character that is illegal in a file name by an underscore. */
    private static String toFileName(String name)
    {
        return ILLEGAL_FILE_NAME_CHARS.matcher(String.valueOf(name)).replaceAll("_");
    }

    /**
     * Crawls the site starting at an index page and stores every article.
     *
     * <p>The index page yields the category links, every category page
     * yields the article links, and every article is saved below
     * {@code <saveRoot>/<category title>}. Progress and per-article failures
     * are printed and appended to a log file in {@link #saveRoot}. A category
     * whose listing cannot be loaded is logged and skipped. Any other failure
     * ends the crawl and is printed as a stack trace. {@link #saveRoot} is
     * restored to its initial value before the method returns.
     *
     * @param parentUrl location of the index page, relative to the base URL
     */
    public void doMain(String parentUrl)
    {
        // Remember the configured root: saveRoot is redirected per category below.
        String baseRoot = this.saveRoot;
        // Log file
        String logFile = new File(baseRoot, ChenTools.getCurrDatetime(2) + "下载日志.log").getPath();
        try
        {
            // Category links from the index page
            List<String[]> firstList = this.downLoadMessage(this.httpUrl0 + parentUrl, INDEX_AREA_START, INDEX_AREA_END, INDEX_HREF_START, INDEX_HREF_END, INDEX_ENTRY_START, INDEX_ENTRY_END);
            if (firstList != null && firstList.size() > 0)
            {
                System.out.println("firstList.size()=" + firstList.size());
                for (int i = 0; i < firstList.size(); i++)
                {
                    String[] category = firstList.get(i);
                    // Every category gets its own directory below the root.
                    this.saveRoot = new File(baseRoot, toFileName(category[1])).getPath();
                    this.downloadCategory(i, category, logFile);
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            this.saveRoot = baseRoot;
        }
    }

    /** Downloads and stores all articles linked from one category page. */
    private void downloadCategory(int categoryIndex, String[] category, String logFile) throws Exception
    {
        List<String[]> lastList;
        try
        {
            // Article links from the category page
            lastList = this.downLoadMessage(this.httpUrl0 + category[0], CATEGORY_AREA_START, CATEGORY_AREA_END, CATEGORY_HREF_START, CATEGORY_HREF_END, CATEGORY_ENTRY_START, CATEGORY_ENTRY_END);
        }
        catch (Exception e)
        {
            // One unreachable category must not abort the remaining ones.
            String message = categoryIndex + "\t" + "信息异常:\t" + category[1] + "\n" + e;
            ChenFile.WriteToFile(logFile, message, CHARSET, true);
            System.out.println(message);
            return;
        }
        if (lastList == null || lastList.size() == 0)
        {
            return;
        }
        System.out.println("lastList.size()=" + lastList.size());
        for (int j = 0; j < lastList.size(); j++)
        {
            try
            {
                String[] article = lastList.get(j);
                String contentString = this.downStart(this.httpUrl0 + article[0], ARTICLE_TITLE_START, ARTICLE_TITLE_END, ARTICLE_AUTHOR_START, ARTICLE_AUTHOR_END, ARTICLE_CONTENT_START, ARTICLE_CONTENT_END);
                this.saveFile(contentString, article[1]);
                String message = categoryIndex + "\t" + category[1] + "\t" + j + "\t" + article[1] + "\t" + article[0];
                ChenFile.WriteToFile(logFile, message, CHARSET, true);
                System.out.println(message);
            }
            catch (Exception e)
            {
                // A failing article is logged; the crawl continues with the next one.
                String message = categoryIndex + "\t" + "信息异常:\t" + category[1] + "\t" + j + "\n" + e;
                ChenFile.WriteToFile(logFile, message, CHARSET, true);
                System.out.println(message);
            }
        }
    }

    /**
     * Command-line entry point: crawls the site starting at {@code artlist_3.html}.
     *
     * @author chen_weixian
     * @param args unused
     */
    public static void main(String[] args)
    {
        DownMessage downMessage = new DownMessage();
        String parentUrl = "artlist_3.html";
        downMessage.doMain(parentUrl);
    }
}