0
点赞
收藏
分享

微信扫一扫

正在看的故事,下载


View Code

package com.chen.Test;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Downloads story text from a simple HTML site by cutting pages apart with literal start/end
 * marker strings, and saves each story as a GBK-encoded text file.
 *
 * <p>Pages are fetched with {@link #getContentByUrl(String)}, link lists are extracted with
 * {@link #downLoadMessage}, story pages are reduced to plain text with {@link #downStart}, and
 * results are written below {@link #saveRoot} by {@link #saveFile(String, String)}.
 *
 * @author chen_weixian
 */
public class NetDetailHtml
{
    /** Character set used to decode downloaded pages and to encode saved files. */
    private static final String CHARSET = "gbk";

    /** Separator appended after each extracted section and substituted for four-space runs. */
    private static final String LINE_BREAK = " \r\n";

    /** Matches any HTML tag; compiled once because it is applied to every page and every title. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

    /** Characters that must not reach the file system as part of a file or directory name. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /** Directory that {@link #saveFile(String, String)} writes into. */
    public String saveRoot = "D:" + File.separator + "download" + File.separator;

    /** Base URL that the relative links found on the site are appended to. */
    private String httpUrl0 = "http://www.yi-see.com/";

    /** Creates a downloader that uses the built-in save directory and base URL. */
    public NetDetailHtml()
    {
    }

    /**
     * Creates a downloader with an explicit save directory and base URL.
     *
     * @param saveRoot directory that {@link #saveFile(String, String)} writes into
     * @param httpUrl0 base URL that relative links are appended to
     */
    public NetDetailHtml(String saveRoot, String httpUrl0)
    {
        this.saveRoot = saveRoot;
        this.httpUrl0 = httpUrl0;
    }

    /**
     * Extracts the links listed in one area of a page.
     *
     * <p>The page is cut down to the text between {@code areaStartTag} and the next
     * {@code areaEndTag}; that area is split into entries at every literal occurrence of
     * {@code saveFileTitleStartTag}. An entry yields a link only when it contains
     * {@code hrefStartTag}, a following {@code hrefEndTag}, and {@code saveFileTitleEndTag};
     * an entry that begins with one of the two end markers has nothing in front of it and is
     * skipped as well. All markers are matched literally, never as regular expressions.
     *
     * @author chen_weixian
     * @param parentUrl complete URL of the page to read
     * @param areaStartTag marker that precedes the link area
     * @param areaEndTag marker that ends the link area
     * @param hrefStartTag marker directly in front of a link target
     * @param hrefEndTag marker directly behind a link target
     * @param saveFileTitleStartTag marker that separates the entries of the area
     * @param saveFileTitleEndTag marker that must be present in an entry for it to count
     * @return one {@code String[2]} per link, holding the link target at index 0 and the entry
     *         text with all HTML tags removed at index 1; empty when the area markers are not
     *         found, never {@code null}
     * @throws Exception if the page cannot be downloaded
     */
    public List<String[]> downLoadMessage(String parentUrl, String areaStartTag, String areaEndTag, String hrefStartTag, String hrefEndTag, String saveFileTitleStartTag, String saveFileTitleEndTag)
            throws Exception
    {
        List<String[]> resultList = new ArrayList<String[]>();
        String htmlContent = this.getContentByUrl(parentUrl);

        // A page without the expected area simply has no links; report that as an empty list
        // instead of slicing at index -1.
        int areaStart = htmlContent.indexOf(areaStartTag);
        if (areaStart < 0)
        {
            return resultList;
        }
        areaStart += areaStartTag.length();
        int areaEnd = htmlContent.indexOf(areaEndTag, areaStart);
        if (areaEnd < 0)
        {
            return resultList;
        }
        String area = htmlContent.substring(areaStart, areaEnd);

        // String.split takes a regular expression; quote the marker so that tags containing
        // characters such as ( ) ? | are still treated as plain text.
        String[] entries = area.split(Pattern.quote(saveFileTitleStartTag));
        for (int i = 0; i < entries.length; i++)
        {
            String entry = entries[i];
            if (entry.indexOf(hrefEndTag) <= 0 || entry.indexOf(saveFileTitleEndTag) <= 0)
            {
                continue;
            }
            int hrefStart = entry.indexOf(hrefStartTag);
            if (hrefStart < 0)
            {
                continue;
            }
            int urlStart = hrefStart + hrefStartTag.length();
            int urlEnd = entry.indexOf(hrefEndTag, urlStart);
            if (urlEnd < 0)
            {
                continue;
            }
            resultList.add(new String[] { entry.substring(urlStart, urlEnd), this.clearHtml(entry) });
        }
        return resultList;
    }

    /**
     * Downloads one page and reduces it to the plain text of its title, author and content
     * sections.
     *
     * <p>Each section runs from its start marker up to the next occurrence of its end marker.
     * A section whose start or end marker is {@code null} or blank is left out. Every section
     * is followed by {@code " \r\n"}, every run of four spaces is replaced by the same
     * separator, and finally all HTML tags are removed.
     *
     * @author chen_weixian
     * @param urlString complete URL of the page to read
     * @param titleStartTag marker that starts the title section
     * @param titleEndTag marker that ends the title section
     * @param authorSt1artTag marker that starts the author section
     * @param authorEndTag marker that ends the author section
     * @param contentStartTag marker that starts the content section
     * @param contentEndTag marker that ends the content section
     * @return the extracted plain text; empty when the page itself is empty
     * @throws StringIndexOutOfBoundsException if a requested marker does not occur in the page
     * @throws Exception if the page cannot be downloaded
     */
    public String downStart(String urlString, String titleStartTag, String titleEndTag, String authorSt1artTag, String authorEndTag, String contentStartTag, String contentEndTag) throws Exception
    {
        String htmlContent = this.getContentByUrl(urlString);
        if (htmlContent.length() == 0)
        {
            return "";
        }

        StringBuilder text = new StringBuilder(htmlContent.length());
        this.appendSection(text, htmlContent, titleStartTag, titleEndTag);
        this.appendSection(text, htmlContent, authorSt1artTag, authorEndTag);
        this.appendSection(text, htmlContent, contentStartTag, contentEndTag);

        return this.clearHtml(text.toString().replace("    ", LINE_BREAK));
    }

    /**
     * Appends the part of {@code html} from {@code startTag} up to the next {@code endTag},
     * followed by the line separator; does nothing when either marker is blank.
     */
    private void appendSection(StringBuilder target, String html, String startTag, String endTag)
    {
        if (this.isEmpty(startTag) || this.isEmpty(endTag))
        {
            return;
        }
        int start = html.indexOf(startTag);
        if (start < 0)
        {
            // Same exception type that slicing at -1 produced before, but with a message that
            // names the marker that is missing.
            throw new StringIndexOutOfBoundsException("start marker not found in page: " + startTag);
        }
        int end = html.indexOf(endTag, start);
        if (end < 0)
        {
            throw new StringIndexOutOfBoundsException("end marker not found in page: " + endTag);
        }
        target.append(html, start, end).append(LINE_BREAK);
    }

    /**
     * Downloads a page and returns its text with all line terminators removed.
     *
     * <p>The response is decoded as GBK directly from the byte stream, so the result does not
     * depend on the platform default character set.
     *
     * @author chen_weixian
     * @param urlString complete URL of the page to read
     * @return the page text, lines joined without separators
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getContentByUrl(String urlString) throws Exception
    {
        applyDefaultProxy();

        URL urlObj = new URL(urlString);
        HttpURLConnection httpcon = (HttpURLConnection) urlObj.openConnection();
        BufferedReader reader = new BufferedReader(new InputStreamReader(httpcon.getInputStream(), CHARSET));
        try
        {
            StringBuilder htmlContent = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                htmlContent.append(line);
            }
            return htmlContent.toString();
        }
        finally
        {
            // Close even when reading fails so the connection's stream is not leaked.
            reader.close();
        }
    }

    /**
     * Sets the {@code proxySet}, {@code proxyHost} and {@code proxyPort} system properties to
     * the built-in proxy, unless {@code proxyHost} has already been defined (for example with
     * {@code -DproxyHost=...}), in which case the existing settings are left alone.
     */
    private static void applyDefaultProxy()
    {
        if (System.getProperty("proxyHost") == null)
        {
            System.setProperty("proxySet", "true");
            System.setProperty("proxyHost", "pascproxy1.pasc.com.cn");
            System.setProperty("proxyPort", "8080");
        }
    }

    /**
     * Saves text as {@code <title>.txt} inside {@link #saveRoot}, encoded as GBK.
     *
     * <p>Titles come from downloaded pages, so surrounding whitespace is trimmed and characters
     * that are path separators or are illegal in file names are replaced by {@code _}.
     *
     * @author chen_weixian
     * @param contentString text to write
     * @param title name of the file without extension
     * @throws IOException if the file cannot be written
     */
    public void saveFile(String contentString, String title) throws IOException
    {
        String savepath = saveRoot + File.separator + this.toSafeFileName(title) + ".txt";
        this.writeToFile(savepath, contentString, CHARSET);
    }

    /**
     * Downloads a two-level site: {@code parentUrl} lists categories, each category page lists
     * stories, and each story is saved into a directory named after its category below the
     * configured {@link #saveRoot}.
     *
     * <p>{@link #saveRoot} is redirected to the category directory while a category is being
     * processed and is set back to its configured value before this method returns. Failures
     * are printed and do not stop the remaining categories or stories.
     *
     * @param parentUrl location of the category list, relative to the base URL
     */
    public void doMain(String parentUrl)
    {
        // Markers of a story page: title start/end, author start/end, content start/end.
        String[] detailTags = { "<B>", "</B>", "</B><BR>", "</a><br>", "<TD CLASS=ART>", "</TD>" };
        // Markers of a category page (list of stories).
        String areaStartTag2 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding=\"0\" cellspacing=\"0\" border=\"0\">";
        String areaEndTag2 = "</TABLE>";
        String hrefStartTag2 = "<a href='";
        String hrefEndTag2 = "' >";
        String saveFileTitleStartTag2 = "<BR>";
        String saveFileTitleEndTag2 = "</A>";
        // Markers of the top-level page (list of categories).
        String areaStartTag3 = "<TABLE WIDTH=900px ALIGN=CENTER cellpadding='0' cellspacing='0'>";
        String areaEndTag3 = "</TABLE>";
        String hrefStartTag3 = "<a href='";
        String hrefEndTag3 = "'";
        String saveFileTitleStartTag3 = "<TR>";
        String saveFileTitleEndTag3 = "</TR>";

        // Remember the configured root: category directories are derived from it and it is
        // restored afterwards so repeated calls do not nest directories.
        final String baseRoot = this.saveRoot;
        try
        {
            List<String[]> firstList = this.downLoadMessage(this.httpUrl0 + parentUrl, areaStartTag3, areaEndTag3, hrefStartTag3, hrefEndTag3, saveFileTitleStartTag3, saveFileTitleEndTag3);
            if (firstList == null || firstList.isEmpty())
            {
                return;
            }
            System.out.println("firstList.size()=" + firstList.size());
            for (int i = 0; i < firstList.size(); i++)
            {
                String[] category = firstList.get(i);
                this.saveRoot = new File(baseRoot, this.toSafeFileName(category[1])).getPath();

                List<String[]> lastList;
                try
                {
                    lastList = this.downLoadMessage(this.httpUrl0 + category[0], areaStartTag2, areaEndTag2, hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
                }
                catch (Exception e)
                {
                    // One unreadable category page must not abort the remaining categories.
                    System.out.println(i + "\t" + "信息异常:\t" + category[1] + "\n" + e);
                    continue;
                }
                this.downloadStories(lastList, category[1], i + "\t", false, detailTags);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            this.saveRoot = baseRoot;
        }
    }

    /**
     * Downloads a single book: {@code parentUrl} lists the chapters, and each chapter is saved
     * with its list position prefixed to the file name.
     *
     * <p>{@link #saveRoot} is redirected to the book directory while the download runs and is
     * set back to its previous value before this method returns. Failures are printed and do
     * not stop the remaining chapters.
     *
     * @param parentUrl location of the chapter list, relative to the base URL
     */
    public void doMain2(String parentUrl)
    {
        // Markers of a chapter page: title start/end, author start/end, content start/end.
        String[] detailTags = {
            "<span id=\"htmltimu\">", "</span>",
            "</span> <span>", "</a></span>",
            "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" align=\"center\" >", "<div class=\"button_con\">"
        };
        // Markers of the chapter list.
        String areaStartTag2 = "<div id=\"htmlList\" class=\"insert_list\">";
        String areaEndTag2 = "</div>";
        String hrefStartTag2 = "<strong><a href=\"";
        String hrefEndTag2 = "\">";
        String saveFileTitleStartTag2 = "<li>";
        String saveFileTitleEndTag2 = "</a>";

        String bookTitle = "官道之色戒";
        final String previousRoot = this.saveRoot;
        try
        {
            // NOTE(review): the target directory and the book title are hard-coded and the
            // configured saveRoot is not used here; kept as is so existing runs keep writing
            // to the same place - confirm whether this should follow saveRoot instead.
            this.saveRoot = "D:" + File.separator + "story" + File.separator + bookTitle;
            List<String[]> lastList = this.downLoadMessage(this.httpUrl0 + parentUrl, areaStartTag2, areaEndTag2, hrefStartTag2, hrefEndTag2, saveFileTitleStartTag2, saveFileTitleEndTag2);
            this.downloadStories(lastList, bookTitle, "", true, detailTags);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            this.saveRoot = previousRoot;
        }
    }

    /**
     * Downloads and saves every linked page of one group, printing one line per page.
     *
     * @param stories link/title pairs as returned by {@link #downLoadMessage}
     * @param groupTitle title of the category or book, used in the printed lines
     * @param logPrefix text put in front of every printed line
     * @param numberFiles whether the list position is prefixed to each file name
     * @param detailTags title start/end, author start/end and content start/end markers
     */
    private void downloadStories(List<String[]> stories, String groupTitle, String logPrefix, boolean numberFiles, String[] detailTags)
    {
        if (stories == null || stories.isEmpty())
        {
            return;
        }
        System.out.println("lastList.size()=" + stories.size());
        for (int j = 0; j < stories.size(); j++)
        {
            try
            {
                String[] story = stories.get(j);
                String contentString = this.downStart(this.httpUrl0 + story[0], detailTags[0], detailTags[1], detailTags[2], detailTags[3], detailTags[4], detailTags[5]);
                this.saveFile(contentString, numberFiles ? j + story[1] : story[1]);
                System.out.println(logPrefix + groupTitle + "\t" + j + "\t" + story[1] + "\t" + story[0]);
            }
            catch (Exception e)
            {
                // A single broken page is reported and skipped; the rest of the list continues.
                System.out.println(logPrefix + "信息异常:\t" + groupTitle + "\t" + j + "\n" + e);
            }
        }
    }

    /**
     * Writes text to a file in the given character set, creating missing parent directories
     * and replacing any existing file.
     */
    private void writeToFile(String savepath, String contentString, String code) throws IOException
    {
        File file = new File(savepath);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists())
        {
            parent.mkdirs();
        }

        FileOutputStream out = new FileOutputStream(file);
        try
        {
            // Let the writer do the encoding; converting to bytes and back through the platform
            // character set would corrupt every character the platform set cannot represent.
            OutputStreamWriter writer = new OutputStreamWriter(out, code);
            writer.write(contentString);
            writer.flush();
        }
        finally
        {
            out.close();
        }
    }

    /** Trims a scraped title and replaces characters that are unsafe in file names by {@code _}. */
    private String toSafeFileName(String name)
    {
        return ILLEGAL_FILE_NAME_CHARS.matcher(String.valueOf(name).trim()).replaceAll("_");
    }

    /**
     * Tells whether a string is {@code null} or contains nothing but whitespace.
     *
     * @param str string to check
     * @return {@code true} when there is no visible content
     */
    private boolean isEmpty(String str)
    {
        return str == null || str.trim().length() == 0;
    }

    /** Removes everything that looks like an HTML tag ({@code <...>}) from the text. */
    private String clearHtml(String htmlString)
    {
        return HTML_TAG.matcher(htmlString).replaceAll("");
    }

    /**
     * Downloads the book whose chapter list is at {@code index.html} below the base URL.
     *
     * @author chen_weixian
     * @param args not used
     */
    public static void main(String[] args)
    {
        String saveRoot = "D:" + File.separator + "download" + File.separator;
        String httpUrl0 = "http://www.chkee.com/chkbook/0/554/";

        NetDetailHtml downMessage = new NetDetailHtml(saveRoot, httpUrl0);
        String parentUrl = "index.html";
        downMessage.doMain2(parentUrl);
    }

}

 

举报

相关推荐

0 条评论