闲来无事,写了一个小程序来遍历百度应用的所有应用
2014-03-12 来自:紫夜月痕 0 人回应

首先观察到百度应用中所有的app信息所在的网页地址都是类似于这种格式的:
http://as.baidu.com/a/item?docid={appid}
其中appid是一个数字,好吧,闲来无事就写写程序玩玩吧
首先准备好一个网络请求的类,这也是笔者长期以来一直使用的类,就是用来发http请求,然后获取响应的类,代码如下:

package com.xiechanglei.resource;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * Small HTTP helper: sends a request through {@link URLConnection} and either
 * exposes the response as a reader, prints it, or stores it in a file.
 */
public class Resource {

    /** Charset used to form-encode POST fields and as the last-resort response charset. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Closes both streams and, after a failure, removes the partially written file.
     *
     * @param inputStream  stream to close, may be {@code null}
     * @param file         target file, deleted when {@code isException} is set; may be {@code null}
     * @param outputStream stream to close, may be {@code null}
     * @param isException  whether the transfer failed
     */
    private static void closeStream(InputStream inputStream, File file,
            OutputStream outputStream, boolean isException) {
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (outputStream != null) {
            try {
                outputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (isException && file != null && file.exists()) {
            file.delete();
        }
    }

    /**
     * Creates an empty file, creating missing parent directories first.
     * An existing directory is left untouched.
     *
     * @param file the file to create
     * @throws IOException if the file cannot be created
     */
    public static void createFile(File file) throws IOException {
        if (file.isDirectory()) {
            return;
        }
        // getParentFile() is null for a bare file name such as "data.bin".
        File parentFile = file.getParentFile();
        if (parentFile != null && !parentFile.exists()) {
            parentFile.mkdirs();
        }
        file.createNewFile();
    }

    /**
     * Sends a request and returns the response body as a reader, typically for
     * parsing a web page.
     *
     * @param uri     request address; GET parameters belong in the URI itself, e.g.
     *                http://yun.baidu.com/share/home?uk=3741363348&amp;view=share#category/type=0
     * @param params  POST fields, may be {@code null} or empty
     * @param charset charset used when the response does not declare a usable one;
     *                UTF-8 is used when this is {@code null}
     * @return a reader over the response body; the caller must close it
     * @throws IOException if the resource cannot be fetched
     */
    public static BufferedReader getBufferedReader(String uri, Map<String, String> params, String charset) throws IOException {
        URLConnection connection = sendRequest(uri, params);
        String encoding = resolveCharset(connection.getHeaderField("Content-Type"), charset);
        InputStream responseStream = connection.getInputStream();
        try {
            return new BufferedReader(new InputStreamReader(responseStream, encoding));
        } catch (IOException e) {
            // The reader was never created, so nobody else can release the stream.
            try {
                responseStream.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
            throw e;
        }
    }

    /**
     * Picks the charset for decoding a response: the one declared in the
     * Content-Type header when the JVM supports it, otherwise the fallback.
     * Content-Encoding is deliberately not consulted, because it names a
     * compression scheme (gzip, deflate), not a character set.
     */
    private static String resolveCharset(String contentType, String fallback) {
        String declared = parseCharsetParameter(contentType);
        if (declared != null && isSupportedCharset(declared)) {
            return declared;
        }
        return fallback != null ? fallback : DEFAULT_CHARSET;
    }

    /** Extracts the value of the {@code charset} parameter from a Content-Type header, or null. */
    private static String parseCharsetParameter(String contentType) {
        if (contentType == null) {
            return null;
        }
        String[] parts = contentType.split(";");
        for (int i = 0; i < parts.length; i++) {
            String part = parts[i].trim();
            int eq = part.indexOf('=');
            if (eq <= 0 || !part.substring(0, eq).trim().equalsIgnoreCase("charset")) {
                continue;
            }
            String value = part.substring(eq + 1).trim();
            if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                value = value.substring(1, value.length() - 1).trim();
            }
            return value.length() == 0 ? null : value;
        }
        return null;
    }

    /** Returns whether the JVM can decode the named charset; illegal names count as unsupported. */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            return false;
        }
    }

    /**
     * Prints the response body to standard output, line by line.
     *
     * @see #getBufferedReader(String, Map, String)
     * @param uri     request address
     * @param params  POST fields, may be {@code null} or empty
     * @param charset fallback charset for decoding the response
     * @throws IOException if the request or the read fails (the stack trace is printed first)
     */
    public static void printRequest(String uri, Map<String, String> params, String charset) throws IOException {
        BufferedReader reader = null;
        try {
            reader = getBufferedReader(uri, params, charset);
            String readline = null;
            while ((readline = reader.readLine()) != null) {
                System.out.println(readline);
            }
        } catch (IOException e) {
            e.printStackTrace();
            throw e;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Opens a connection to {@code uri} and, when parameters are given, writes
     * them as a form-encoded request body.
     *
     * @param uri    request address
     * @param params POST fields as raw (not yet URL-encoded) keys and values;
     *               may be {@code null} or empty, in which case no body is sent
     * @return the connection, ready for the response to be read
     * @throws MalformedURLException if {@code uri} is not a valid URL
     * @throws IOException           if the connection cannot be opened or written to
     */
    private static URLConnection sendRequest(String uri,
            Map<String, String> params) throws MalformedURLException,
            IOException {
        URL url = new URL(uri);
        URLConnection connection = url.openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0(compatible;MSIE 5.0;Windows NT;DigExt)");
        if (params != null && params.size() > 0) {
            connection.setDoOutput(true);
            StringBuilder body = new StringBuilder();
            for (Entry<String, String> entry : params.entrySet()) {
                if (body.length() > 0) {
                    body.append('&');
                }
                // Encode both sides so '&', '=', spaces and non-ASCII text survive the trip.
                body.append(URLEncoder.encode(String.valueOf(entry.getKey()), DEFAULT_CHARSET));
                body.append('=');
                body.append(URLEncoder.encode(String.valueOf(entry.getValue()), DEFAULT_CHARSET));
            }
            OutputStream outputStream = connection.getOutputStream();
            try {
                outputStream.write(body.toString().getBytes(DEFAULT_CHARSET));
                outputStream.flush();
            } finally {
                outputStream.close();
            }
        }
        return connection;
    }

    /**
     * Saves the response body to a file; mainly useful for downloads.
     * A failure during the transfer is reported by printing the stack trace
     * and deleting the partial file; it is not rethrown.
     *
     * @param uri      request address
     * @param params   POST fields, may be {@code null} or empty
     * @param filename path of the file to write
     * @throws IOException if the target file cannot be created
     */
    public static void storeUri(String uri, Map<String, String> params, String filename) throws IOException {
        InputStream inputStream = null;
        File file = new File(filename);
        if (!file.exists()) {
            createFile(file);
        }
        OutputStream outputStream = null;
        boolean isException = false;
        try {
            outputStream = new FileOutputStream(file);
            URLConnection connection = sendRequest(uri, params);
            inputStream = connection.getInputStream();
            byte[] temp = new byte[8 * 1024];
            int read;
            while ((read = inputStream.read(temp)) != -1) {
                outputStream.write(temp, 0, read);
            }
            outputStream.flush();
        } catch (IOException e) {
            e.printStackTrace();
            isException = true;
        } finally {
            closeStream(inputStream, file, outputStream, isException);
        }
    }

    /**
     * Convenience factory for a parameter map holding a single entry.
     *
     * @author xiechanglei
     * @param key   parameter name
     * @param value parameter value
     * @return a new mutable map containing only the given pair
     */
    public static Map<String, String> createParamMap(String key, String value) {
        Map<String, String> map = new HashMap<String, String>();
        map.put(key, value);
        return map;
    }
}

然后弄个oracle10g的jdbc驱动,因为要把取得的数据存起来嘛
然后开始写代码,
一个准备以及存储数据的类,就叫CatchStatus 吧,代码如下

package com.xiechanglei.resource.baiduapp;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Shared crawl state: hands out the next app id to fetch and persists the
 * scraped records through a single JDBC connection opened at class load.
 */
public class CatchStatus {
    /** Last app id handed out; seeded from {@code max(id)} of the table at startup. */
    public static int current = 0;

    /** Connection shared by all workers; stays {@code null} if initialization failed. */
    public static Connection connection;

    public static final String BASEURL = "http://as.baidu.com/a/item?docid=";

    // NOTE(review): connection settings and credentials are hard-coded here;
    // consider moving them to external configuration.
    private static final String url = "jdbc:oracle:thin:@127.0.0.1:1521:XE";
    private static final String username = "xiechanglei";
    private static final String password = "admin";
    private static final String driverName = "oracle.jdbc.OracleDriver";

    static {
        PreparedStatement statement = null;
        ResultSet resultSet = null;
        try {
            Class.forName(driverName);
            connection = DriverManager.getConnection(url, username, password);
            statement = connection.prepareStatement("select max(id) from baiduapps");
            resultSet = statement.executeQuery();
            if (resultSet.next()) {
                // getInt yields 0 for SQL NULL, i.e. when the table is still empty.
                current = resultSet.getInt(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the query resources but keep the shared connection open.
            close(resultSet, statement, null);
        }
    }

    /**
     * Returns the next app id to fetch.
     *
     * @return the incremented value of {@link #current}
     */
    public static synchronized int getCatchIndex() {
        return ++current;
    }

    /**
     * Inserts one scraped record into {@code baiduapps}.
     *
     * @param apptype         app category
     * @param appname         app name
     * @param size            app size as shown on the page
     * @param updatetime      update time as shown on the page
     * @param downloadaddress download URL
     * @param appid           app id, used as the primary key value
     * @throws SQLException if no connection is available or the insert fails
     */
    public static synchronized void store(String apptype, String appname, String size, String updatetime, String downloadaddress, int appid) throws SQLException {
        if (connection == null) {
            throw new SQLException("No database connection available; initialization failed");
        }
        PreparedStatement ps = connection.prepareStatement("insert into baiduapps(id,appname,appsize,updatetime,downloadaddress,apptype) values(?,?,?,?,?,?)");
        try {
            ps.setInt(1, appid);
            ps.setString(2, appname);
            ps.setString(3, size);
            ps.setString(4, updatetime);
            ps.setString(5, downloadaddress);
            ps.setString(6, apptype);
            ps.executeUpdate();
        } finally {
            // Close even when binding or executing fails, so statements do not pile up.
            close(null, ps, null);
        }
    }

    /**
     * Closes the given JDBC resources in result-set, statement, connection
     * order. Each argument may be {@code null}; a failure on one is printed
     * and does not prevent closing the others.
     *
     * @param rs   result set to close, may be {@code null}
     * @param st   statement to close, may be {@code null}
     * @param conn connection to close, may be {@code null}
     */
    public static void close(ResultSet rs, Statement st, Connection conn) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (st != null) {
            try {
                st.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}

一个分析网页请求的类,继承Thread
代码如下:

package com.xiechanglei.resource.baiduapp;

import java.io.BufferedReader;
import java.io.IOException;
import java.sql.SQLException;

import com.xiechanglei.resource.Resource;

/**
 * Worker thread that repeatedly takes the next app id from {@link CatchStatus},
 * downloads the matching page, scrapes the app details and stores them.
 */
public class BaiduAppCatcher extends Thread {

    // NOTE(review): the marker contains an oddly placed space ("conten tcontent");
    // verify it against the actual page markup before changing it.
    private static final String HEADER_MARKER = "article_conten tcontent_header middle_nav cls";

    /**
     * Fetches and processes pages until the thread is interrupted. A failure
     * on one page is printed and the loop moves on to the next id.
     */
    @Override
    public void run() {
        while (!isInterrupted()) {
            BufferedReader bufferedReader = null;
            try {
                int catchIndex = CatchStatus.getCatchIndex();
                System.out.println(catchIndex);
                bufferedReader = Resource.getBufferedReader(CatchStatus.BASEURL
                        + catchIndex, null, "utf-8");
                String readline = null;
                while ((readline = bufferedReader.readLine()) != null) {
                    if (readline.indexOf(HEADER_MARKER) != -1) {
                        beginCollectMessage(bufferedReader, catchIndex);
                        break;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (bufferedReader != null) {
                    try {
                        bufferedReader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    /**
     * Reads the app details that follow the header marker and stores them.
     * The fields are located by fixed line offsets and marker strings, in the
     * order type, name, size, update time, download link.
     *
     * @author xiechanglei
     * @param bufferedReader reader positioned just after the header marker line
     * @param appid          id of the app being scraped
     * @throws IOException  if reading fails or the page does not have the expected layout
     * @throws SQLException if the record cannot be stored
     */
    private void beginCollectMessage(BufferedReader bufferedReader, int appid)
            throws IOException, SQLException {
        String apptype = replaceAllTag(requireLine(trip(bufferedReader, 6), "app type")).trim();
        String appname = replaceAllTag(requireLine(trip(bufferedReader, 2), "app name")).trim();
        String size = replaceAllTag(
                requireLine(goOnStop("params-size", bufferedReader), "app size")).trim();
        String updatetime = replaceAllTag(
                requireLine(goOnStop("params-updatetime", bufferedReader), "update time")).trim();
        // The download link is on the line right after the "col-content" marker.
        requireLine(goOnStop("col-content", bufferedReader), "download section");
        String linkLine = requireLine(bufferedReader.readLine(), "download link");

        int attribute = linkLine.indexOf("data-download_url");
        if (attribute == -1) {
            throw new IOException("No data-download_url attribute found for app " + appid);
        }
        int openQuote = linkLine.indexOf("\"", attribute);
        int closeQuote = openQuote == -1 ? -1 : linkLine.indexOf("\"", openQuote + 1);
        if (closeQuote == -1) {
            throw new IOException("Malformed data-download_url attribute for app " + appid);
        }
        String downloadaddress = linkLine.substring(openQuote + 1, closeQuote);
        store(apptype, appname, size, updatetime, downloadaddress, appid);
    }

    /** Fails with a descriptive error when the page ended before the expected line. */
    private static String requireLine(String line, String what) throws IOException {
        if (line == null) {
            throw new IOException("Unexpected end of page while looking for " + what);
        }
        return line;
    }

    /**
     * Persists one scraped record via {@link CatchStatus}.
     *
     * @throws SQLException if the insert fails
     */
    public void store(String apptype, String appname, String size,
            String updatetime, String downloadaddress, int appid) throws SQLException {
        CatchStatus.store(apptype, appname, size, updatetime, downloadaddress, appid);
    }

    /**
     * Reads {@code size} lines and returns the last one read.
     *
     * @param reader source to read from
     * @param size   number of lines to consume
     * @return the last line read, or {@code null} if {@code size} is not
     *         positive or the input ended first
     * @throws IOException if reading fails
     */
    public String trip(BufferedReader reader, int size) throws IOException {
        String readLine = null;
        for (int i = 0; i < size; i++) {
            readLine = reader.readLine();
        }
        return readLine;
    }

    /**
     * Reads forward until a line containing {@code reg} is found.
     *
     * @param reg    plain substring to look for (not a regular expression)
     * @param reader source to read from
     * @return the first matching line, or {@code null} if the input ended first
     * @throws IOException if reading fails
     */
    public String goOnStop(String reg, BufferedReader reader)
            throws IOException {
        String readline = null;
        while ((readline = reader.readLine()) != null) {
            if (readline.indexOf(reg) != -1) {
                break;
            }
        }
        return readline;
    }

    /**
     * Removes every {@code <...>} span from the text. A {@code '>'} that
     * appears before the first {@code '<'} is ordinary text and does not stop
     * later tags from being removed; an unclosed {@code '<'} is kept as is.
     *
     * @param str text to strip, must not be {@code null}
     * @return the text without tags
     */
    public String replaceAllTag(String str) {
        StringBuilder text = new StringBuilder(str.length());
        int position = 0;
        while (true) {
            int open = str.indexOf('<', position);
            if (open == -1) {
                break;
            }
            int close = str.indexOf('>', open + 1);
            if (close == -1) {
                break;
            }
            text.append(str, position, open);
            position = close + 1;
        }
        text.append(str, position, str.length());
        return text.toString();
    }
}

最后多线程开跑

package com.xiechanglei.resource.baiduapp;

/**
 * Entry point: starts the crawler worker threads.
 */
public class Manager {

    /** Number of workers started when no valid count is given on the command line. */
    private static final int DEFAULT_THREAD_COUNT = 200;

    /**
     * Starts the workers.
     *
     * @param args optional; {@code args[0]} may hold a positive worker count,
     *             otherwise {@value #DEFAULT_THREAD_COUNT} workers are started
     */
    public static void main(String[] args) {
        int threadCount = DEFAULT_THREAD_COUNT;
        if (args != null && args.length > 0) {
            try {
                int requested = Integer.parseInt(args[0].trim());
                if (requested > 0) {
                    threadCount = requested;
                } else {
                    System.err.println("Thread count must be positive, using " + DEFAULT_THREAD_COUNT);
                }
            } catch (NumberFormatException e) {
                System.err.println("Invalid thread count '" + args[0] + "', using " + DEFAULT_THREAD_COUNT);
            }
        }
        for (int i = 0; i < threadCount; i++) {
            new BaiduAppCatcher().start();
        }
    }
}

下面是数据库查询结果

还没有人回应!

您的回应

你还未登录,不能回应!登录