1.什么是WebMagic
WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。
http://webmagic.io/
特性:
- 简单的API,可快速上手
- 模块化的结构,可轻松扩展
- 提供多线程和分布式支持
2.使用案例:
1.添加maven依赖
<!-- Maven dependencies for the crawler example below. -->
<dependencies>
<!-- WebMagic core module. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<!-- WebMagic extension module. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>
<!-- MySQL JDBC driver; DBHelper loads com.mysql.jdbc.Driver from it. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.42</version>
</dependency>
<!-- Oracle JDBC driver artifact. NOTE(review): none of the example code in this document references Oracle; confirm this dependency is actually needed. -->
<dependency>
<groupId>com.oracle</groupId>
<artifactId>ojdbc6</artifactId>
<version>1.0.0</version>
</dependency>
</dependencies>
2.核心类的写法
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * WebMagic page processor that reads one {@code top_rank} table row from the crawled page
 * (id, rank, team name and score cells) and stores it through {@link BlogDao}.
 *
 * <p>A single instance is handed to the {@code Spider} in {@link #main(String[])}, which is
 * started with 5 threads, so shared state in this class must be updated safely.
 */
public class FootProcessor implements PageProcessor {
    // Crawl settings for the target site: retry times 10, sleep time 1000.
    private Site site = Site.me().setRetryTimes(10).setSleepTime(1000);
    // Number of records handled so far. Shared by every crawler thread, so it is only
    // modified through nextCount().
    private static int num = 0;
    // Persistence object used to write each scraped record to the database.
    private BlogDao blogDao = new BlogDaoImpl();

    /**
     * Starts the crawler on the data page with 5 threads and prints the elapsed time in seconds.
     *
     * @param args unused
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        System.out.println("========懂球帝小爬虫【启动】喽!=========");
        long startTime = System.currentTimeMillis();
        Spider.create(new FootProcessor()).addUrl("http://www.dongqiudi.com/data").thread(5).run();
        long endTime = System.currentTimeMillis();
        System.out.println("========懂球帝小爬虫【结束】喽!=========");
        System.out.println("一共爬用时为:" + (endTime - startTime) / 1000 + "s");
    }

    /**
     * Extracts the first match of each XPath from the page, echoes the values to stdout and
     * saves them as one record. Pages without a score cell are skipped instead of failing
     * with a NullPointerException.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        try {
            // Entity that carries the scraped values to the DAO.
            position blog = new position();

            // NOTE(review): id and rank are read from the same cell (td[1]); confirm whether
            // the id was meant to come from a different source.
            String id = page.getHtml().xpath("//tr[@class='top_rank']/td[1]/text()").get();
            System.out.println(id);

            String position = page.getHtml().xpath("//tr[@class='top_rank']/td[1]/text()").get();
            System.out.println(position);

            String name = page.getHtml().xpath("//tr[@class='top_rank']/td/a/text()").get();
            System.out.println(name);

            String grade = page.getHtml().xpath("//tr[@class='top_rank']/td[10]/text()").get();
            if (grade == null) {
                // No score cell matched, so there is nothing meaningful to store for this page.
                System.err.println("未找到积分数据,跳过该页面");
                return;
            }
            grade = grade.trim();
            System.out.println(grade);

            blog.setId(id);
            blog.setPosition(position);
            blog.setname(name);
            blog.setgrade(grade);

            int count = nextCount();
            System.out.println("num:" + count + " " + blog.toString());
            blogDao.saveBlog(blog);
        } catch (Exception e) {
            // Keep the crawl running when a single page cannot be processed.
            e.printStackTrace();
        }
    }

    /**
     * Returns the site configuration used for this crawl.
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Increments the shared record counter under the class lock and returns the new value,
     * so concurrent crawler threads never lose an update.
     */
    private static synchronized int nextCount() {
        num++;
        return num;
    }
}
3.数据持久化
/**
 * Persistence contract for scraped records.
 */
public interface BlogDao {

    /**
     * Stores one scraped record.
     *
     * @param blog the record to store
     * @return the integer result reported by the implementation
     */
    int saveBlog(position blog);
}
import java.util.ArrayList;
import java.util.List;
/**
 * {@link BlogDao} implementation that inserts a record through {@link DBHelper}.
 */
public class BlogDaoImpl implements BlogDao {

    // NOTE(review): the original statement had no table name at all, which makes every insert
    // fail with a SQL syntax error. The table name below is an assumption (it mirrors the
    // entity class name); change it to match the table actually created in the database.
    private static final String TABLE_NAME = "position";

    // Parameterized insert; the placeholders are bound in the column order listed here.
    private static final String INSERT_SQL =
            "INSERT INTO " + TABLE_NAME + " (id, position, name, grade) VALUES (?, ?, ?, ?)";

    /**
     * Inserts the given record as one row.
     *
     * @param blog the record whose id, position, name and grade are written
     * @return the value returned by {@code DBHelper.executeUpdate} for the insert
     */
    @Override
    public int saveBlog(position blog) {
        DBHelper dbhelper = new DBHelper();

        // Values for the placeholders, in the same order as the columns in INSERT_SQL.
        List<String> sqlValues = new ArrayList<String>();
        sqlValues.add(blog.getId());
        sqlValues.add(blog.getPosition());
        sqlValues.add(blog.getname());
        sqlValues.add(blog.getgrade());

        return dbhelper.executeUpdate(INSERT_SQL, sqlValues);
    }
}
import java.sql.*;
import java.util.List;
/**
 * Small JDBC helper built around one connection shared by every instance.
 *
 * <p>Failures are reported on stderr and signalled through return values
 * ({@code -1} for updates, {@code null} for queries) rather than thrown.
 */
public class DBHelper {
    public static final String driver_class = "com.mysql.jdbc.Driver";
    public static final String driver_url = "jdbc:mysql://localhost/football?useunicode=true&characterEncoding=utf8";
    public static final String user = "root";
    public static final String password = "root";

    // Connection shared by all helpers; read and replaced only under the class lock.
    private static Connection conn = null;
    // Statement and result set of the most recent successful executeQuery call on this helper.
    private PreparedStatement pst = null;
    private ResultSet rst = null;

    /**
     * Creates a helper and makes sure the shared connection has been opened if possible.
     */
    public DBHelper() {
        getConnInstance();
    }

    /**
     * Returns the shared connection, opening it when there is none or the cached one is closed.
     *
     * @return the shared connection, or {@code null} when it could not be opened
     */
    private static synchronized Connection getConnInstance() {
        if (!isUsable(conn)) {
            conn = null;
            try {
                Class.forName(driver_class);
                conn = DriverManager.getConnection(driver_url, user, password);
                // Report success only when a connection was really obtained.
                System.out.println("连接数据库成功");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        return conn;
    }

    /**
     * Tells whether the given connection exists and is not closed.
     */
    private static boolean isUsable(Connection candidate) {
        if (candidate == null) {
            return false;
        }
        try {
            return !candidate.isClosed();
        } catch (SQLException e) {
            return false;
        }
    }

    /**
     * Closes this helper's result set and statement, then the shared connection.
     *
     * <p>Resources are released in reverse order of creation, and the shared connection is
     * cleared so that the next use opens a fresh one instead of reusing a closed one.
     */
    public void close() {
        boolean ok = closeQuietly(rst);
        rst = null;
        ok &= closeQuietly(pst);
        pst = null;
        ok &= closeSharedConnection();
        if (ok) {
            System.out.println("关闭数据库成功");
        }
    }

    /**
     * Runs a parameterized query.
     *
     * <p>The returned result set and its statement stay open until {@link #close()} is called.
     *
     * @param sql       the SQL text with {@code ?} placeholders
     * @param sqlValues values bound to the placeholders in order; may be {@code null} or empty
     * @return the result set, or {@code null} when no connection is available or the query fails
     */
    public ResultSet executeQuery(String sql, List<String> sqlValues) {
        Connection connection = getConnInstance();
        if (connection == null) {
            System.err.println("数据库连接不可用,无法执行查询");
            return null;
        }
        PreparedStatement statement = null;
        try {
            statement = connection.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(statement, sqlValues);
            }
            ResultSet result = statement.executeQuery();
            // Track the open resources only once the query has succeeded.
            pst = statement;
            rst = result;
            return result;
        } catch (SQLException e) {
            e.printStackTrace();
            // The statement is useless after a failure; do not leak it.
            closeQuietly(statement);
            return null;
        }
    }

    /**
     * Runs a parameterized INSERT/UPDATE/DELETE and releases its statement afterwards.
     *
     * @param sql       the SQL text with {@code ?} placeholders
     * @param sqlValues values bound to the placeholders in order; may be {@code null} or empty
     * @return the affected row count, or {@code -1} when no connection is available or the
     *         statement fails
     */
    public int executeUpdate(String sql, List<String> sqlValues) {
        int result = -1;
        Connection connection = getConnInstance();
        if (connection == null) {
            System.err.println("数据库连接不可用,无法执行更新");
            return result;
        }
        PreparedStatement statement = null;
        try {
            statement = connection.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(statement, sqlValues);
            }
            result = statement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(statement);
        }
        return result;
    }

    /**
     * Binds each value to the placeholder at the same position (1-based).
     *
     * @param pst       the statement to bind on
     * @param sqlValues the values to bind
     * @throws SQLException when a value cannot be bound, so the caller does not go on to
     *                      execute a half-bound statement
     */
    private void setSqlValues(PreparedStatement pst, List<String> sqlValues) throws SQLException {
        for (int i = 0; i < sqlValues.size(); i++) {
            pst.setObject(i + 1, sqlValues.get(i));
        }
    }

    /**
     * Closes the shared connection under the class lock and forgets it.
     *
     * @return {@code false} when closing raised an error, {@code true} otherwise
     */
    private static synchronized boolean closeSharedConnection() {
        if (conn == null) {
            return true;
        }
        try {
            conn.close();
            return true;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            conn = null;
        }
    }

    /**
     * Closes a result set if present, reporting but not propagating any error.
     *
     * @return {@code false} when closing raised an error, {@code true} otherwise
     */
    private static boolean closeQuietly(ResultSet resultSet) {
        if (resultSet == null) {
            return true;
        }
        try {
            resultSet.close();
            return true;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Closes a statement if present, reporting but not propagating any error.
     *
     * @return {@code false} when closing raised an error, {@code true} otherwise
     */
    private static boolean closeQuietly(Statement statement) {
        if (statement == null) {
            return true;
        }
        try {
            statement.close();
            return true;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }
}
4.实体类
/**
 * Plain data holder for one scraped record: id, position, name and grade, all kept as strings.
 */
public class position {
    private String id;
    private String position;
    private String name;
    private String grade;

    /** Returns the record id. */
    public String getId() {
        return id;
    }

    /** Sets the record id. */
    public void setId(String id) {
        this.id = id;
    }

    /** Returns the position value. */
    public String getPosition() {
        return position;
    }

    /** Sets the position value. */
    public void setPosition(String position) {
        this.position = position;
    }

    /** Returns the name. */
    public String getname() {
        return name;
    }

    /** Sets the name. */
    public void setname(String name) {
        this.name = name;
    }

    /** Returns the grade. */
    public String getgrade() {
        return grade;
    }

    /** Sets the grade. */
    public void setgrade(String grade) {
        this.grade = grade;
    }

    /**
     * Renders all four fields so that logging this object shows its contents instead of the
     * default identity hash.
     */
    @Override
    public String toString() {
        return "position{id='" + id + "', position='" + position
                + "', name='" + name + "', grade='" + grade + "'}";
    }
}
5.建立相关的表
6.运行结果
数据库报错,需要进行相关修改
待更新