0
点赞
收藏
分享

微信扫一扫

Java爬虫(四)框架的简单使用

Java旺 2022-07-27 阅读 99


1.什么是WebMagic

WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。

​​http://webmagic.io/​​

特性:

  • 简单的API,可快速上手
  • 模块化的结构,可轻松扩展
  • 提供多线程和分布式支持

 

2.使用案例:

1.添加maven依赖

 

<dependencies>
    <!-- WebMagic core module; the crawler code imports us.codecraft.webmagic.* classes. -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.6.1</version>
    </dependency>
    <!-- WebMagic extension module, kept at the same version as the core module. -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.6.1</version>
    </dependency>
    <!-- MySQL JDBC driver; DBHelper loads com.mysql.jdbc.Driver by name. -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.42</version>
    </dependency>
    <!-- Oracle JDBC driver. NOTE(review): nothing in the code shown in this article
         references it; confirm it is actually needed before keeping it. -->
    <dependency>
        <groupId>com.oracle</groupId>
        <artifactId>ojdbc6</artifactId>
        <version>1.0.0</version>
    </dependency>
</dependencies>

2.核心类的写法


import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * WebMagic {@link PageProcessor} that reads the {@code top_rank} table row of the
 * crawled page, copies four text fields into a {@code position} object and hands
 * that object to a {@link BlogDao} for storage.
 *
 * <p>{@link #main(String[])} runs the spider with five threads, so the shared
 * record counter is kept in an atomic integer.
 */
public class FootProcessor implements PageProcessor {

    /** XPath prefix selecting the table row(s) whose class is {@code top_rank}. */
    private static final String TOP_RANK_ROW = "//tr[@class='top_rank']";

    // Site-level crawl settings: retry count and the sleep time between fetches.
    private Site site = Site.me().setRetryTimes(10).setSleepTime(1000);

    // Number of records processed so far. Shared by every crawler thread, hence atomic
    // (a plain "num++" on a static int can lose updates when several threads run).
    private static final java.util.concurrent.atomic.AtomicInteger num =
            new java.util.concurrent.atomic.AtomicInteger();

    // Persistence object used to store each extracted record in the database.
    private BlogDao blogDao = new BlogDaoImpl();

    /**
     * Starts the crawler on the data page, blocks until it finishes and prints the
     * elapsed time in whole seconds.
     *
     * @param args unused
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        System.out.println("========懂球帝小爬虫【启动】喽!=========");
        long startTime = System.currentTimeMillis();
        Spider.create(new FootProcessor()).addUrl("http://www.dongqiudi.com/data").thread(5).run();
        long endTime = System.currentTimeMillis();
        System.out.println("========懂球帝小爬虫【结束】喽!=========");
        System.out.println("一共爬用时为:" + (endTime - startTime) / 1000 + "s");
    }

    /**
     * Extracts id, rank, team name and score from the first {@code top_rank} row of
     * the page, prints them and saves them through the DAO. A page on which the score
     * cell cannot be found is skipped instead of failing with a NullPointerException.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        try {
            // Id.
            // NOTE(review): id and rank are read from the same cell (td[1]); confirm
            // whether the id was meant to come from a different column.
            String id = page.getHtml().xpath(TOP_RANK_ROW + "/td[1]/text()").get();
            System.out.println(id);
            // Rank.
            String position = page.getHtml().xpath(TOP_RANK_ROW + "/td[1]/text()").get();
            System.out.println(position);
            // Team name.
            String name = page.getHtml().xpath(TOP_RANK_ROW + "/td/a/text()").get();
            System.out.println(name);
            // Team score; get() yields null when the XPath matches nothing, so check
            // before trimming.
            String grade = page.getHtml().xpath(TOP_RANK_ROW + "/td[10]/text()").get();
            if (grade == null) {
                System.err.println("未找到 top_rank 行的积分,跳过该页面");
                return;
            }
            grade = grade.trim();
            System.out.println(grade);

            // Collect the fields in one object so they can be persisted together.
            position blog = new position();
            blog.setId(id);
            blog.setPosition(position);
            blog.setname(name);
            blog.setgrade(grade);

            int count = num.incrementAndGet();

            System.out.println("num:" + count + " " + blog.toString());
            // Store the record in the database.
            blogDao.saveBlog(blog);
        } catch (Exception e) {
            // Keep the crawler running: a failure on one page is reported, not rethrown.
            e.printStackTrace();
        }
    }

    /**
     * @return the crawl configuration used for this processor
     */
    @Override
    public Site getSite() {
        return this.site;
    }

}


3.数据持久化


/**
 * Persistence contract for records produced by the crawler.
 */
public interface BlogDao {

    /**
     * Saves one record.
     *
     * @param blog the record to save
     * @return an integer result reported by the implementation
     */
    int saveBlog(position blog);
}





import java.util.ArrayList;
import java.util.List;

/**
 * {@link BlogDao} implementation that inserts one row per record through
 * {@link DBHelper}, using a parameterized statement.
 */
public class BlogDaoImpl implements BlogDao {

    // The original statement read "INSERT INTO (id,...)" with no table name, which is
    // not valid SQL and cannot execute.
    // NOTE(review): the real table name is not shown anywhere in this file; "position"
    // is an assumption based on the entity class name - confirm against the schema.
    private static final String TABLE_NAME = "position";

    /** Insert statement with one placeholder per column. */
    private static final String INSERT_SQL =
            "INSERT INTO " + TABLE_NAME + " (id,position,name,grade) VALUES (?, ?, ?, ?)";

    /**
     * Inserts the given record.
     *
     * @param blog the record whose id, position, name and grade are written
     * @return whatever {@code DBHelper.executeUpdate} returns for the insert
     */
    @Override
    public int saveBlog(position blog) {
        // Values are bound in the same order as the columns in INSERT_SQL.
        List<String> sqlValues = new ArrayList<String>(4);
        sqlValues.add(blog.getId());
        sqlValues.add(blog.getPosition());
        sqlValues.add(blog.getname());
        sqlValues.add(blog.getgrade());

        DBHelper dbhelper = new DBHelper();
        return dbhelper.executeUpdate(INSERT_SQL, sqlValues);
    }
}


 

 


import java.sql.*;
import java.util.List;

/**
 * Small JDBC helper around a single, lazily created {@link Connection} that is
 * stored in a static field and therefore shared by every {@code DBHelper} instance.
 *
 * <p>When no connection could be established, {@link #executeQuery} returns
 * {@code null} and {@link #executeUpdate} returns {@code -1}; neither throws.
 */
public class DBHelper {

    // NOTE(review): connection settings and credentials are hard-coded; consider
    // loading them from configuration instead of source code.
    public static final String driver_class = "com.mysql.jdbc.Driver";
    public static final String driver_url = "jdbc:mysql://localhost/football?useunicode=true&characterEncoding=utf8";
    public static final String user = "root";
    public static final String password = "root";

    // Shared connection; created on first use, reset to null by close().
    private static Connection conn = null;
    // Statement and result set of the most recent successful executeQuery call.
    private PreparedStatement pst = null;
    private ResultSet rst = null;

    /**
     * Creates a helper and makes sure the shared connection has been attempted.
     * Never throws; a connection failure is reported on standard error.
     */
    public DBHelper() {
        try {
            getConnInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared connection, opening it on first use. The method is
     * synchronized so that concurrent callers do not open several connections.
     *
     * @return the shared connection, or {@code null} when it could not be opened
     */
    private static synchronized Connection getConnInstance() {
        if (conn == null) {
            try {
                Class.forName(driver_class);
                conn = DriverManager.getConnection(driver_url, user, password);
                // Report success only when the connection was really obtained.
                System.out.println("连接数据库成功");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        return conn;
    }

    /**
     * Closes the result set and statement kept by this helper and then the shared
     * connection, in that order. The shared connection field is reset so that a
     * helper created afterwards opens a fresh connection instead of reusing a
     * closed one.
     */
    public void close() {
        boolean clean = true;
        if (rst != null) {
            try {
                rst.close();
            } catch (SQLException e) {
                clean = false;
                e.printStackTrace();
            }
            rst = null;
        }
        if (!closeStatement(pst)) {
            clean = false;
        }
        pst = null;
        if (!closeConnection()) {
            clean = false;
        }
        if (clean) {
            System.out.println("关闭数据库成功");
        }
    }

    /**
     * Runs a parameterized query.
     *
     * @param sql       SQL text with {@code ?} placeholders
     * @param sqlValues values bound to the placeholders in order; may be null or empty
     * @return the result set, or {@code null} when there is no connection or the
     *         query failed; the statement stays open until {@link #close()} so the
     *         result set remains readable
     */
    public ResultSet executeQuery(String sql, List<String> sqlValues) {
        Connection current = conn;
        if (current == null) {
            System.err.println("数据库连接不可用,查询未执行");
            return null;
        }
        PreparedStatement statement = null;
        try {
            statement = current.prepareStatement(sql);
            setSqlValues(statement, sqlValues);
            ResultSet result = statement.executeQuery();
            // Remember both so close() can release them later.
            pst = statement;
            rst = result;
            return result;
        } catch (SQLException e) {
            e.printStackTrace();
            // The failed statement has no usable result set; release it right away.
            closeStatement(statement);
            return null;
        }
    }

    /**
     * Runs a parameterized INSERT/UPDATE/DELETE and closes the statement afterwards.
     *
     * @param sql       SQL text with {@code ?} placeholders
     * @param sqlValues values bound to the placeholders in order; may be null or empty
     * @return the update count, or {@code -1} when there is no connection or the
     *         statement failed
     */
    public int executeUpdate(String sql, List<String> sqlValues) {
        Connection current = conn;
        if (current == null) {
            System.err.println("数据库连接不可用,更新未执行");
            return -1;
        }
        PreparedStatement statement = null;
        try {
            statement = current.prepareStatement(sql);
            setSqlValues(statement, sqlValues);
            return statement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
            return -1;
        } finally {
            // An update has no result to read later, so the statement is not kept.
            closeStatement(statement);
        }
    }

    /**
     * Binds the values to the statement's placeholders, first value to index 1.
     *
     * @param pst       statement to bind to
     * @param sqlValues values to bind; null or empty binds nothing
     * @throws SQLException when a value cannot be bound, so the caller does not go
     *                      on to execute a half-bound statement
     */
    private void setSqlValues(PreparedStatement pst, List<String> sqlValues) throws SQLException {
        if (sqlValues == null) {
            return;
        }
        for (int i = 0; i < sqlValues.size(); i++) {
            pst.setObject(i + 1, sqlValues.get(i));
        }
    }

    /**
     * Closes a statement, reporting but not propagating a failure.
     *
     * @return {@code false} only when closing raised an exception
     */
    private static boolean closeStatement(Statement statement) {
        if (statement == null) {
            return true;
        }
        try {
            statement.close();
            return true;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Closes the shared connection and always clears the static field.
     *
     * @return {@code false} only when closing raised an exception
     */
    private static synchronized boolean closeConnection() {
        if (conn == null) {
            return true;
        }
        try {
            conn.close();
            return true;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            conn = null;
        }
    }

}


4.实体类


/**
 * Plain data holder for one scraped table row: id, rank ("position"), team name
 * and score ("grade"), all kept as strings.
 *
 * <p>The lower-case class name and accessor names do not follow Java naming
 * conventions but are kept because other classes in this article call them as-is.
 */
public class position {
    private String id;
    private String position;
    private String name;
    private String grade;

    /** @return the id */
    public String getId() {
        return id;
    }

    /** @param id the id to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the rank */
    public String getPosition() {
        return position;
    }

    /** @param position the rank to store */
    public void setPosition(String position) {
        this.position = position;
    }

    /** @return the team name */
    public String getname() {
        return name;
    }

    /** @param name the team name to store */
    public void setname(String name) {
        this.name = name;
    }

    /** @return the score */
    public String getgrade() {
        return grade;
    }

    /** @param grade the score to store */
    public void setgrade(String grade) {
        this.grade = grade;
    }

    /**
     * Lists all four fields so that logging this object shows its contents rather
     * than the default class-name-and-hash text.
     *
     * @return a readable representation of this record
     */
    @Override
    public String toString() {
        return "position{id='" + id + "', position='" + position
                + "', name='" + name + "', grade='" + grade + "'}";
    }
}

4.建立相关的表

 

Java爬虫(四)框架的简单使用_sql

 

5.运行结果

Java爬虫(四)框架的简单使用_数据库_02

数据库报错,需要进行相关修改

待更新

 

举报

相关推荐

0 条评论