0
点赞
收藏
分享

微信扫一扫

使用Jsoup爬取数据,并存到Excel 源码附上

陆公子521 2022-04-14 阅读 38
java爬虫

看了Jsoup教程后,试着爬取一些数据玩玩,就拿B站的排行榜试试水。

一、导入依赖

<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.11.3</version>
</dependency>

二、敲码

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
  
  /**
   * Fetches the Bilibili popular-ranking page and extracts one row per ranked entry.
   *
   * <p>Every row is an insertion-ordered map from column label to the text scraped
   * for that column. Fields that are missing from an entry are stored as empty
   * strings instead of aborting the whole scrape.
   *
   * @return the scraped rows in page order; empty when the page contains no
   *     element with class {@code rank-item}
   * @throws IOException if the page cannot be fetched or parsed
   */
  public List<Map<String, Object>> main() throws IOException {
      String url = "https://www.bilibili.com/v/popular/rank/all";

      // Fetch and parse the page, with a 30 second timeout.
      Document document = Jsoup.parse(new URL(url), 30000);
      Elements elements = document.getElementsByClass("rank-item");
      List<Map<String, Object>> dataPojo = new ArrayList<>();

      for (Element el : elements) {
          String num = el.getElementsByClass("num").text();
          String src = el.getElementsByTag("a").eq(0).attr("href");
          String info = el.getElementsByClass("title").eq(0).text();
          String detail = el.getElementsByClass("detail").eq(0).text();
          String upName = el.getElementsByClass("up-name").eq(0).text();
          String pts = el.getElementsByClass("pts").eq(0).text();

          // The detail text holds several space-separated values; only the first
          // two (play count, danmaku count) are used. Guard the indexes so an
          // entry with fewer values does not throw.
          String[] split = detail.split(" ");
          String playCount = split.length > 0 ? split[0] : "";
          String danmakuCount = split.length > 1 ? split[1] : "";

          // Drop the trailing four characters of the score text (presumably a
          // fixed label suffix -- verify against the live page). Shorter text is
          // kept as-is rather than raising StringIndexOutOfBoundsException.
          if (pts.length() >= 4) {
              pts = pts.substring(0, pts.length() - 4);
          }

          // Only protocol-relative links ("//host/path") need a scheme added.
          String link = src.startsWith("//") ? "https:" + src : src;

          Map<String, Object> map = new LinkedHashMap<>();
          map.put("序号", num);
          map.put("路径", link);
          map.put("信息", info);
          map.put("播放量", playCount);
          map.put("弹幕数量", danmakuCount);
          map.put("作者", upName);
          map.put("曝光亮", pts);

          dataPojo.add(map);
      }
      return dataPojo;
  }

三、将数据存储至excel

  <dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-ooxml</artifactId>
	<version>4.1.1</version>
</dependency>
<dependency>
	<groupId>cn.hutool</groupId>
	<artifactId>hutool-all</artifactId>
	<version>5.0.7</version>
</dependency>
<!-- Keep every Apache POI artifact on the same version; mixing POI jar versions is unsupported. -->
<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-ooxml-schemas</artifactId>
	<version>4.1.1</version>
</dependency>

import cn.hutool.poi.excel.ExcelUtil;
import cn.hutool.poi.excel.ExcelWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * Scrapes the ranking data and writes it to an Excel file.
 *
 * @param args optional; when present, {@code args[0]} is used as the output file
 *     path instead of the built-in default
 * @throws IOException if fetching the ranking page fails
 */
public static void main(String[] args) throws IOException {
    // Fetch the data.
    HtmlJsoupData jsoupData = new HtmlJsoupData();
    List<Map<String, Object>> list = jsoupData.main();

    // The default path is kept for backward compatibility; pass a path as the
    // first argument to write somewhere else.
    String outputPath = (args != null && args.length > 0)
            ? args[0]
            : "C:/Users/2902404395/Desktop/data.xls";

    // try-with-resources closes the writer even if a call below throws.
    try (ExcelWriter writer = ExcelUtil.getWriter(outputPath)) {
        // Skip the current row, i.e. the first row. Optional; shown for demonstration.
        writer.passCurrentRow();

        // Set the column widths.
        writer.setColumnWidth(0, 20);
        writer.setColumnWidth(1, 40);
        writer.setColumnWidth(2, 90);
        writer.setColumnWidth(3, 30);
        writer.setColumnWidth(4, 30);
        writer.setColumnWidth(5, 30);
        writer.setColumnWidth(6, 30);
        writer.setColumnWidth(7, 20);
        writer.setDefaultRowHeight(17);

        writer.write(list, true);
    }

    System.out.println("成功");
}

最后-效果图

在这里插入图片描述

源码

举报

相关推荐

0 条评论