0
点赞
收藏
分享

微信扫一扫

WebMagic框架爬取区划代码

朱小落 2021-09-29 阅读 49
技术博客

背景

2017年统计用区划代码和城乡划分代码(截止2017年10月31日)

WebMagic

PageProcessor

PageProcessor 主要用来定义爬虫的配置、页面元素的抽取和链接的发现。
官网给的例子如下:

/**
 * Sample {@link PageProcessor} that extracts the author, repository name and readme from
 * GitHub pages and queues every repository link it finds for crawling.
 */
public class GithubRepoPageProcessor implements PageProcessor {

    /** URL pattern whose capture group is stored as the "author" field. */
    private static final String AUTHOR_URL_REGEX = "https://github\\.com/(\\w+)/.*";

    /** XPath of the repository title, stored as the "name" field. */
    private static final String NAME_XPATH = "//h1[@class='entry-title public']/strong/a/text()";

    /** XPath of the readme container, stored as the "readme" field. */
    private static final String README_XPATH = "//div[@id='readme']/tidyText()";

    /** Pattern selecting the links that are queued as follow-up requests. */
    private static final String REPO_LINK_REGEX = "(https://github\\.com/[\\w\\-]+/[\\w\\-]+)";

    /** Entry URL handed to the spider in {@link #main(String[])}. */
    private static final String START_URL = "https://github.com/code4craft";

    // Part 1: site-level crawl settings (here: 3 retries, 1000 ms sleep time).
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Core extraction hook: stores the page fields and registers follow-up URLs.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        // Part 2: extract the fields of interest and store them on the page.
        String author = page.getUrl().regex(AUTHOR_URL_REGEX).toString();
        page.putField("author", author);

        String repoName = page.getHtml().xpath(NAME_XPATH).toString();
        page.putField("name", repoName);

        boolean nameMissing = page.getResultItems().get("name") == null;
        if (nameMissing) {
            // No repository name on this page: mark it as skipped.
            page.setSkip(true);
        }

        page.putField("readme", page.getHtml().xpath(README_XPATH));

        // Part 3: discover follow-up URLs on the page and queue them.
        page.addTargetRequests(page.getHtml().links().regex(REPO_LINK_REGEX).all());
    }

    /**
     * @return the site configuration used for this crawl
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl at {@code https://github.com/code4craft} with 5 threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new GithubRepoPageProcessor());
        spider.addUrl(START_URL)
                .thread(5)
                .run();
    }
}

Selectable

Pipeline

实现

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static com.gsoft.area.spider.SpiderCommon.SPIDER_URL;
import static java.util.regex.Pattern.compile;

/**
 * WebMagic {@link PageProcessor} that crawls administrative division pages below
 * {@code SPIDER_URL} and stores one row map per division under the result field {@code "area"}.
 *
 * <p>The depth of the page is derived from the number of path segments that remain after
 * stripping {@code SPIDER_URL} and the {@code .html} suffix from the page URL:
 * index page -> provinces (level 1), 1 segment -> cities (level 2), 2 segments -> counties
 * (level 3), 3 segments -> towns (level 4), 4 segments -> villages (level 5).
 *
 * @author plsy
 */
public class AreaSpider implements PageProcessor {

    /** Matches strings consisting only of digits (or empty); compiled once instead of per cell. */
    private static final java.util.regex.Pattern DIGITS_ONLY = compile("[0-9]*");

    /**
     * Site-level crawl settings (here: 3 retries, 1000 ms sleep time).
     */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Year tag written to every row as {@code C_YEAR}; distinguishes crawl batches.
     */
    private final String year;

    /**
     * Keywords naming the levels to crawl below province level.
     * Example: {@code "city,county"} crawls from province down to county level.
     * Recognised keywords: {@code city}, {@code county}, {@code town}, {@code village}.
     */
    private final String contain;

    /**
     * @param year    year tag stored with every row
     * @param contain level keywords, see {@link #contain}
     */
    public AreaSpider(String year, String contain) {
        this.year = year;
        this.contain = contain;
    }

    /**
     * Extracts the divisions listed on {@code page}, stores them under the {@code "area"} field
     * and queues the next-level pages when the corresponding keyword is present in
     * {@link #contain}. Pages whose depth is not selected put no field at all.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.setCharset("UTF-8");
        String url = page.getUrl().toString();
        String[] pathParts = url.replace(SPIDER_URL, "").replace(".html", "").split("/");

        // SPIDER_URL is used as a literal everywhere else, so compare literally rather than
        // interpreting it as a regular expression.
        if (url.equals(SPIDER_URL + "index.html")) {
            page.putField("area", extractProvinces(page));
        }
        if (pathParts.length == 1 && !"index".equals(pathParts[0]) && contain.contains("city")) {
            page.putField("area", extractCities(page));
        }
        if (pathParts.length == 2 && contain.contains("county")) {
            page.putField("area", extractCounties(page));
        }
        if (pathParts.length == 3 && contain.contains("town")) {
            page.putField("area", extractTowns(page));
        }
        if (pathParts.length == 4 && contain.contains("village")) {
            page.putField("area", extractVillages(page));
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Reads province rows (level 1) from the index page; cells without a link are ignored. */
    private List<Map<String, Object>> extractProvinces(Page page) {
        List<Map<String, Object>> provinces = new ArrayList<>();
        for (Selectable cell : page.getHtml().xpath("//tr[@class='provincetr']/td").nodes()) {
            List<String> links = cell.links().all();
            if (links.isEmpty()) {
                continue;
            }
            String name = cell.xpath("//a/text()").toString();
            String link = links.get(0);
            if (contain.contains("city")) {
                page.addTargetRequest(link);
            }
            // The link's file name is the province prefix; pad it to a 6-digit style code.
            String prefix = link.replace(SPIDER_URL, "").replace(".html", "");
            provinces.add(buildRow(name, prefix + "0000", 1, "/", 0L));
        }
        return provinces;
    }

    /** Reads city rows (level 2); the code is derived from the link, the name from the anchor text. */
    private List<Map<String, Object>> extractCities(Page page) {
        List<Map<String, Object>> cities = new ArrayList<>();
        for (Selectable cell : page.getHtml().xpath("//tr[@class='citytr']/td").nodes()) {
            String name = cell.xpath("//a/text()").toString();
            // Skip cells without anchor text and the cells that hold the numeric code.
            if (name == null || DIGITS_ONLY.matcher(name).matches()) {
                continue;
            }
            List<String> links = cell.links().all();
            if (links.isEmpty()) {
                continue;
            }
            String link = links.get(0);
            if (contain.contains("county")) {
                page.addTargetRequest(link);
            }
            String[] segments = link.replace(SPIDER_URL, "").replace(".html", "").split("/");
            String parentId = segments[0] + "0000";
            String areaCode = segments[segments.length - 1] + "00";
            cities.add(buildRow(name, areaCode, 2, "/" + parentId + "/" + areaCode, Long.valueOf(parentId)));
        }
        return cities;
    }

    /** Reads county rows (level 3) from consecutive (code, name) cell pairs. */
    private List<Map<String, Object>> extractCounties(Page page) {
        List<Map<String, Object>> counties = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='countytr']/td").nodes();
        // "i + 1 < size" guards against a trailing unpaired cell.
        for (int i = 0; i + 1 < cells.size(); i += 2) {
            String[] codeAndName = readCodeAndName(page, cells.get(i), cells.get(i + 1), "town");
            String areaCode = codeAndName[0].substring(0, 6);
            String parentId = areaCode.substring(0, 4) + "00";
            String cascade = "/" + areaCode.substring(0, 2) + "0000/" + parentId + "/" + areaCode;
            counties.add(buildRow(codeAndName[1], areaCode, 3, cascade, Long.valueOf(parentId)));
        }
        return counties;
    }

    /** Reads town rows (level 4) from consecutive (code, name) cell pairs. */
    private List<Map<String, Object>> extractTowns(Page page) {
        List<Map<String, Object>> towns = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='towntr']/td").nodes();
        for (int i = 0; i + 1 < cells.size(); i += 2) {
            String[] codeAndName = readCodeAndName(page, cells.get(i), cells.get(i + 1), "village");
            String areaCode = codeAndName[0].substring(0, 9);
            String parentId = areaCode.substring(0, 6);
            String cascade = "/" + areaCode.substring(0, 2) + "0000/" + areaCode.substring(0, 4) + "00/"
                    + parentId + "/" + areaCode;
            towns.add(buildRow(codeAndName[1], areaCode, 4, cascade, Long.valueOf(parentId)));
        }
        return towns;
    }

    /** Reads village rows (level 5) from cell triples; the first cell is the code, the third the name. */
    private List<Map<String, Object>> extractVillages(Page page) {
        List<Map<String, Object>> villages = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='villagetr']/td").nodes();
        for (int i = 0; i + 2 < cells.size(); i += 3) {
            String areaCode = cells.get(i).xpath("//*/text()").get();
            String areaName = cells.get(i + 2).xpath("//*/text()").get();
            String parentId = areaCode.substring(0, 9);
            String cascade = "/" + areaCode.substring(0, 2) + "0000/" + areaCode.substring(0, 4) + "00/"
                    + areaCode.substring(0, 6) + "/" + parentId + "/" + areaCode;
            // Villages sit one level below towns (level 4), hence level 5.
            villages.add(buildRow(areaName, areaCode, 5, cascade, Long.valueOf(parentId)));
        }
        return villages;
    }

    /**
     * Reads the raw code and name from a (code, name) cell pair and, when the code cell carries
     * a link and {@code nextLevel} is selected in {@link #contain}, queues that link.
     *
     * @return a two-element array: {@code [rawCode, name]}
     */
    private String[] readCodeAndName(Page page, Selectable codeCell, Selectable nameCell, String nextLevel) {
        List<String> codeTexts = codeCell.xpath("//*/text()").all();
        List<String> nameTexts = nameCell.xpath("//*/text()").all();
        String rawCode = codeTexts.get(0);
        String name = nameTexts.get(0);
        // More than one text node is treated as "the cell wraps its value in a link":
        // the value is then the second text node and the link leads to the next level.
        if (codeTexts.size() > 1) {
            rawCode = codeTexts.get(1);
            name = nameTexts.get(1);
            if (contain.contains(nextLevel)) {
                page.addTargetRequest(codeCell.links().all().get(0));
            }
        }
        return new String[] {rawCode, name};
    }

    /** Builds one row map with the column keys written for every division. */
    private Map<String, Object> buildRow(String name, String code, int level, String cascade, Long parentCode) {
        Map<String, Object> row = new HashMap<>();
        row.put("C_NAME", name);
        row.put("C_CODE", code);
        row.put("C_LEVEL", level);
        row.put("C_CASCADE", cascade);
        row.put("C_PARENT_CODE", parentCode);
        row.put("C_YEAR", year);
        return row;
    }
}

通过这样的特征找到每层行政区划,使用 page.putField 保存起来,再使用自定义的 SqlPipeline 将数据保存到 MySQL 中。

import com.gsoft.cos3.table.service.SingleTableService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.List;
import java.util.Map;

/**
 * WebMagic {@link Pipeline} that persists the rows stored under the {@code "area"} result
 * field, one {@code singleTableService.save} call per row, into the table
 * {@code cos_sys_administrative_area}.
 *
 * @author plsy
 */
@Component
public class SqlPipeline implements Pipeline {

    @Autowired
    SingleTableService singleTableService;

    /**
     * Saves every row of the {@code "area"} field; prints a notice when the page produced no rows.
     *
     * @param resultItems extraction results of one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<Map<String, Object>> area = resultItems.get("area");
        // The field is absent (null) when nothing was stored under "area" for this page;
        // treat that the same as an empty list instead of failing with a NullPointerException.
        if (area == null || area.isEmpty()) {
            System.out.println(resultItems.getRequest().getUrl() + " 此页面未爬取数据,请稍后重试!");
            return;
        }
        for (Map<String, Object> row : area) {
            singleTableService.save("cos_sys_administrative_area", row);
        }
    }
}
举报

相关推荐

0 条评论