数据采集与清洗-CFANZ编程社区

网络爬虫，是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本；简单来说就是通过编写脚本模拟浏览器发起请求获取数据。本章内容将结合实践工作需求来编写一个简单爬虫项目小案例。

环境准备
本项目使用语言为Java，JDK版本为1.8，使用Maven管理项目依赖。
项目中用到的依赖包

<dependencies>
        <!-- OkHttp HTTP client; presumably what MyHttpClient (not shown in this article) is built on. TODO confirm. -->
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>4.9.0</version>
        </dependency>
        <!-- jsoup: HTML parsing, used by the country and weather document parsers. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- fastjson: JSON parsing of the city listing and JSON serialization of the final result. -->
        <!-- NOTE(review): fastjson 1.2.x has had deserialization security advisories; check whether a newer release is appropriate. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.73</version>
        </dependency>
    </dependencies>

项目需求介绍
本次抓取的目标源网站为中国天气网(http://www.weather.com.cn/)，具体抓取数据为网站下的国际城市天气预报(http://www.weather.com.cn/forecast/world.shtml)，即203个国家和地区下属所有城市近7天的天气预报数据内容；获取到这些数据后可用来进行数据分析挖掘，分析各地区的天气数据情况。

项目需求分析
1、我们可以先进入到目标源网站查看网页数据结构(http://www.weather.com.cn/forecast/world.shtml)，可知第一步需要获取到网站上6大洲的所有国家和地区：我们使用chrome浏览器按F12调出开发者工具分析网页可发现，每个国家在网站中都有一个唯一的data-id，也就是国家id，所以我们需要先拿到这203个国家的id，然后再去遍历这203个国家下面的所有城市；
在这里插入图片描述
2、在抓取完这203个国家的id之后，我们再来抓取这203个国家下面的所有城市id，通过chrome浏览器上的开发者工具抓包可以抓到所有城市id的数据接口

继续观察这个接口url中的参数可以发现包含我们前面抓取的国家id，这样我们只要把这203个国家的id遍历一遍，就能抓取到所有城市的id：
在这里插入图片描述

3、在完成所有城市id的抓取后，我们就可以开始抓取所有城市天气预报的详情数据了，点击进入一个城市天气预报的详情页面进行探查分析；详情页url为http://www.weather.com.cn/weather/401620100.shtml；只要遍历城市id，然后将城市id替换url中的id就可以实现抓取所有城市的天气预报数据了；
在这里插入图片描述
开始编写代码
实体类定义
我们将实体类准备好，也就是对应的国家信息，城市信息，天气信息的类先定义出来。
国家信息

package net.csdn.document.module;
import java.util.List;
import java.util.Objects;

/**
 * A country (or region) together with the cities that belong to it.
 *
 * <p>Equality and hashing are based on {@code id}, {@code name} and {@code en};
 * the city list takes no part in either.</p>
 */
public class Country {
    /** Country identifier. */
    private String id;
    /** Country name in Chinese. */
    private String name;
    /** Country name in English. */
    private String en;
    /** Cities that belong to this country. */
    private List<City> cityList;

    // ---- accessors ----

    public String getId() {
        return this.id;
    }

    public String getName() {
        return this.name;
    }

    public String getEn() {
        return this.en;
    }

    public List<City> getCityList() {
        return this.cityList;
    }

    // ---- mutators ----

    public void setId(String id) {
        this.id = id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setEn(String en) {
        this.en = en;
    }

    public void setCityList(List<City> cityList) {
        this.cityList = cityList;
    }

    // ---- value semantics ----

    @Override
    public int hashCode() {
        return Objects.hash(id, name, en);
    }

    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        if (o.getClass() != getClass()) {
            return false;
        }
        Country other = (Country) o;
        if (!Objects.equals(id, other.id)) {
            return false;
        }
        if (!Objects.equals(name, other.name)) {
            return false;
        }
        return Objects.equals(en, other.en);
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Country{");
        text.append("id='").append(id).append('\'');
        text.append(", name='").append(name).append('\'');
        text.append(", en='").append(en).append('\'');
        text.append(", cityList=").append(cityList);
        text.append('}');
        return text.toString();
    }
}

城市信息

package net.csdn.document.module;
import java.util.List;
import java.util.Objects;

/**
 * A city together with its forecast entries.
 *
 * <p>Equality and hashing are based on {@code id}, {@code name} and {@code en};
 * the weather list takes no part in either.</p>
 */
public class City {
    /** City identifier. */
    private String id;
    /** City name in Chinese. */
    private String name;
    /** City name in English. */
    private String en;
    /** Forecast for the coming seven days. */
    private List<Weather> weatherList;

    // ---- accessors ----

    public String getId() {
        return this.id;
    }

    public String getName() {
        return this.name;
    }

    public String getEn() {
        return this.en;
    }

    public List<Weather> getWeatherList() {
        return this.weatherList;
    }

    // ---- mutators ----

    public void setId(String id) {
        this.id = id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setEn(String en) {
        this.en = en;
    }

    public void setWeatherList(List<Weather> weatherList) {
        this.weatherList = weatherList;
    }

    // ---- value semantics ----

    @Override
    public int hashCode() {
        return Objects.hash(id, name, en);
    }

    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        if (o.getClass() != getClass()) {
            return false;
        }
        City other = (City) o;
        if (!Objects.equals(id, other.id)) {
            return false;
        }
        if (!Objects.equals(name, other.name)) {
            return false;
        }
        return Objects.equals(en, other.en);
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("City{");
        text.append("id='").append(id).append('\'');
        text.append(", name='").append(name).append('\'');
        text.append(", en='").append(en).append('\'');
        text.append(", weatherList=").append(weatherList);
        text.append('}');
        return text.toString();
    }
}

天气信息

package net.csdn.document.module;
/**
 * One forecast entry: the date label plus the weather, temperature and wind text.
 *
 * <p>All four values are stored as plain strings exactly as they were set.</p>
 */
public class Weather {
    /**
     * Date label of the forecast entry.
     */
    private String date;
    /**
     * Weather description.
     */
    private String weather;
    /**
     * Temperature text.
     */
    private String temperature;
    /**
     * Wind level text.
     */
    private String windy;

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getWeather() {
        return weather;
    }

    public void setWeather(String weather) {
        this.weather = weather;
    }

    public String getTemperature() {
        return temperature;
    }

    public void setTemperature(String temperature) {
        this.temperature = temperature;
    }

    public String getWindy() {
        return windy;
    }

    public void setWindy(String windy) {
        this.windy = windy;
    }

    /**
     * Returns a readable dump of all four fields.
     *
     * <p>Without this override a list of entries printed from an enclosing object
     * would show only identity hashes such as {@code Weather@1b6d3586}.</p>
     *
     * @return text of the form {@code Weather{date='..', weather='..', temperature='..', windy='..'}}
     */
    @Override
    public String toString() {
        return "Weather{" +
                "date='" + date + '\'' +
                ", weather='" + weather + '\'' +
                ", temperature='" + temperature + '\'' +
                ", windy='" + windy + '\'' +
                '}';
    }
}

接下来开始我们的数据抓取工作。

国家信息抓取
1、首先我们访问http://www.weather.com.cn/forecast/world.shtml，然后按F12打开控制台，然后刷新一下页面，可以看到有个world.shtml，然后我们找到User-Agent，当我们的http客户端添加了这个header之后，就可以伪装成浏览器了。在这里插入图片描述

然后我们来实现一下我们http客户端的代码

package net.csdn.document;

import com.alibaba.fastjson.JSONObject;
import net.csdn.document.module.City;
import net.csdn.document.module.Country;
import net.csdn.document.module.Weather;
import net.csdn.document.parser.impl.CityDocumentParser;
import net.csdn.document.parser.impl.CountryDocumentParser;
import net.csdn.document.parser.impl.WeatherDocumentParser;
import org.jetbrains.annotations.NotNull;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;

/**
 * Weather spider: collects every country, the cities of each country and the
 * forecast of each city, then writes the resulting tree to a JSON file.
 *
 * <p>Downloads and parsing run as tasks on a fixed pool of five threads; the
 * calling thread only submits tasks and waits for their results.</p>
 */
public class WeatherSpider {
    /** Maximum time, in seconds, to wait for one download-and-parse task. */
    private static final long TASK_TIMEOUT_SECONDS = 60;

    ExecutorService executorService = Executors.newFixedThreadPool(5);
    CityDocumentParser cityDocumentParser = new CityDocumentParser();
    CountryDocumentParser countryDocumentParser = new CountryDocumentParser();
    WeatherDocumentParser weatherDocumentParser = new WeatherDocumentParser();

    public static void main(String[] args) throws IOException {
        WeatherSpider weatherSpider = new WeatherSpider();
        weatherSpider.start(new File("weather.json"));
    }

    /**
     * Runs the whole crawl and writes the result to {@code destFile}.
     *
     * <p>The destination is opened only after the crawl has finished, so a failed
     * crawl does not truncate an existing file. The thread pool is shut down on
     * every exit path; otherwise its worker threads would keep the JVM alive.</p>
     *
     * @param destFile {@link File} the JSON result is written to (overwritten, not appended)
     * @throws IOException if the destination file cannot be opened or written
     */
    private void start(File destFile) throws IOException {
        // Remember when we started so each phase can report its elapsed time.
        long start = System.currentTimeMillis();
        try {
            // Pending city downloads, keyed by the country they belong to.
            Map<Country, Future<List<City>>> cityMap = new LinkedHashMap<>();
            List<Country> countryList = getCountries(cityMap);
            System.out.println("爬取所有国家耗时" + (System.currentTimeMillis() - start) + " 国家数量" + countryList.size());
            // Pending weather downloads, keyed by the city they belong to.
            Map<City, Future<List<Weather>>> weatherMap = new LinkedHashMap<>();
            getCities(cityMap, weatherMap);
            System.out.println("城市总数" + weatherMap.size());
            System.out.println("爬取所有城市耗时" + (System.currentTimeMillis() - start));
            getWeathers(weatherMap);
            System.out.println("爬取所有天气耗时" + (System.currentTimeMillis() - start));

            // Encode explicitly as UTF-8: the data contains Chinese text and the
            // platform default charset is not guaranteed to represent it.
            byte[] json = JSONObject.toJSONString(countryList)
                    .getBytes(java.nio.charset.StandardCharsets.UTF_8);
            try (FileOutputStream fileOutputStream = new FileOutputStream(destFile, false)) {
                fileOutputStream.write(json);
            }
        } finally {
            executorService.shutdown();
        }
        System.out.println("总耗时：" + (System.currentTimeMillis() - start));
    }

    /**
     * Waits for every pending weather task and attaches its result to the city.
     * A city whose task failed or timed out keeps its previous weather list.
     *
     * @param weatherMap mapping from city to the pending forecast of that city
     */
    private void getWeathers(Map<City, Future<List<Weather>>> weatherMap) {
        weatherMap.forEach((city, future) -> {
            List<Weather> weatherList = awaitResult(future);
            if (weatherList != null) {
                city.setWeatherList(weatherList);
            }
        });
    }

    /**
     * Waits for every pending city task, attaches the cities to their country and
     * submits one weather task per city.
     *
     * @param cityMap    mapping from country to the pending city list of that country
     * @param weatherMap receives one pending weather task per city
     */
    private void getCities(Map<Country, Future<List<City>>> cityMap, Map<City, Future<List<Weather>>> weatherMap) {
        cityMap.forEach((country, future) -> {
            List<City> cities = awaitResult(future);
            if (cities == null) {
                // Download or parsing failed; the failure has already been reported.
                return;
            }
            country.setCityList(cities);
            for (City city : cities) {
                // The forecast page URL of a city is derived from the city id.
                String weatherUrl = String.format("http://www.weather.com.cn/weather/%s.shtml", city.getId());
                // NOTE(review): the map is keyed by City.equals(); two equal City objects
                // listed under different countries would share a single entry. Verify
                // whether the upstream data can contain such duplicates.
                weatherMap.put(city, executorService.submit(
                        () -> weatherDocumentParser.parseDocument(fetchHtmlContent(weatherUrl))));
            }
        });
    }

    /**
     * Downloads the country index and submits one city-listing task per country.
     *
     * @param cityMap receives one pending city task per country
     * @return the parsed countries
     */
    @NotNull
    private List<Country> getCountries(Map<Country, Future<List<City>>> cityMap) {
        String content = fetchHtmlContent("http://www.weather.com.cn/forecast/world.shtml");
        List<Country> countryList = countryDocumentParser.parseDocument(content);
        for (Country country : countryList) {
            // The listing URL carries the country id plus the current time as a query value.
            String url = String.format("http://d1.weather.com.cn/gw/gj%s.html?_=%d",
                    country.getId(), System.currentTimeMillis());
            Future<List<City>> future = executorService.submit(() -> cityDocumentParser.parseDocument(fetchHtmlContent(url)));
            cityMap.put(country, future);
        }
        return countryList;
    }

    /**
     * Waits for one task, reporting any failure on standard error.
     *
     * @param future the pending task
     * @param <T>    result type of the task
     * @return the task result, or {@code null} if the task failed, timed out or the wait was interrupted
     */
    private <T> T awaitResult(Future<T> future) {
        try {
            return future.get(TASK_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still see the interruption.
            Thread.currentThread().interrupt();
            future.cancel(true);
            e.printStackTrace();
        } catch (TimeoutException e) {
            // The result would be discarded anyway; stop the task instead of letting it occupy a worker.
            future.cancel(true);
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads the content behind {@code url}.
     *
     * <p>Failures are raised instead of being turned into {@code null}, because the
     * parsers that consume the content cannot work with a missing document.</p>
     *
     * @param url address to download
     * @return the downloaded text, never {@code null}
     * @throws java.io.UncheckedIOException if the download fails or yields no content
     */
    private String fetchHtmlContent(String url) {
        String content;
        try {
            content = MyHttpClient.fetchHtmlSync(url);
        } catch (IOException ioException) {
            throw new java.io.UncheckedIOException("Failed to fetch " + url, ioException);
        }
        if (content == null) {
            throw new java.io.UncheckedIOException(new IOException("Empty response from " + url));
        }
        return content;
    }

}

然后我们执行一下main方法，就可以在控制台中看到网页的内容了。
在这里插入图片描述
检查User-Agent是一种最简单的反爬虫机制，而通过设定Request Headers中的User-Agent，可以突破这种机制。

2、请求到数据之后，我们开始解析并提取需要的数据，这里我们使用Jsoup来解析提取数据。在写代码之前，我们先想一下：我们有多个网页需要解析，而每个网页需要解析的内容都不一样，因此我们可以定义一个文档解析的接口，然后不同的页面使用不同的解析器。

package net.csdn.document.parser;
/**
 * <p>Document parser: turns the text of one fetched document into a typed result.</p>
 *
 * @param <T> type of the data object produced by the parser
 */
public interface DocumentParser<T> {
    /**
     * Parses the document content.
     *
     * @param document the web document as a string
     * @return T the data object obtained from parsing
     */
    T parseDocument(String document);
}

首先观察一下国家信息的网页
在这里插入图片描述

接下来我们定义一个解析国家数据的解析器

package net.csdn.document.parser.impl;

import net.csdn.document.MyHttpClient;
import net.csdn.document.module.Country;
import net.csdn.document.parser.DocumentParser;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses the world forecast index page into a list of countries.
 */
public class CountryDocumentParser implements DocumentParser<List<Country>> {
    /**
     * Manual smoke check: downloads the index page, then prints the number of
     * parsed countries followed by every country.
     */
    public static void main(String[] args) throws IOException {
        CountryDocumentParser countryDocumentParser = new CountryDocumentParser();
        String s = MyHttpClient.fetchHtmlSync("http://www.weather.com.cn/forecast/world.shtml");
        List<Country> countryList = countryDocumentParser.parseDocument(s);
        System.out.println(countryList.size());
        for (Country country : countryList) {
            System.out.println(country);
        }
    }

    /**
     * Extracts one {@link Country} per element carrying a {@code data-id} attribute
     * inside the elements of class {@code guojia}.
     *
     * <p>The first {@code span} below such an element supplies the Chinese name and
     * the second the English name. Elements with fewer than two spans are skipped.</p>
     *
     * @param doc HTML of the index page; {@code null} yields an empty list
     * @return the parsed countries in document order, never {@code null}
     */
    @Override
    public List<Country> parseDocument(String doc) {
        List<Country> locationList = new ArrayList<>();
        if (doc == null) {
            return locationList;
        }
        Document document = Jsoup.parse(doc);
        // Works like a DOM selector: locate the country containers, then walk down to the data.
        Elements countryElements = document.getElementsByClass("guojia");
        countryElements.forEach(container ->
                // The value of the data-id attribute is the country id.
                container.getElementsByAttribute("data-id").forEach(node -> {
                    Elements spans = node.getElementsByTag("span");
                    if (spans.size() < 2) {
                        // Without both name spans the entry cannot be populated.
                        return;
                    }
                    Country location = new Country();
                    location.setId(node.attr("data-id"));
                    location.setName(spans.get(0).text());
                    location.setEn(spans.get(1).text());
                    locationList.add(location);
                }));

        return locationList;
    }
}

然后我们运行一下main方法，如果打印出来的结果是203，则说明我们解析的数据是正确的。

城市信息抓取
首先我们点击国家名称
在这里插入图片描述
然后我们可以在浏览器的控制台中看到该请求

接下来我们只需要把返回的这段内容，解析成我们的City对象就行了，这里代码的解析思路为，将该返回结果变成一个格式规范的JSON字符串，然后将其反序列化成城市列表。

代码如下

package net.csdn.document.parser.impl;

import com.alibaba.fastjson.JSON;
import net.csdn.document.MyHttpClient;
import net.csdn.document.module.City;
import net.csdn.document.parser.DocumentParser;

import javax.crypto.Cipher;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses the city listing of one country into a list of cities.
 */
public class CityDocumentParser implements DocumentParser<List<City>> {
    /** Manual smoke check: downloads one country's listing and prints every parsed city. */
    public static void main(String[] args) throws IOException {
        CityDocumentParser cityDocumentParser = new CityDocumentParser();
        String s = MyHttpClient.fetchHtmlSync("http://d1.weather.com.cn/gw/gj13.html?_=1647495151785");
        List<City> cities = cityDocumentParser.parseDocument(s);
        for (City city : cities) {
            System.out.println(city);
        }
    }

    /**
     * Rewrites the listing text into a JSON array and deserializes it.
     *
     * <p>NOTE(review): the rewrite is a chain of regular-expression replacements whose
     * correctness depends on the exact layout of the upstream payload, which is not
     * visible here; it is kept unchanged. Confirm against a real response before
     * touching it.</p>
     *
     * @param document raw listing text; {@code null} yields an empty list
     * @return the parsed cities, or an empty list when the text cannot be parsed; never {@code null}
     */
    @Override
    public List<City> parseDocument(String document) {
        if (document == null) {
            return new ArrayList<>();
        }
        // On every line, drop everything up to and including the last '=';
        // the intent is to remove the leading "var gjxx=".
        String s1 = document.replaceAll(".*=", "")
                // Remove the first "sd": key.
                .replaceFirst("\"sd\":", "")
                // Remove text ending in :[ such as "A":[
                .replaceAll(".+:\\[", "")
                // Remove every ]
                .replaceAll("]", "")
                // Turn the first { into [
                .replaceFirst("\\{", "[")
                // Turn the trailing } into ]
                .replaceAll("}\\s*$", "]");
        try {
            List<City> locations = JSON.parseArray(s1, City.class);
            // Guard against a null result so callers can always iterate the list.
            if (locations != null) {
                return locations;
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("错误json为：" + document);
        }
        return new ArrayList<>();
    }
}

同样我们运行一下main方法，如果结果是7，则说明方法没有问题。

天气信息抓取
首先我们点击城市名称
在这里插入图片描述
然后我们点击7天，就可以在浏览器中看到获取未来7天天气的请求地址。

然后我们编写代码来解析该网页的内容，观察下面的网页内容，我们可以很容易找到天气相关的信息。
在这里插入图片描述
代码如下

package net.csdn.document.parser.impl;

import net.csdn.document.MyHttpClient;
import net.csdn.document.module.Weather;
import net.csdn.document.parser.DocumentParser;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses a city forecast page into a list of forecast entries.
 */
public class WeatherDocumentParser implements DocumentParser<List<Weather>> {
    /** Manual smoke check: downloads one forecast page and prints the number of parsed entries. */
    public static void main(String[] args) throws IOException {
        WeatherDocumentParser weatherDocumentParser = new WeatherDocumentParser();
        String s = MyHttpClient.fetchHtmlSync("http://www.weather.com.cn/weather/155260100.shtml");
        List<Weather> weatherList = weatherDocumentParser.parseDocument(s);
        System.out.println(weatherList.size());
    }

    /**
     * Extracts one {@link Weather} per {@code li} element below the element with id
     * {@code 7d}, skipping list items that have no {@code h1} text.
     *
     * @param doc HTML of the forecast page; {@code null} yields an empty list
     * @return the parsed entries in document order; empty when the page has no
     *         element with id {@code 7d}; never {@code null}
     */
    @Override
    public List<Weather> parseDocument(String doc) {
        List<Weather> weathers = new ArrayList<>();
        if (doc == null) {
            return weathers;
        }
        Document document = Jsoup.parse(doc);
        // Container element of the seven-day forecast.
        Element root = document.getElementById("7d");
        if (root == null) {
            // Not a forecast page (or its layout changed): nothing to extract.
            return weathers;
        }
        for (Element item : root.getElementsByTag("li")) {
            String h1 = item.getElementsByTag("h1").text();
            if (StringUtil.isBlank(h1)) {
                // List items without a heading are not forecast entries.
                continue;
            }
            Weather w = new Weather();
            w.setDate(h1);
            w.setWeather(item.getElementsByClass("wea").text());
            w.setTemperature(item.getElementsByClass("tem").text());
            w.setWindy(item.getElementsByClass("win").text());
            weathers.add(w);
        }
        return weathers;
    }
}

同样我们运行一下代码，如果控制台结果为7，则说明这个天气解析没有问题。

完整抓取流程
单个的解析我们都实现之后，接下来我们创建一个WeatherSpider类，将任务完整串起来。

package net.csdn.document;

import com.alibaba.fastjson.JSONObject;
import net.csdn.document.module.City;
import net.csdn.document.module.Country;
import net.csdn.document.module.Weather;
import net.csdn.document.parser.impl.CityDocumentParser;
import net.csdn.document.parser.impl.CountryDocumentParser;
import net.csdn.document.parser.impl.WeatherDocumentParser;
import org.jetbrains.annotations.NotNull;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;

/**
 * Weather spider: collects every country, the cities of each country and the
 * forecast of each city, then writes the resulting tree to a JSON file.
 *
 * <p>Downloads and parsing run as tasks on a fixed pool of five threads; the
 * calling thread only submits tasks and waits for their results.</p>
 */
public class WeatherSpider {
    /** Maximum time, in seconds, to wait for one download-and-parse task. */
    private static final long TASK_TIMEOUT_SECONDS = 60;

    ExecutorService executorService = Executors.newFixedThreadPool(5);
    CityDocumentParser cityDocumentParser = new CityDocumentParser();
    CountryDocumentParser countryDocumentParser = new CountryDocumentParser();
    WeatherDocumentParser weatherDocumentParser = new WeatherDocumentParser();

    public static void main(String[] args) throws IOException {
        WeatherSpider weatherSpider = new WeatherSpider();
        weatherSpider.start(new File("weather.json"));
    }

    /**
     * Runs the whole crawl and writes the result to {@code destFile}.
     *
     * <p>The destination is opened only after the crawl has finished, so a failed
     * crawl does not truncate an existing file. The thread pool is shut down on
     * every exit path; otherwise its worker threads would keep the JVM alive.</p>
     *
     * @param destFile {@link File} the JSON result is written to (overwritten, not appended)
     * @throws IOException if the destination file cannot be opened or written
     */
    private void start(File destFile) throws IOException {
        // Remember when we started so each phase can report its elapsed time.
        long start = System.currentTimeMillis();
        try {
            // Pending city downloads, keyed by the country they belong to.
            Map<Country, Future<List<City>>> cityMap = new LinkedHashMap<>();
            List<Country> countryList = getCountries(cityMap);
            System.out.println("爬取所有国家耗时" + (System.currentTimeMillis() - start) + " 国家数量" + countryList.size());
            // Pending weather downloads, keyed by the city they belong to.
            Map<City, Future<List<Weather>>> weatherMap = new LinkedHashMap<>();
            getCities(cityMap, weatherMap);
            System.out.println("城市总数" + weatherMap.size());
            System.out.println("爬取所有城市耗时" + (System.currentTimeMillis() - start));
            getWeathers(weatherMap);
            System.out.println("爬取所有天气耗时" + (System.currentTimeMillis() - start));

            // Encode explicitly as UTF-8: the data contains Chinese text and the
            // platform default charset is not guaranteed to represent it.
            byte[] json = JSONObject.toJSONString(countryList)
                    .getBytes(java.nio.charset.StandardCharsets.UTF_8);
            try (FileOutputStream fileOutputStream = new FileOutputStream(destFile, false)) {
                fileOutputStream.write(json);
            }
        } finally {
            executorService.shutdown();
        }
        System.out.println("总耗时：" + (System.currentTimeMillis() - start));
    }

    /**
     * Waits for every pending weather task and attaches its result to the city.
     * A city whose task failed or timed out keeps its previous weather list.
     *
     * @param weatherMap mapping from city to the pending forecast of that city
     */
    private void getWeathers(Map<City, Future<List<Weather>>> weatherMap) {
        weatherMap.forEach((city, future) -> {
            List<Weather> weatherList = awaitResult(future);
            if (weatherList != null) {
                city.setWeatherList(weatherList);
            }
        });
    }

    /**
     * Waits for every pending city task, attaches the cities to their country and
     * submits one weather task per city.
     *
     * @param cityMap    mapping from country to the pending city list of that country
     * @param weatherMap receives one pending weather task per city
     */
    private void getCities(Map<Country, Future<List<City>>> cityMap, Map<City, Future<List<Weather>>> weatherMap) {
        cityMap.forEach((country, future) -> {
            List<City> cities = awaitResult(future);
            if (cities == null) {
                // Download or parsing failed; the failure has already been reported.
                return;
            }
            country.setCityList(cities);
            for (City city : cities) {
                // The forecast page URL of a city is derived from the city id.
                String weatherUrl = String.format("http://www.weather.com.cn/weather/%s.shtml", city.getId());
                // NOTE(review): the map is keyed by City.equals(); two equal City objects
                // listed under different countries would share a single entry. Verify
                // whether the upstream data can contain such duplicates.
                weatherMap.put(city, executorService.submit(
                        () -> weatherDocumentParser.parseDocument(fetchHtmlContent(weatherUrl))));
            }
        });
    }

    /**
     * Downloads the country index and submits one city-listing task per country.
     *
     * @param cityMap receives one pending city task per country
     * @return the parsed countries
     */
    @NotNull
    private List<Country> getCountries(Map<Country, Future<List<City>>> cityMap) {
        String content = fetchHtmlContent("http://www.weather.com.cn/forecast/world.shtml");
        List<Country> countryList = countryDocumentParser.parseDocument(content);
        for (Country country : countryList) {
            // The listing URL carries the country id plus the current time as a query value.
            String url = String.format("http://d1.weather.com.cn/gw/gj%s.html?_=%d",
                    country.getId(), System.currentTimeMillis());
            Future<List<City>> future = executorService.submit(() -> cityDocumentParser.parseDocument(fetchHtmlContent(url)));
            cityMap.put(country, future);
        }
        return countryList;
    }

    /**
     * Waits for one task, reporting any failure on standard error.
     *
     * @param future the pending task
     * @param <T>    result type of the task
     * @return the task result, or {@code null} if the task failed, timed out or the wait was interrupted
     */
    private <T> T awaitResult(Future<T> future) {
        try {
            return future.get(TASK_TIMEOUT_SECONDS, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still see the interruption.
            Thread.currentThread().interrupt();
            future.cancel(true);
            e.printStackTrace();
        } catch (TimeoutException e) {
            // The result would be discarded anyway; stop the task instead of letting it occupy a worker.
            future.cancel(true);
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads the content behind {@code url}.
     *
     * <p>Failures are raised instead of being turned into {@code null}, because the
     * parsers that consume the content cannot work with a missing document.</p>
     *
     * @param url address to download
     * @return the downloaded text, never {@code null}
     * @throws java.io.UncheckedIOException if the download fails or yields no content
     */
    private String fetchHtmlContent(String url) {
        String content;
        try {
            content = MyHttpClient.fetchHtmlSync(url);
        } catch (IOException ioException) {
            throw new java.io.UncheckedIOException("Failed to fetch " + url, ioException);
        }
        if (content == null) {
            throw new java.io.UncheckedIOException(new IOException("Empty response from " + url));
        }
        return content;
    }

}