Python爬虫，数据可视化之matplotlib初步--制作一个高楼高度的条形统计图全思路-CFANZ编程社区

实现步骤：

1. 通过爬虫，爬取高楼的信息

2. 筛选我们需要的信息

3. 制作一个表格

4. 解析表格，制作统计图

1 -- 爬虫

首先观察目标网页

发现所有有用的信息都在<p>标签中，这就很好办了，使用BeautifulSoup直接把信息提取出来就好。

import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Module-level placeholder. spider() assigns to its own local `response`
# (it declares no `global response`), so this value is never updated by it.
response = None
# Text of the scraped <p> elements; filled by spider() and read by make_form().
info_list = []


# Scraper: download the page and pick out its <p> elements
def spider(url):
    """Fetch ``url`` and locate every ``<p>`` tag in the returned HTML.

    At this stage of the walkthrough the matched tags are only bound to a
    local name; nothing is stored yet.

    Args:
        url: Address of the page to download.

    Returns:
        None. If the request fails, the error is printed and the function
        returns early.
    """
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
    except requests.RequestException as e:
        # Without a response there is nothing to parse; falling through would
        # touch an unbound local `response`.
        print(e)
        return

    bs = BeautifulSoup(response.text, 'html.parser')
    content = bs.find_all('p')  # the matched tags still carry their markup

但是我们发现这样操作过后有标签残留，所以说我们使用get_text()方法，提取文本，并加入一个列表里。由于前三个是垃圾信息，所以说删除了。

import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Module-level placeholder. spider() assigns to its own local `response`
# (it declares no `global response`), so this value is never updated by it.
response = None
# Text of the scraped <p> elements; filled by spider() and read by make_form().
info_list = []


# Scraper: download the page and collect the text of its <p> elements
def spider(url):
    """Fetch ``url`` and append the text of its ``<p>`` tags to ``info_list``.

    The first three paragraphs of each fetched page are skipped.

    Args:
        url: Address of the page to download.

    Returns:
        None. If the request fails, the error is printed and ``info_list`` is
        left untouched.
    """
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
    except requests.RequestException as e:
        # Without a response there is nothing to parse; falling through would
        # touch an unbound local `response`.
        print(e)
        return

    bs = BeautifulSoup(response.text, 'html.parser')
    # get_text() works on a single tag, so it is applied to each match in turn.
    texts = [tag.get_text() for tag in bs.find_all('p')]
    # Skip this page's first three paragraphs before storing. Deleting
    # info_list[0:3] after appending would remove real entries on a second call.
    info_list.extend(texts[3:])

2 -- 筛选信息

我们提取出来后列表里的样子是这样的：

可是我们只需要名字和高度，所以要将其他的信息全部删除。

我们可以观察到，第0项是名字，第5项也是名字，也就是说，我们需要的同类信息，索引之差都是5。所以只要用for循环加上步长，使i的值依次为0，5，10……这样5个5个地增加，就可以以i为索引，提取我们想要的信息了。

def make_form():
    """Print every fifth entry of ``info_list``, beginning with the first."""
    name_list = []
    height_list = []
    # A slice with step 5 visits indexes 0, 5, 10, ...
    for entry in info_list[::5]:
        print(entry)

但是我们做统计图，不想要前面的世界第啥啥啥，但是这又是一个字符串，如何批量地编辑呢？

这里，我把字符串拆成了列表，使用del把前面的元素删掉。由于从第十一个开始，需要删除的元素变成了8个，只删7个就会剩下一个冒号，所以我添加了条件判断，如果开头是冒号就把它也删掉。注意这里的冒号是中文的冒号。

def make_form():
    """Collect the building names held in ``info_list``.

    Names sit at every fifth position of ``info_list``, starting at index 0.
    Each one has its seven leading characters removed, plus a leading
    full-width colon if one is left after that cut.

    Returns:
        None. At this stage the names are only gathered in a local list.
    """
    name_list = []
    height_list = []
    # A slice with step 5 visits indexes 0, 5, 10, ...
    for entry in info_list[::5]:
        # Slicing never raises, even when the entry is shorter than the prefix.
        name = entry[7:]
        # Some entries still begin with a full-width colon after the cut.
        if name.startswith('：'):
            name = name[1:]
        name_list.append(name)

结果就很符合我的要求了。

接下来同理，我们搞一下高度。

    # Heights sit at indexes 2, 7, 12, ... of info_list.
    for entry in info_list[2::5]:
        # Cut the seven leading characters and the single trailing character.
        # Slicing copes with entries too short to hold both.
        height_list.append(entry[7:-1])

3 -- 制作表格

利用with语句把之前我们弄的数据给填上去

# Build the table
def make_form():
    """Write the names and heights found in ``info_list`` to ``building.csv``.

    ``info_list`` is read in strides of five: indexes 0, 5, 10, ... hold the
    names and indexes 2, 7, 12, ... hold the heights. Names lose their seven
    leading characters (plus a leading full-width colon if one remains);
    heights lose their seven leading characters and their last character.

    Returns:
        None. The result is the file ``building.csv`` in the working
        directory, with the header row ``名称,高度``.
    """
    import csv

    name_list = []
    for entry in info_list[::5]:
        # Slicing never raises, even when the entry is shorter than the prefix.
        name = entry[7:]
        # Some entries still begin with a full-width colon after the cut.
        if name.startswith('：'):
            name = name[1:]
        name_list.append(name)

    height_list = [entry[7:-1] for entry in info_list[2::5]]

    # Write UTF-8 explicitly so the file does not depend on the platform's
    # default encoding. newline='' leaves line endings to the csv writer.
    with open('building.csv', 'w', encoding='utf-8', newline='') as f:
        # csv.writer quotes any field that contains a comma.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['名称', '高度'])
        # zip() stops at the shorter list, so only complete pairs are written.
        writer.writerows(zip(name_list, height_list))

4 -- 使用matplotlib.pyplot来制作统计图，用pandas解析csv

首先import一下需要的库

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

我们用pandas解析csv，可以很方便的使用表格里的数据，其中，rcParams是为了引入字体，让中文字不是方框的，不然只能显示英文。我是Mac OS所以用这句。

# Draw the bar chart
def make_photo():
    """Plot the heights stored in ``building.csv`` as a bar chart and show it."""
    # Select the TkAgg backend for the figure window.
    matplotlib.use('TkAgg')
    # Parse the CSV file; pandas puts a 0-based index in front of the rows.
    table = pd.read_csv('building.csv')
    print(table)
    # Pick a font with Chinese glyphs so the labels are not drawn as boxes.
    plt.rcParams['font.family'] = ['Hiragino Sans GB']
    # Indexing the table by header name yields that column.
    names = table['名称']
    heights = table['高度']
    plt.bar(names, heights)
    # Title and axis labels.
    plt.title('世界高楼排名')
    plt.xlabel('名称')
    plt.ylabel('单位：米')
    # Open the figure window.
    plt.show()

Windows系统下把rcParams那一行改成这句：

# Windows variant of the font setting; the key is 'font.family', with no space.
plt.rcParams['font.family'] = ['SimHei']

接下来大功告成，完整代码：

import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Module-level placeholder. spider() assigns to its own local `response`
# (it declares no `global response`), so this value is never updated by it.
response = None
# Text of the scraped <p> elements; filled by spider() and read by make_form().
info_list = []


# Scraper: download the page and collect the text of its <p> elements
def spider(url):
    """Fetch ``url`` and append the text of its ``<p>`` tags to ``info_list``.

    The first three paragraphs of each fetched page are skipped. The contents
    of ``info_list`` are printed once the new entries have been added.

    Args:
        url: Address of the page to download.

    Returns:
        None. If the request fails, the error is printed and ``info_list`` is
        left untouched.
    """
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
    except requests.RequestException as e:
        # Without a response there is nothing to parse; falling through would
        # touch an unbound local `response`.
        print(e)
        return

    bs = BeautifulSoup(response.text, 'html.parser')
    # get_text() works on a single tag, so it is applied to each match in turn.
    texts = [tag.get_text() for tag in bs.find_all('p')]
    # Skip this page's first three paragraphs before storing. Deleting
    # info_list[0:3] after appending would remove real entries on a second call.
    info_list.extend(texts[3:])
    print(info_list)


# Build the table
def make_form():
    """Write the names and heights found in ``info_list`` to ``building.csv``.

    ``info_list`` is read in strides of five: indexes 0, 5, 10, ... hold the
    names and indexes 2, 7, 12, ... hold the heights. Names lose their seven
    leading characters (plus a leading full-width colon if one remains);
    heights lose their seven leading characters and their last character.

    Returns:
        None. The result is the file ``building.csv`` in the working
        directory, with the header row ``名称,高度``.
    """
    import csv

    name_list = []
    for entry in info_list[::5]:
        # Slicing never raises, even when the entry is shorter than the prefix.
        name = entry[7:]
        # Some entries still begin with a full-width colon after the cut.
        if name.startswith('：'):
            name = name[1:]
        name_list.append(name)

    height_list = [entry[7:-1] for entry in info_list[2::5]]

    # Write UTF-8 explicitly so the file does not depend on the platform's
    # default encoding. newline='' leaves line endings to the csv writer.
    with open('building.csv', 'w', encoding='utf-8', newline='') as f:
        # csv.writer quotes any field that contains a comma.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['名称', '高度'])
        # zip() stops at the shorter list, so only complete pairs are written.
        writer.writerows(zip(name_list, height_list))


# Draw the bar chart
def make_photo():
    """Plot the heights stored in ``building.csv`` as a bar chart and show it."""
    # Select the TkAgg backend for the figure window.
    matplotlib.use('TkAgg')
    # Parse the CSV file; pandas puts a 0-based index in front of the rows.
    table = pd.read_csv('building.csv')
    print(table)
    # Pick a font with Chinese glyphs so the labels are not drawn as boxes.
    plt.rcParams['font.family'] = ['Hiragino Sans GB']
    # Indexing the table by header name yields that column.
    names = table['名称']
    heights = table['高度']
    plt.bar(names, heights)
    # Title and axis labels.
    plt.title('世界高楼排名')
    plt.xlabel('名称')
    plt.ylabel('单位：米')
    # Open the figure window.
    plt.show()


# Run the pipeline: scrape the page, write building.csv, then draw the chart.
spider('https://jingyan.baidu.com/article/cbf0e500b24b112eab289379.html')
make_form()
make_photo()

效果：