删除数据
- drop(labels,axis=0,index,columns) 删除在axis轴上的指定labels索引标签的数据
- 删除多行或多列
- dropna(axis=0,how='any'|'all') 删除存在NaN值的行或列(axis默认为0,即删除行)
- drop_duplicates(keep='first'|'last') 删除重复行的数据,keep指定保留重复行中第一次出现的行还是最后一次出现的行
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
# 创建10位同学的五门课程成绩
df = DataFrame(np.random.randint(1,151,size=(10,5)),
columns=['MySQL','PostgreSQL','Oracle','MongoDB','SQLite'])
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
1 | 53 | 12 | 137 | 45 | 14 |
2 | 61 | 59 | 132 | 74 | 103 |
3 | 150 | 106 | 4 | 74 | 61 |
4 | 50 | 95 | 135 | 78 | 140 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
# 构造两行数据
df.loc[10] = df.loc[6]
df.loc[11] = df.loc[7]
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
1 | 53 | 12 | 137 | 45 | 14 |
2 | 61 | 59 | 132 | 74 | 103 |
3 | 150 | 106 | 4 | 74 | 61 |
4 | 50 | 95 | 135 | 78 | 140 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
10 | 125 | 133 | 138 | 70 | 135 |
11 | 46 | 39 | 71 | 50 | 117 |
# 删除"PostgreSQL","MongoDB"两列,同时删除3,4两行
# 只删除列
# axis : {0 or 'index', 1 or 'columns'}, default 0
df.drop(labels=['PostgreSQL','MongoDB'],axis='columns') # 也可以写成axis=1
MySQL | Oracle | SQLite | |
---|---|---|---|
0 | 136 | 108 | 132 |
1 | 53 | 137 | 14 |
2 | 61 | 132 | 103 |
3 | 150 | 4 | 61 |
4 | 50 | 135 | 140 |
5 | 70 | 62 | 128 |
6 | 125 | 138 | 135 |
7 | 46 | 71 | 117 |
8 | 50 | 59 | 130 |
9 | 122 | 29 | 16 |
10 | 125 | 138 | 135 |
11 | 46 | 71 | 117 |
# 只删除行
df.drop(labels=[3,4],axis=0)
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
1 | 53 | 12 | 137 | 45 | 14 |
2 | 61 | 59 | 132 | 74 | 103 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
10 | 125 | 133 | 138 | 70 | 135 |
11 | 46 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
1 | 53 | 12 | 137 | 45 | 14 |
2 | 61 | 59 | 132 | 74 | 103 |
3 | 150 | 106 | 4 | 74 | 61 |
4 | 50 | 95 | 135 | 78 | 140 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
10 | 125 | 133 | 138 | 70 | 135 |
11 | 46 | 39 | 71 | 50 | 117 |
# 同时删除行和同时删除列
df.drop(index=[3,4],columns=['PostgreSQL','MongoDB'],inplace=False) # inplace=False 表示不在原本中删除
MySQL | Oracle | SQLite | |
---|---|---|---|
0 | 136 | 108 | 132 |
1 | 53 | 137 | 14 |
2 | 61 | 132 | 103 |
5 | 70 | 62 | 128 |
6 | 125 | 138 | 135 |
7 | 46 | 71 | 117 |
8 | 50 | 59 | 130 |
9 | 122 | 29 | 16 |
10 | 125 | 138 | 135 |
11 | 46 | 71 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
1 | 53 | 12 | 137 | 45 | 14 |
2 | 61 | 59 | 132 | 74 | 103 |
3 | 150 | 106 | 4 | 74 | 61 |
4 | 50 | 95 | 135 | 78 | 140 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
10 | 125 | 133 | 138 | 70 | 135 |
11 | 46 | 39 | 71 | 50 | 117 |
# axis=0 默认删除行索引及数据
df.drop([1,3])
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
2 | 61 | 59 | 132 | 74 | 103 |
4 | 50 | 95 | 135 | 78 | 140 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
10 | 125 | 133 | 138 | 70 | 135 |
11 | 46 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136 | 28 | 108 | 18 | 132 |
1 | 53 | 12 | 137 | 45 | 14 |
2 | 61 | 59 | 132 | 74 | 103 |
3 | 150 | 106 | 4 | 74 | 61 |
4 | 50 | 95 | 135 | 78 | 140 |
5 | 70 | 112 | 62 | 62 | 128 |
6 | 125 | 133 | 138 | 70 | 135 |
7 | 46 | 39 | 71 | 50 | 117 |
8 | 50 | 119 | 59 | 89 | 130 |
9 | 122 | 93 | 29 | 56 | 16 |
10 | 125 | 133 | 138 | 70 | 135 |
11 | 46 | 39 | 71 | 50 | 117 |
# 将行标签为3的那一行中MySQL的值改为NaN
df.loc[3,'MySQL'] = np.nan
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 删除有NAN值的行
df.dropna(axis=0,how='any') # 当前行只要有一个NAN值都会删除
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.dropna(axis=0,how='all') # 当前行所有的值是NAN才会删除
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 删除重复行
df.drop_duplicates() # 默认保留第一次出现的行
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# Drop duplicate rows
df.drop_duplicates(keep="last") # keep="last" retains the last occurrence of each duplicated row (the default is keep="first")
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 查询重复行的,返回的是布尔值
df.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 True
11 True
dtype: bool
df[df.duplicated()]
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df[df.duplicated()].index
Int64Index([10, 11], dtype='int64')
# 删除10,11行
# 内容为bool值的Series,只能作为索引切片使用
df.drop(df[df.duplicated()].index)
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
数据映射
- rename() 针对索引标签的重命名
- replace() 针对数据,替换数据
- map() 针对数据,根据数据可以映射成新的数据
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 修改MySQL和PostgreSQL的列名分别为MySQL1m和PostgreSQL11m
"""
df.rename(
mapper=None, 其实是个字典
index=None, 替换行标签的内容
columns=None, 替换列标签的内容
axis=None, 0——行标签;1——列标签
copy=True,
inplace=False, 是否在原本上替换,False表示不在原本上替换
level=None, # 指定更改的索引层数
errors='ignore',
)
"""
# 修改列名时必须指定axis=1(或使用columns参数);不指定axis时默认按行标签匹配,列名不会被修改
df.rename({'MySQL':'MySQL1m','PostgreSQL':'PostgreSQL11m'},axis=1)
MySQL1m | PostgreSQL11m | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 把1映射为A,把0映射为B
df.rename({1:'A',0:'B'}) # 修改行时,不用加axis,默认的
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
B | 136.0 | 28 | 108 | 18 | 132 |
A | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.rename(columns={'MySQL':'MySQL1m','PostgreSQL':'PostgreSQL11m'})
MySQL1m | PostgreSQL11m | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# mapper\index\columns的类型都是字典类型 rename进行修改时,如果在原本里找不到要替换的值,不会报错,只是不修改
# 将14分的成绩修改为140
"""
df.replace(
to_replace=None, 去替换的 类型是可以是列表,可以是元组,也可以是字典
value=None, 值
inplace=False,
limit=None,
regex=False,
method='pad',
)
"""
df.replace(14,140)
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 140 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 将14的成绩,修改为140,将4分的成绩修改为180
df.replace((14,4),(140,180))
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 140 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 180 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.replace({14:140,4:180})
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 140 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 180 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# method : {'pad', 'ffill', 'bfill', `None`} 使用method参数时,value参数必须是None
# 将nan值使用列后的数值填充
df.replace(np.nan,method='bfill')
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | 50.0 | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# 将nan值使用列前的数值填充
df.replace(np.nan,method='ffill')
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | 61.0 | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.replace(np.nan,method='pad')
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | 61.0 | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.replace(1,method='ffill')
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.replace(1,method='bfill')
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
# map函数针对一个Series对象
df['DB'] = df['Oracle'].map({108:90,132:100})
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 | 90.0 |
1 | 53.0 | 12 | 137 | 45 | 14 | NaN |
2 | 61.0 | 59 | 132 | 74 | 103 | 100.0 |
3 | NaN | 106 | 4 | 74 | 61 | NaN |
4 | 50.0 | 95 | 135 | 78 | 140 | NaN |
5 | 70.0 | 112 | 62 | 62 | 128 | NaN |
6 | 125.0 | 133 | 138 | 70 | 135 | NaN |
7 | 46.0 | 39 | 71 | 50 | 117 | NaN |
8 | 50.0 | 119 | 59 | 89 | 130 | NaN |
9 | 122.0 | 93 | 29 | 56 | 16 | NaN |
10 | 125.0 | 133 | 138 | 70 | 135 | NaN |
11 | 46.0 | 39 | 71 | 50 | 117 | NaN |
# 根据Oracle的成绩,设置A,B,C三个等级:A级大于80,B级大于60且不超过80,C级小于等于60
df['DB'] =df['Oracle'].map(lambda item: 'A' if item>80 else 'B' if item > 60 else 'C')
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 | A |
1 | 53.0 | 12 | 137 | 45 | 14 | A |
2 | 61.0 | 59 | 132 | 74 | 103 | A |
3 | NaN | 106 | 4 | 74 | 61 | C |
4 | 50.0 | 95 | 135 | 78 | 140 | A |
5 | 70.0 | 112 | 62 | 62 | 128 | B |
6 | 125.0 | 133 | 138 | 70 | 135 | A |
7 | 46.0 | 39 | 71 | 50 | 117 | B |
8 | 50.0 | 119 | 59 | 89 | 130 | C |
9 | 122.0 | 93 | 29 | 56 | 16 | C |
10 | 125.0 | 133 | 138 | 70 | 135 | A |
11 | 46.0 | 39 | 71 | 50 | 117 | B |
数据统计信息
- describe()
df.describe() # 此时不包含NAN值
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
count | 11.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 |
mean | 80.363636 | 80.666667 | 90.333333 | 61.333333 | 102.333333 |
std | 37.749895 | 43.020784 | 47.206189 | 18.936897 | 45.976938 |
min | 46.000000 | 12.000000 | 4.000000 | 18.000000 | 14.000000 |
25% | 50.000000 | 39.000000 | 61.250000 | 50.000000 | 92.500000 |
50% | 61.000000 | 94.000000 | 89.500000 | 66.000000 | 122.500000 |
75% | 123.500000 | 113.750000 | 135.500000 | 74.000000 | 132.750000 |
max | 136.000000 | 133.000000 | 138.000000 | 89.000000 | 140.000000 |
# include=[int] restricts the summary to integer-dtype columns. MySQL became a
# float column once NaN was assigned to it, so it is left out of this result.
# np.int was only an alias of the builtin int and was removed in NumPy 1.24,
# so the builtin is used directly.
df.describe(include=[int])
PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|
count | 12.000000 | 12.000000 | 12.000000 | 12.000000 |
mean | 80.666667 | 90.333333 | 61.333333 | 102.333333 |
std | 43.020784 | 47.206189 | 18.936897 | 45.976938 |
min | 12.000000 | 4.000000 | 18.000000 | 14.000000 |
25% | 39.000000 | 61.250000 | 50.000000 | 92.500000 |
50% | 94.000000 | 89.500000 | 66.000000 | 122.500000 |
75% | 113.750000 | 135.500000 | 74.000000 | 132.750000 |
max | 133.000000 | 138.000000 | 89.000000 | 140.000000 |
df.std() # 默认axis=0
MySQL 37.749895
PostgreSQL 43.020784
Oracle 47.206189
MongoDB 18.936897
SQLite 45.976938
dtype: float64
df.std(axis=1)
0 57.173420
1 50.790747
2 31.236197
3 42.594014
4 38.187694
5 31.003226
6 28.472794
7 31.627520
8 35.359581
9 44.144082
10 28.472794
11 31.627520
dtype: float64
df.std(axis=1).sort_values()
6 28.472794
10 28.472794
5 31.003226
2 31.236197
7 31.627520
11 31.627520
8 35.359581
4 38.187694
3 42.594014
9 44.144082
1 50.790747
0 57.173420
dtype: float64
# 查看第0行索引
df.loc[0]
MySQL 136
PostgreSQL 28
Oracle 108
MongoDB 18
SQLite 132
DB A
Name: 0, dtype: object
# 查看第6行索引
df.loc[6]
MySQL 125
PostgreSQL 133
Oracle 138
MongoDB 70
SQLite 135
DB A
Name: 6, dtype: object
take排序索引标签
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 | A |
1 | 53.0 | 12 | 137 | 45 | 14 | A |
2 | 61.0 | 59 | 132 | 74 | 103 | A |
3 | NaN | 106 | 4 | 74 | 61 | C |
4 | 50.0 | 95 | 135 | 78 | 140 | A |
5 | 70.0 | 112 | 62 | 62 | 128 | B |
6 | 125.0 | 133 | 138 | 70 | 135 | A |
7 | 46.0 | 39 | 71 | 50 | 117 | B |
8 | 50.0 | 119 | 59 | 89 | 130 | C |
9 | 122.0 | 93 | 29 | 56 | 16 | C |
10 | 125.0 | 133 | 138 | 70 | 135 | A |
11 | 46.0 | 39 | 71 | 50 | 117 | B |
# 把"DB"列移动到"Oracle"前列
df.take([0,1,5,2,3,4],axis=1) # 方法一
MySQL | PostgreSQL | DB | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | A | 108 | 18 | 132 |
1 | 53.0 | 12 | A | 137 | 45 | 14 |
2 | 61.0 | 59 | A | 132 | 74 | 103 |
3 | NaN | 106 | C | 4 | 74 | 61 |
4 | 50.0 | 95 | A | 135 | 78 | 140 |
5 | 70.0 | 112 | B | 62 | 62 | 128 |
6 | 125.0 | 133 | A | 138 | 70 | 135 |
7 | 46.0 | 39 | B | 71 | 50 | 117 |
8 | 50.0 | 119 | C | 59 | 89 | 130 |
9 | 122.0 | 93 | C | 29 | 56 | 16 |
10 | 125.0 | 133 | A | 138 | 70 | 135 |
11 | 46.0 | 39 | B | 71 | 50 | 117 |
# 把"DB"列移动到"Oracle"前列
df.take([0,1,-1,2,3,4],axis=1) # 方法二
MySQL | PostgreSQL | DB | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | A | 108 | 18 | 132 |
1 | 53.0 | 12 | A | 137 | 45 | 14 |
2 | 61.0 | 59 | A | 132 | 74 | 103 |
3 | NaN | 106 | C | 4 | 74 | 61 |
4 | 50.0 | 95 | A | 135 | 78 | 140 |
5 | 70.0 | 112 | B | 62 | 62 | 128 |
6 | 125.0 | 133 | A | 138 | 70 | 135 |
7 | 46.0 | 39 | B | 71 | 50 | 117 |
8 | 50.0 | 119 | C | 59 | 89 | 130 |
9 | 122.0 | 93 | C | 29 | 56 | 16 |
10 | 125.0 | 133 | A | 138 | 70 | 135 |
11 | 46.0 | 39 | B | 71 | 50 | 117 |
df2 = df.take([0,1,-1,2,3,4],axis=1)
df2
MySQL | PostgreSQL | DB | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | A | 108 | 18 | 132 |
1 | 53.0 | 12 | A | 137 | 45 | 14 |
2 | 61.0 | 59 | A | 132 | 74 | 103 |
3 | NaN | 106 | C | 4 | 74 | 61 |
4 | 50.0 | 95 | A | 135 | 78 | 140 |
5 | 70.0 | 112 | B | 62 | 62 | 128 |
6 | 125.0 | 133 | A | 138 | 70 | 135 |
7 | 46.0 | 39 | B | 71 | 50 | 117 |
8 | 50.0 | 119 | C | 59 | 89 | 130 |
9 | 122.0 | 93 | C | 29 | 56 | 16 |
10 | 125.0 | 133 | A | 138 | 70 | 135 |
11 | 46.0 | 39 | B | 71 | 50 | 117 |
# 索引的随机排列
np.random.permutation(df.index)
array([ 5, 2, 7, 8, 1, 3, 6, 0, 4, 11, 10, 9], dtype=int64)
# 行索引标签随机排列
df.take(np.random.permutation(df.index)) # 默认axis=0
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
11 | 46.0 | 39 | 71 | 50 | 117 | B |
4 | 50.0 | 95 | 135 | 78 | 140 | A |
9 | 122.0 | 93 | 29 | 56 | 16 | C |
1 | 53.0 | 12 | 137 | 45 | 14 | A |
6 | 125.0 | 133 | 138 | 70 | 135 | A |
7 | 46.0 | 39 | 71 | 50 | 117 | B |
10 | 125.0 | 133 | 138 | 70 | 135 | A |
5 | 70.0 | 112 | 62 | 62 | 128 | B |
2 | 61.0 | 59 | 132 | 74 | 103 | A |
8 | 50.0 | 119 | 59 | 89 | 130 | C |
3 | NaN | 106 | 4 | 74 | 61 | C |
0 | 136.0 | 28 | 108 | 18 | 132 | A |
# 随机排列列索引标签
np.random.permutation(df.columns)
array(['Oracle', 'MongoDB', 'SQLite', 'DB', 'PostgreSQL', 'MySQL'],
dtype=object)
column_map = dict.fromkeys(df.columns,0)
column_map
{'MySQL': 0, 'PostgreSQL': 0, 'Oracle': 0, 'MongoDB': 0, 'SQLite': 0, 'DB': 0}
column_map = dict(enumerate(df.columns))
column_map
{0: 'MySQL', 1: 'PostgreSQL', 2: 'Oracle', 3: 'MongoDB', 4: 'SQLite', 5: 'DB'}
column_map = { v:k for k,v in dict(enumerate(df.columns)).items()}
column_map
{'MySQL': 0, 'PostgreSQL': 1, 'Oracle': 2, 'MongoDB': 3, 'SQLite': 4, 'DB': 5}
# 列的随机排列
random_columns = np.random.permutation(df.columns)
column_index = [column_map[k] for k in random_columns]
df.take(column_index,axis=1)
Oracle | PostgreSQL | MongoDB | SQLite | MySQL | DB | |
---|---|---|---|---|---|---|
0 | 108 | 28 | 18 | 132 | 136.0 | A |
1 | 137 | 12 | 45 | 14 | 53.0 | A |
2 | 132 | 59 | 74 | 103 | 61.0 | A |
3 | 4 | 106 | 74 | 61 | NaN | C |
4 | 135 | 95 | 78 | 140 | 50.0 | A |
5 | 62 | 112 | 62 | 128 | 70.0 | B |
6 | 138 | 133 | 70 | 135 | 125.0 | A |
7 | 71 | 39 | 50 | 117 | 46.0 | B |
8 | 59 | 119 | 89 | 130 | 50.0 | C |
9 | 29 | 93 | 56 | 16 | 122.0 | C |
10 | 138 | 133 | 70 | 135 | 125.0 | A |
11 | 71 | 39 | 50 | 117 | 46.0 | B |
df.take(column_index,axis=1)
Oracle | PostgreSQL | MongoDB | SQLite | MySQL | DB | |
---|---|---|---|---|---|---|
0 | 108 | 28 | 18 | 132 | 136.0 | A |
1 | 137 | 12 | 45 | 14 | 53.0 | A |
2 | 132 | 59 | 74 | 103 | 61.0 | A |
3 | 4 | 106 | 74 | 61 | NaN | C |
4 | 135 | 95 | 78 | 140 | 50.0 | A |
5 | 62 | 112 | 62 | 128 | 70.0 | B |
6 | 138 | 133 | 70 | 135 | 125.0 | A |
7 | 71 | 39 | 50 | 117 | 46.0 | B |
8 | 59 | 119 | 89 | 130 | 50.0 | C |
9 | 29 | 93 | 56 | 16 | 122.0 | C |
10 | 138 | 133 | 70 | 135 | 125.0 | A |
11 | 71 | 39 | 50 | 117 | 46.0 | B |
数据分类处理
- groupby([‘列名’,…])返回DataFrameGroupBy
- 分组之后,可以针对某一数值列进行聚合操作(sum,mean,max,min,std等)
- 可以自定义聚合函数,使用transform或apply
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 | A |
1 | 53.0 | 12 | 137 | 45 | 14 | A |
2 | 61.0 | 59 | 132 | 74 | 103 | A |
3 | NaN | 106 | 4 | 74 | 61 | C |
4 | 50.0 | 95 | 135 | 78 | 140 | A |
5 | 70.0 | 112 | 62 | 62 | 128 | B |
6 | 125.0 | 133 | 138 | 70 | 135 | A |
7 | 46.0 | 39 | 71 | 50 | 117 | B |
8 | 50.0 | 119 | 59 | 89 | 130 | C |
9 | 122.0 | 93 | 29 | 56 | 16 | C |
10 | 125.0 | 133 | 138 | 70 | 135 | A |
11 | 46.0 | 39 | 71 | 50 | 117 | B |
df.groupby('DB')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x08A144D0>
df.groupby('DB')['SQLite'].sum() # 对SQLite这一列求和
DB
A 659
B 362
C 207
Name: SQLite, dtype: int32
df.groupby('DB')['SQLite'].mean() # 对SQLite这一列求平均值
DB
A 109.833333
B 120.666667
C 69.000000
Name: SQLite, dtype: float64
df.groupby('DB')['SQLite'].count() # 对SQLite这一列求出现的次数
DB
A 6
B 3
C 3
Name: SQLite, dtype: int64
# Aggregate two columns at once. Several columns must be selected with a list
# (double brackets); the bare tuple form ['SQLite','PostgreSQL'] was removed
# in pandas 2.0.
df.groupby('DB')[['SQLite','PostgreSQL']].mean()
SQLite | PostgreSQL | |
---|---|---|
DB | ||
A | 109.833333 | 76.666667 |
B | 120.666667 | 63.333333 |
C | 69.000000 | 106.000000 |
# 针对所有进行聚合——求平均值
df.groupby('DB').mean()
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
DB | |||||
A | 91.666667 | 76.666667 | 131.333333 | 59.166667 | 109.833333 |
B | 54.000000 | 63.333333 | 68.000000 | 54.000000 | 120.666667 |
C | 86.000000 | 106.000000 | 30.666667 | 73.000000 | 69.000000 |
# transform() 聚合后的结果是不去重的
df.groupby('DB').transform(sum)
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 550.0 | 460 | 788 | 355 | 659 |
1 | 550.0 | 460 | 788 | 355 | 659 |
2 | 550.0 | 460 | 788 | 355 | 659 |
3 | 172.0 | 318 | 92 | 219 | 207 |
4 | 550.0 | 460 | 788 | 355 | 659 |
5 | 162.0 | 190 | 204 | 162 | 362 |
6 | 550.0 | 460 | 788 | 355 | 659 |
7 | 162.0 | 190 | 204 | 162 | 362 |
8 | 172.0 | 318 | 92 | 219 | 207 |
9 | 172.0 | 318 | 92 | 219 | 207 |
10 | 550.0 | 460 | 788 | 355 | 659 |
11 | 162.0 | 190 | 204 | 162 | 362 |
# apply() 聚合后的结果是去重的
df.groupby('DB').apply(sum)
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
DB | ||||||
A | 550.0 | 460 | 788 | 355 | 659 | AAAAAA |
B | 162.0 | 190 | 204 | 162 | 362 | BBB |
C | 172.0 | 318 | 92 | 219 | 207 | CCC |
# 练习:根据DB分组,并计算出所有学科的总成绩
def sum_data(item):
    """Return the column-wise sums of one group.

    The type of ``item`` is printed first so the reader can see what
    ``groupby(...).apply`` hands to the callback.

    Args:
        item: The object passed in by ``apply`` for a single group.

    Returns:
        The result of ``item.sum()``.
    """
    print(type(item))
    return item.sum()
df.groupby('DB').apply(sum_data)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
DB | ||||||
A | 550.0 | 460 | 788 | 355 | 659 | AAAAAA |
B | 162.0 | 190 | 204 | 162 | 362 | BBB |
C | 172.0 | 318 | 92 | 219 | 207 | CCC |
def sum_data(item):
    """Show one group, then return its column-wise sums.

    Args:
        item: The object passed in by ``groupby(...).apply`` for one group.

    Returns:
        The result of ``item.sum()``.
    """
    # ``display`` is only injected as a builtin inside IPython/Jupyter; import
    # it explicitly and fall back to ``print`` so the function also runs in a
    # plain interpreter.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print
    show(item)
    return item.sum()
df.groupby('DB').apply(sum_data)
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 | A |
1 | 53.0 | 12 | 137 | 45 | 14 | A |
2 | 61.0 | 59 | 132 | 74 | 103 | A |
4 | 50.0 | 95 | 135 | 78 | 140 | A |
6 | 125.0 | 133 | 138 | 70 | 135 | A |
10 | 125.0 | 133 | 138 | 70 | 135 | A |
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
5 | 70.0 | 112 | 62 | 62 | 128 | B |
7 | 46.0 | 39 | 71 | 50 | 117 | B |
11 | 46.0 | 39 | 71 | 50 | 117 | B |
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
3 | NaN | 106 | 4 | 74 | 61 | C |
8 | 50.0 | 119 | 59 | 89 | 130 | C |
9 | 122.0 | 93 | 29 | 56 | 16 | C |
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
DB | ||||||
A | 550.0 | 460 | 788 | 355 | 659 | AAAAAA |
B | 162.0 | 190 | 204 | 162 | 362 | BBB |
C | 172.0 | 318 | 92 | 219 | 207 | CCC |
df.sum()
MySQL 884
PostgreSQL 968
Oracle 1084
MongoDB 736
SQLite 1228
DB AAACABABCCAB
dtype: object
df.iloc[:,:-1]
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | |
---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 |
1 | 53.0 | 12 | 137 | 45 | 14 |
2 | 61.0 | 59 | 132 | 74 | 103 |
3 | NaN | 106 | 4 | 74 | 61 |
4 | 50.0 | 95 | 135 | 78 | 140 |
5 | 70.0 | 112 | 62 | 62 | 128 |
6 | 125.0 | 133 | 138 | 70 | 135 |
7 | 46.0 | 39 | 71 | 50 | 117 |
8 | 50.0 | 119 | 59 | 89 | 130 |
9 | 122.0 | 93 | 29 | 56 | 16 |
10 | 125.0 | 133 | 138 | 70 | 135 |
11 | 46.0 | 39 | 71 | 50 | 117 |
df.iloc[:,:-1].sum()
MySQL 884.0
PostgreSQL 968.0
Oracle 1084.0
MongoDB 736.0
SQLite 1228.0
dtype: float64
df.iloc[:,:-1].values
array([[136., 28., 108., 18., 132.],
[ 53., 12., 137., 45., 14.],
[ 61., 59., 132., 74., 103.],
[ nan, 106., 4., 74., 61.],
[ 50., 95., 135., 78., 140.],
[ 70., 112., 62., 62., 128.],
[125., 133., 138., 70., 135.],
[ 46., 39., 71., 50., 117.],
[ 50., 119., 59., 89., 130.],
[122., 93., 29., 56., 16.],
[125., 133., 138., 70., 135.],
[ 46., 39., 71., 50., 117.]])
df.iloc[:,:-1].values.sum()
nan
df.iloc[:,:-1].fillna(0).values.sum()
4900.0
def sum_data(item):
    """Return the grand total of all numeric scores in one group.

    NaN cells count as 0. Numeric columns are selected by dtype rather than
    by dropping the last column positionally, so the total stays correct
    whether or not the (string) grouping column is part of ``item``.

    Args:
        item: DataFrame holding the rows of a single group.

    Returns:
        The sum over every numeric cell of ``item``.
    """
    scores = item.select_dtypes(include="number")
    return scores.fillna(0).to_numpy().sum()
df.groupby('DB').apply(sum_data)
DB
A 2812.0
B 1080.0
C 1008.0
dtype: float64
def sum_data(item):
    """Return the grand total of one group's numeric scores as a Series.

    Wrapping the scalar in a Series makes ``groupby(...).apply`` produce a
    DataFrame with a single labelled column. NaN cells count as 0. Numeric
    columns are selected by dtype rather than by dropping the last column
    positionally, so the total stays correct whether or not the (string)
    grouping column is part of ``item``.

    Args:
        item: DataFrame holding the rows of a single group.

    Returns:
        A one-element Series keyed by '总成绩'.
    """
    scores = item.select_dtypes(include="number")
    grand_total = scores.fillna(0).to_numpy().sum()
    return Series({'总成绩': grand_total})
df.groupby('DB').apply(sum_data)
总成绩 | |
---|---|
DB | |
A | 2812.0 |
B | 1080.0 |
C | 1008.0 |
df
MySQL | PostgreSQL | Oracle | MongoDB | SQLite | DB | |
---|---|---|---|---|---|---|
0 | 136.0 | 28 | 108 | 18 | 132 | A |
1 | 53.0 | 12 | 137 | 45 | 14 | A |
2 | 61.0 | 59 | 132 | 74 | 103 | A |
3 | NaN | 106 | 4 | 74 | 61 | C |
4 | 50.0 | 95 | 135 | 78 | 140 | A |
5 | 70.0 | 112 | 62 | 62 | 128 | B |
6 | 125.0 | 133 | 138 | 70 | 135 | A |
7 | 46.0 | 39 | 71 | 50 | 117 | B |
8 | 50.0 | 119 | 59 | 89 | 130 | C |
9 | 122.0 | 93 | 29 | 56 | 16 | C |
10 | 125.0 | 133 | 138 | 70 | 135 | A |
11 | 46.0 | 39 | 71 | 50 | 117 | B |
# 计算MySQL总成绩,PostgreSQL总成绩,Oracle+MongoDB+SQLite的总成绩
def sum_data(item):
    """Return three totals for one group.

    Args:
        item: DataFrame holding the rows of a single group; it must contain
            the columns MySQL, PostgreSQL, Oracle, MongoDB and SQLite.

    Returns:
        A Series with the entries 'DBS' (Oracle + MongoDB + SQLite combined,
        NaN counted as 0), 'MySQL' and 'PostgreSQL' (each column's own sum).
    """
    mysql_total = item['MySQL'].sum()
    postgresql_total = item['PostgreSQL'].sum()
    other_scores = item[['Oracle', 'MongoDB', 'SQLite']].fillna(0)
    dbs_total = other_scores.to_numpy().sum()
    return Series({
        'DBS': dbs_total,
        'MySQL': mysql_total,
        'PostgreSQL': postgresql_total,
    })
df.groupby('DB').apply(sum_data)
DBS | MySQL | PostgreSQL | |
---|---|---|---|
DB | |||
A | 1802.0 | 550.0 | 460.0 |
B | 728.0 | 162.0 | 190.0 |
C | 518.0 | 172.0 | 318.0 |