# Pandas lesson 1 notes (basic data processing) - CFANZ programming community

import pandas as pd

# Load the sample "team" workbook into a DataFrame; the path is relative to
# the current working directory.
df = pd.read_excel(io='data/team.xlsx')
# Alternative source: the same dataset fetched over HTTP.
# df = pd.read_excel('https://www.gairuo.com/file/data/dataset/team.xlsx')
# print(df)
#
# print(df.head())  # first five rows
# print(df.tail())  # last five rows
# print(df.sample(5))  # five randomly chosen rows
# print(df.shape)
# print(df.info())  # full summary; the trailing "None" below is info()'s return value being printed
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    100 non-null    object
 1   team    100 non-null    object
 2   Q1      100 non-null    int64 
 3   Q2      100 non-null    int64 
 4   Q3      100 non-null    int64 
 5   Q4      100 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 4.8+ KB
None
'''
# print(df.describe())  # summary statistics of the numeric columns
'''
               Q1          Q2          Q3          Q4
count  100.000000  100.000000  100.000000  100.000000
mean    49.200000   52.550000   52.670000   52.780000
std     29.962603   29.845181   26.543677   27.818524
min      1.000000    1.000000    1.000000    2.000000
25%     19.500000   26.750000   29.500000   29.500000
50%     51.500000   49.500000   55.000000   53.000000
75%     74.250000   77.750000   76.250000   75.250000
max     98.000000   99.000000   99.000000   99.000000
'''
# print(df.dtypes)
'''
name    object
team    object
Q1       int64
Q2       int64
Q3       int64
Q4       int64
dtype: object
'''
# print(df.axes)  # row labels and column labels
'''[RangeIndex(start=0, stop=100, step=1), Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')]'''
# print(df.columns)  # column labels only
'''Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')'''


# Promote the "name" column to the row index (rebinding df instead of
# mutating it in place).
df = df.set_index('name')
# print(df)  # the automatically generated 0-99 index is gone now


## Selecting columns
# print(df['Q1'])  # one column, returned as a Series (index is shown too)
# print(df.Q1)  # same as above
# print(df[['team', 'Q1']])  # several columns at once (index is shown too)
# print(df.loc[:, ['team', 'Q1']])  # same as above; use df.iloc when addressing by position



## Selecting rows
# print(df[df.index=='Liver'])
# print(df[0:3])
# print(df[0:10:2])
# print(df.iloc[:10, :])


## Selecting rows and columns together
# print(df.loc['Ben', 'Q1':'Q4'])
'''
Q1    21
Q2    43
Q3    41
Q4    74
Name: Ben, dtype: object
'''
# print(df.loc['Eorge':'Harlie', 'team':'Q3'])
'''
       team  Q1  Q2  Q3
name                   
Eorge     C  93  96  71
Oah       D  65  49  61
Harlie    C  24  13  87
'''


## Filtering rows by a condition
# print(df[df.Q1 > 90])
# print(df[df.team == 'C'])
# print(df[df.index == 'Oscar'])
## Combined conditions / chained filters
# print(df[(df['Q1'] > 90) & (df['team']=='C')])
# print(df[df['team']=='C'].loc[df.Q1>90])


## Sorting
# print(df.sort_values(by='Q1'))
# print(df.sort_values(by='Q1', ascending=False))  # descending
# print(df.sort_values(['team', 'Q1'], ascending=[True, False]))


## Grouping and aggregating
# print(df.groupby('team').sum())
# print(df.groupby('team').mean())
# print(df.groupby('team').agg({'Q1':'sum', 'Q2':'count', 'Q3':'mean', 'Q4':'max'}))
# NOTE(review): recent pandas releases warn when the builtins sum/max are
# passed to agg; the string names used on the line above are the safer form.
# print(df.groupby('team').agg({'Q1':sum, 'Q2':'count', 'Q3':'mean', 'Q4':max}))  # same as above
'''
        Q1  Q2         Q3  Q4
team                         
A     1066  17  51.470588  97
B      975  22  54.636364  99
C     1056  22  48.545455  98
D      860  19  65.315789  99
E      963  20  44.050000  98
'''


## Transposing / reshaping
# print(df.groupby('team').sum().T)
# print(df.groupby('team').sum().stack())  # outer level: team, inner level: the remaining columns; one sum per pair
# print(df.groupby('team').sum().unstack())  # outer level: the non-team columns, inner level: team; one sum per pair



## Adding columns
# df['one'] = 1
# print(df)
# df['total'] = df.Q1 + df.Q2 + df.Q3 + df.Q4
# print(df)
# df['total'] = df.loc[:, 'Q1':'Q4'].apply(lambda x: sum(x), axis=1)
# print(df)
# NOTE(review): this sums every column of the row, so any extra numeric
# columns added above are included; on pandas >= 2.0 the string "team"
# column is expected to make it raise unless numeric_only=True is passed.
# df['total'] = df.sum(axis=1)
# print(df)



# NOTE(review): the examples below were written against pandas < 2.0, which
# silently skipped the non-numeric "team" column. On pandas >= 2.0 pass
# numeric_only=True to reproduce the output shown.
# print(df.mean())  # mean of every column
# print(df.mean(1))  # mean of every row
# print(df.corr())
'''
          Q1        Q2        Q3        Q4
Q1  1.000000  0.142088 -0.125436 -0.010781
Q2  0.142088  1.000000  0.056270 -0.033213
Q3 -0.125436  0.056270  1.000000  0.041418
Q4 -0.010781 -0.033213  0.041418  1.000000
'''
# print(df.count())  # number of non-null values per column
# df.max()
# df.min()
# df.median()  # median
# df.std()
# df.var()
# df.mode()  # mode (most frequent value)

import matplotlib.pyplot as plt

# Plotting examples (left disabled): each plot call is followed by
# plt.show() to display the figure.
# df['Q1'].plot()
# plt.show()
# df.loc['Ben', 'Q1':'Q4'].plot()
# plt.show()
# df.loc['Ben', 'Q1':'Q4'].plot.bar()  # bar chart
# plt.show()
# df.loc['Ben', 'Q1':'Q4'].plot.barh()  # horizontal bar chart
# plt.show()

## Multi-line chart
# print(df.groupby('team').sum().T)
# df.groupby('team').sum().T.plot()
# plt.show()

## Pie chart
# print(df.groupby('team').count())
# df.groupby('team').count().Q1.plot.pie()
# plt.show()


# Export the processed frame to an Excel workbook and to a CSV file.
df.to_excel(excel_writer='data/team-done.xlsx')
df.to_csv(path_or_buf='data/team-done.csv')