# 1. Missing-value detection helper
def missing_values_table(df):
    """Summarise the missing values of every column of *df*.

    Prints a one-paragraph summary (total column count and number of columns
    with missing values) and returns the per-column table.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame indexed by column name, restricted to columns that contain
        at least one missing value, with the columns ``'Missing Values'``
        (count) and ``'% of Total Values'`` (percentage of rows, rounded to
        one decimal), sorted by percentage in descending order.
    """
    # Number of missing entries per column (computed once, reused below).
    mis_val = df.isnull().sum()
    # Share of missing entries per column, in percent of the row count.
    mis_val_percent = 100 * mis_val / len(df)
    # Combine both measures into a single table keyed by column name.
    mis_val_table_ren_columns = pd.DataFrame(
        {'Missing Values': mis_val, '% of Total Values': mis_val_percent})
    # Keep only columns that actually have missing values.  Filtering on the
    # count (not on the percentage) keeps a zero-row frame, whose percentage
    # is NaN, from reporting every column as incomplete.
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns['Missing Values'] > 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print the summary.
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    # Hand the table back so callers can inspect it (e.g. ``.head(20)``).
    return mis_val_table_ren_columns
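# 2. Bar charts of the record count and the default rate per category, wrapped
#    in a function for later reuse (mainly for analysing non-numeric fields).
## df_data: the raw data
## target: name of the target column
## feature: name of the column to analyse
## label_rotation: whether to rotate the x-axis tick labels
## horizontal_layout: place the two charts side by side (True) or stacked (False)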
def plot_stats(df_data, target, feature,label_rotation=False,horizontal_layout=True):
    """Draw two bar charts describing one categorical column.

    The first chart shows how many rows fall into each value of *feature*;
    the second shows the mean of *target* for each value, ordered from the
    highest mean to the lowest.  The figure is displayed with ``plt.show()``.

    Relies on ``pd``, ``plt`` and ``sns`` being available as module globals.

    Args:
        df_data: DataFrame holding both *feature* and *target*.
        target: name of the target column whose mean is plotted.
        feature: name of the column whose values form the bars.
        label_rotation: when true, rotate the x tick labels by 45 degrees.
        horizontal_layout: when true, place the charts side by side
            (12x6 figure); otherwise stack them vertically (12x14 figure).
    """
    # Data for chart 1: row count per category value.
    counts = df_data[feature].value_counts()
    count_frame = pd.DataFrame({feature: counts.index,
                                'Number of contracts': counts.values})

    # Data for chart 2: mean of the target per category, highest first.
    # NOTE: this is the raw mean, it is not multiplied by 100.
    target_mean = df_data[[feature, target]].groupby([feature], as_index=False).mean()
    target_mean = target_mean.sort_values(by=target, ascending=False)

    if horizontal_layout:
        grid = {'ncols': 2, 'figsize': (12, 6)}
    else:
        grid = {'nrows': 2, 'figsize': (12, 14)}
    fig, (ax1, ax2) = plt.subplots(**grid)
    sns.set_color_codes("pastel")

    count_chart = sns.barplot(ax=ax1, x=feature, y="Number of contracts",
                              data=count_frame)
    if label_rotation:
        count_chart.set_xticklabels(count_chart.get_xticklabels(), rotation=45)

    mean_chart = sns.barplot(ax=ax2, x=feature, y=target,
                             order=target_mean[feature], data=target_mean)
    if label_rotation:
        mean_chart.set_xticklabels(mean_chart.get_xticklabels(), rotation=45)

    # These pyplot-level calls act on pyplot's current axes
    # (presumably the second chart - TODO confirm).
    plt.ylabel('Percent of target with value 1 [%]', fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show()
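# 3. Density plots of numeric fields, split by the value of the target column
#    ('left'), wrapped in a function for later reuse (mainly for analysing
#    numeric fields).
## df_data: the data frame
## var: names of the numeric columns to plot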
def plot_distribution(df_data , var, target='left'):
    """Plot kernel density estimates of numeric columns, split by target.

    For every column named in *var* one subplot is drawn containing two
    density curves: one for the rows where *target* is non-zero and one for
    the rows where it is zero.  The subplots are stacked vertically in a
    single figure, which is displayed with ``plt.show()``.

    Relies on ``plt`` and ``sns`` being available as module globals.

    Args:
        df_data: DataFrame holding the columns in *var* and *target*.
        var: a column name or an iterable of column names to plot.
        target: name of the column used to split the rows.  Defaults to
            ``'left'``, the column this function originally hard-coded.
    """
    # Accept a single column name as well as a collection of names.
    features = [var] if isinstance(var, str) else list(var)
    if not features:
        # Nothing to draw; a zero-row subplot grid would be rejected anyway.
        return

    positives = df_data.loc[df_data[target] != 0]
    negatives = df_data.loc[df_data[target] == 0]

    sns.set_style('whitegrid')
    # One grid whose row count matches the number of features.  The previous
    # code opened an extra empty figure and a fixed 2x2 grid and then drew a
    # len(var)x1 grid on top of it.  squeeze=False keeps `axes` 2-D even for
    # a single feature.
    fig, axes = plt.subplots(len(features), 1, figsize=(12, 12), squeeze=False)
    for ax, feature in zip(axes[:, 0], features):
        # NOTE(review): `bw` is reported as deprecated in newer seaborn
        # releases in favour of `bw_method`; kept so the call still works on
        # the seaborn version this script was written for - confirm.
        sns.kdeplot(positives[feature], bw=0.5, label=target + " = 1", ax=ax)
        sns.kdeplot(negatives[feature], bw=0.5, label=target + " = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Request the legend explicitly so the two labelled curves can be
        # told apart regardless of whether kdeplot adds one itself.
        ax.legend()
    plt.show()
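# 4. Bar chart, variant 1 - discrete variable: meant to show how likely the
#    target is for each value of an explanatory variable.
## data_df: the raw data; X_col: explanatory column name; Y_col: target column name
# Example arguments:
#data_df = df
#X_col = 'salary'
#Y_col= 'left'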
def plot_explore2_char(data_df , X_col , Y_col):
    """Explore a discrete column against a binary target with bar charts.

    Draws three panels of a 2x2 grid in one figure and displays it:

    1. number of rows per value of *X_col*;
    2. number of rows per value of *X_col* among rows where *Y_col* equals 1;
    3. mean of *Y_col* per value of *X_col*, expressed in percent.

    Relies on ``plt`` being available as a module global.

    Args:
        data_df: the raw DataFrame.
        X_col: name of the discrete explanatory column.
        Y_col: name of the target column (assumed to be coded 0/1 -
            TODO confirm against the data).
    """
    plt.figure(figsize=(14,14),dpi=100)

    # Panel 1: overall distribution of the explanatory column.
    plt.subplot(2,2,1)
    data_df[X_col].value_counts().plot(kind='bar')
    plt.xticks(rotation = 75); plt.xlabel(X_col +' name '); plt.ylabel('Amount of employee number')
    plt.title('emp Group')

    # Panel 2: distribution restricted to rows where the target equals 1.
    target_counts = data_df.loc[data_df[Y_col] == 1, X_col].value_counts()
    plt.subplot(2,2,2)
    if not target_counts.empty:
        # Skip the bar plot when no row has target == 1; the panel is then
        # left empty instead of plotting an empty Series.
        target_counts.plot(kind='bar')
    plt.xticks(rotation = 75); plt.xlabel(X_col + ' name'); plt.ylabel(Y_col + ' number')
    plt.title(Y_col + ' Group')

    # Panel 3: target rate per category.  Only the target column is
    # aggregated so non-numeric columns elsewhere in the frame are ignored.
    target_rate = data_df.groupby(X_col)[Y_col].mean()
    plt.subplot(2,2,3)
    plt.bar(target_rate.index.astype(str), 100 * target_rate)
    plt.xticks(rotation = 45); plt.xlabel(X_col); plt.ylabel(Y_col + ' probability(%)')
    plt.title(Y_col + ' probability to ' + X_col)

    # Display the figure, as the other plotting helpers in this file do.
    plt.show()
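# 5. Bar chart, variant 2 - continuous variable: shows how likely the target is
#    within each value range of an explanatory variable.
## data_df: the raw data; X_col: explanatory column name; Y_col: target column name
# Example arguments:
#data_df = df
#X_col = 'average_monthly_hours'
#Y_col= 'left'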
def plot_explore2_num(data_df , X_col , Y_col, bins=10):
    """Plot the target rate within value ranges of a continuous column.

    The values of *X_col* are cut into intervals, the mean of *Y_col* is
    computed per interval, and the result is shown as a bar chart in percent.
    *data_df* itself is not modified.

    Args:
        data_df: the raw DataFrame.
        X_col: name of the continuous explanatory column.
        Y_col: name of the target column whose mean is plotted.
        bins: passed to ``pandas.cut``; either the number of equal-width
            intervals spanning the observed range of *X_col* (default 10)
            or an explicit sequence of bin edges.  The previous
            implementation always used ten intervals over the fixed range
            96..310, which only suits one particular column; pass
            ``numpy.linspace(96, 310, num=11)`` to get exactly those edges.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Bin into a separate Series instead of deep-copying the whole frame
    # just to attach a helper column.
    binned = pd.cut(data_df[X_col], bins=bins)
    # Aggregate only the target column: averaging every column fails when
    # the frame also contains non-numeric columns.  observed=False keeps
    # empty intervals on the axis.
    rate_by_bin = data_df[Y_col].groupby(binned, observed=False).mean()

    # Draw the bar chart.
    sns.set_style('whitegrid')
    plt.figure()
    plt.bar(rate_by_bin.index.astype(str), 100 * rate_by_bin)
    plt.xticks(rotation = 45); plt.xlabel(X_col + '_abandon');
    plt.ylabel(Y_col+'_probability(%)')
    # Separating spaces added: the words previously ran together.
    plt.title(Y_col + ' probability to ' + X_col + ' abandon')
    # Display the figure, as the other plotting helpers in this file do.
    plt.show()
# 6. Script entry point
if __name__ == "__main__":
    # These imports bind pd / np / plt / sns as module globals; plot_stats,
    # plot_distribution, plot_explore2_char and missing_values_table use
    # those names without importing them, so they must stay here.
    import pandas as pd
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    color = sns.color_palette()

    # Raw string: the Windows path contains backslashes that would otherwise
    # be read as (invalid) escape sequences.  The resulting path is unchanged.
    df= pd.read_csv(r'D:\PycharmProjects\lessonOnLine\data\HR2.csv')
    # Printed explicitly: a bare expression shows nothing when run as a script.
    print(df.isnull().sum())
    missing_values = missing_values_table(df)
    #missing_values.head(20)
    # df.columns.tolist()
    # df['EMPID'] = df.index.tolist()
    # bureau_agg = df.groupby('EMPID', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
    # bureau_agg_department = df.groupby('department', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
    #
    # ## Bar chart, variant 1 - categorical variable
    # ## Raw distribution of department
    # plt.figure(figsize=(14,14),dpi=100)
    # plt.subplot(2,2,1)
    # df['department'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('department name '); plt.ylabel('Amount of employee number')
    # plt.title('emp Group')
    # ## Distribution of department where the target equals 1
    # plt.subplot(2,2,2)
    # df[df['left'] == 1]['department'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('department name'); plt.ylabel('left number')
    # plt.title('left Group')
    # ## Probability of leaving per department
    # department_groups = df.groupby('department').mean()
    # plt.subplot(2,2,3)
    # plt.bar(department_groups.index.astype(str), 100 * department_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('department'); plt.ylabel('left probability(%)')
    # plt.title('left probability to salary');
    #
    #
    #
    # ## Raw distribution of salary
    # plt.figure(figsize=(14,14),dpi=100)
    # plt.subplot(2,2,1)
    # df['salary'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('salary degree '); plt.ylabel('Amount of employee number')
    # plt.title('emp Group')
    # ## Distribution of salary where the target equals 1
    # plt.subplot(2,2,2)
    # df[df['left'] == 1]['salary'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('salary degree'); plt.ylabel('left number')
    # plt.title('left Group')
    # ## Probability of leaving per salary level
    # salary_groups = df.groupby('salary').mean()
    # plt.subplot(2,2,3)
    # plt.bar(salary_groups.index.astype(str), 100 * salary_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('salary'); plt.ylabel('left probability(%)')
    # plt.title('left probability to salary');
    #
    # ## Bar chart, variant 2 - continuous variable
    # ## Shows how likely the target is within each value range of a variable
    # df['average_monthly_hours2'] = pd.cut(df['average_monthly_hours'], bins = np.linspace(96, 310, num = 11))
    # age_groups = df.groupby('average_monthly_hours2').mean()
    # #plt.figure(figsize = (8, 8))
    #
    # # Draw the bar chart
    # plt.bar(age_groups.index.astype(str), 100 * age_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('average_monthly_hours abandon'); plt.ylabel('left probability(%)')
    # plt.title('left probability to average_monthly_hours abandon');
    #
    ##
    # Target value counts for the rows whose department is 'sale'; printed so
    # the result is visible when run as a script.
    print(df[df['department'] == 'sale']['left'].value_counts())
    ## Function calls
    #plot_stats(df, 'left','department',label_rotation= True,horizontal_layout=True)
    plot_distribution(df, ['number_project','average_monthly_hours', 'time_spend_company'])
    plot_explore2_char(df ,'salary', 'left')
    plot_explore2_num(df ,'average_monthly_hours' , 'left')