监督算法建模前数据质量检查-CFANZ编程社区

一、定义缺失值检测函数

def missing_values_table(df):
    """Summarise the missing values of every column of ``df``.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns the per-column summary so callers can
    inspect it (e.g. ``missing_values_table(df).head(20)``).

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with two columns,
        'Missing Values' (number of nulls) and '% of Total Values'
        (nulls as a percentage of the row count, rounded to one decimal).
        Only columns with at least one missing value are listed, sorted
        by percentage in descending order.
    """
    # Count the nulls once and reuse the result for the percentage.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Side-by-side table; concat names the two unnamed series 0 and 1.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Filter on the count rather than the percentage: for a frame with no
    # rows the percentage is NaN (0 / 0), and NaN != 0 would wrongly keep
    # every column.
    has_missing = mis_val_table_ren_columns['Missing Values'] != 0
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        has_missing].sort_values(
        '% of Total Values', ascending=False).round(1)

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    return mis_val_table_ren_columns

二、绘制记录数和违约率的柱状图，以函数的形式呈现，方便后面使用（主要用来分析非数值型字段）

 ## df_data原始数据
 ## target 目标字段
 ## feature要分析的字段
 ## label_rotation 坐标标签是否旋转
 ## horizontal_layout 水平还是垂直放置
def plot_stats(df_data, target, feature, label_rotation=False, horizontal_layout=True):
    """Draw two bar charts describing one (typically non-numeric) column.

    The first chart shows how many rows fall into each distinct value of
    ``feature``; the second shows the mean of ``target`` for each value,
    ordered from the highest mean to the lowest.

    Args:
        df_data: source DataFrame.
        target: name of the target column that is averaged per group.
        feature: name of the column being analysed.
        label_rotation: when truthy, x tick labels are rotated by 45 degrees.
        horizontal_layout: when truthy the two charts sit side by side,
            otherwise they are stacked vertically.
    """
    # Number of rows per distinct value of the feature.
    level_counts = df_data[feature].value_counts()
    counts_frame = pd.DataFrame({
        feature: level_counts.index,
        'Number of contracts': level_counts.values,
    })

    # Mean of the target per feature value, highest first.
    target_means = df_data[[feature, target]].groupby(
        [feature], as_index=False).mean()
    target_means.sort_values(by=target, ascending=False, inplace=True)

    if horizontal_layout:
        layout = dict(ncols=2, figsize=(12, 6))
    else:
        layout = dict(nrows=2, figsize=(12, 14))
    fig, (ax1, ax2) = plt.subplots(**layout)
    sns.set_color_codes("pastel")

    count_axes = sns.barplot(
        ax=ax1, x=feature, y="Number of contracts", data=counts_frame)
    mean_axes = sns.barplot(
        ax=ax2, x=feature, y=target,
        order=target_means[feature], data=target_means)

    if label_rotation:
        for axes in (count_axes, mean_axes):
            axes.set_xticklabels(axes.get_xticklabels(), rotation=45)

    # NOTE(review): these go through pyplot's "current axes" - presumably
    # the second chart; confirm. The plotted value is the raw group mean
    # (no x100 scaling is applied in this function).
    plt.ylabel('Percent of target with value 1 [%]', fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.show()

三、绘制数值型字段在目标变量不同取值下的核密度分布图，以函数的形式呈现，方便后面使用（主要用来分析数值型字段）

## df_data 数据框
## var 数值型变量名
def plot_distribution(df_data, var, target='left'):
    """Plot density curves of numeric columns, split by the target value.

    One subplot is drawn per column in ``var`` (stacked vertically in a
    single figure). Each subplot holds two kernel density curves: one for
    the rows where ``target`` is non-zero and one for the rows where it
    is zero.

    Args:
        df_data: source DataFrame.
        var: column name, or iterable of column names, to plot.
        target: name of the target column used to split the rows.
            Defaults to 'left', the column this function used to
            hard-code.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(var, str):
        var = [var]
    features = list(var)
    if not features:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    target_on = df_data.loc[df_data[target] != 0]
    target_off = df_data.loc[df_data[target] == 0]

    sns.set_style('whitegrid')
    # A single len(features) x 1 grid. The earlier version opened an empty
    # figure, created a 2x2 grid and then drew len(var) x 1 subplots on top
    # of it. squeeze=False keeps `axes` two-dimensional even for one
    # feature.
    fig, axes = plt.subplots(
        len(features), 1, figsize=(12, 12), squeeze=False)

    for ax, feature in zip(axes[:, 0], features):
        # NOTE(review): newer seaborn releases are said to deprecate `bw`
        # in favour of `bw_method` / `bw_adjust` - confirm against the
        # installed version before changing it.
        sns.kdeplot(target_on[feature], bw=0.5, label=target + " = 1", ax=ax)
        sns.kdeplot(target_off[feature], bw=0.5, label=target + " = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Request the legend explicitly so the curve labels are shown
        # whether or not kdeplot adds one itself.
        ax.legend()
    plt.show()

四、条形图画法1-离散型变量：反映在某个自变量的取值范围下，目标变量发生的概率

## data_df:原始数据； X_col：自变量列名 ； Y_col：目标变量列名
#data_df  = df
#X_col = 'salary' 
#Y_col= 'left' 
def plot_explore2_char(data_df, X_col, Y_col):
    """Draw a bar chart of how often each value of ``X_col`` occurs.

    The chart is placed in the first cell of a 2x2 subplot grid on a new
    14x14 inch figure. The function does not call ``plt.show()``.

    Args:
        data_df: source DataFrame.
        X_col: name of the (discrete) column whose values are counted.
        Y_col: name of the target column. It is accepted but not used by
            the code in this function.
    """
    plt.figure(figsize=(14, 14), dpi=100)
    plt.subplot(2, 2, 1)

    # Row count per distinct value, drawn on the subplot selected above.
    value_frequencies = data_df[X_col].value_counts()
    value_frequencies.plot(kind='bar')

    plt.xticks(rotation=75)
    plt.xlabel(X_col + ' name ')
    plt.ylabel('Amount of  employee number')
    plt.title('emp Group')

五、条形图画法2-连续型变量：反映在某个自变量的取值范围下，目标变量发生的概率

##条形图画法2-连续型变量
## 旨在反映在某个自变量的取值范围下，目标变量发生的概率    
## data_df:原始数据； X_col：自变量列名 ； Y_col：目标变量列名
#data_df  = df
#X_col = 'average_monthly_hours' 
#Y_col= 'left'   
    
def plot_explore2_num(data_df, X_col, Y_col, bins=None):
    """Bar chart of the mean of ``Y_col`` within value ranges of ``X_col``.

    ``X_col`` is cut into intervals, the mean of ``Y_col`` is computed for
    each interval, and the means (multiplied by 100) are drawn as bars on
    a new figure. ``data_df`` is not modified. The function does not call
    ``plt.show()``.

    Args:
        data_df: source DataFrame.
        X_col: name of the continuous column to bin.
        Y_col: name of the target column averaged per bin.
        bins: optional bin specification passed to ``pandas.cut``. When
            omitted, 11 evenly spaced edges (10 bins) spanning the
            observed minimum and maximum of ``X_col`` are used. The
            earlier version hard-coded the range 96..310, which only
            suited one particular column.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    values = data_df[X_col]
    if bins is None:
        bins = np.linspace(values.min(), values.max(), num=11)

    # include_lowest keeps the rows sitting exactly on the first edge;
    # with edges derived from the minimum they would otherwise drop out.
    intervals = pd.cut(values, bins=bins, include_lowest=True)

    # Average only the target column. Averaging the whole frame fails on
    # text columns in recent pandas, and the other columns were never
    # used. observed=False keeps empty bins on the axis.
    rate_per_bin = data_df[Y_col].groupby(intervals, observed=False).mean()

    sns.set_style('whitegrid')
    plt.figure()
    plt.bar(rate_per_bin.index.astype(str), 100 * rate_per_bin)
    plt.xticks(rotation=45)
    plt.xlabel(X_col + '_abandon')
    plt.ylabel(Y_col + '_probability(%)')
    # Spaces added so the column names no longer run into the words.
    plt.title(Y_col + ' probability to ' + X_col + ' abandon')

六、主函数调用

if __name__ == "__main__":
    # These imports bind the module-level names (pd, plt, sns) that the
    # plotting helpers defined above look up when they are called.
    import pandas as pd
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    color = sns.color_palette()

    # Raw string: the Windows path contains backslash sequences (\P, \l,
    # \d, \H) that are invalid escapes in a normal string literal.
    df = pd.read_csv(r'D:\PycharmProjects\lessonOnLine\data\HR2.csv')
    # Result is discarded when run as a script; it is only visible when
    # evaluated in an interactive session.
    df.isnull().sum()

    missing_values = missing_values_table(df)
    # missing_values.head(20)
    # df.columns.tolist()
    # df['EMPID'] = df.index.tolist()
    # bureau_agg = df.groupby('EMPID', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
    # bureau_agg_department = df.groupby('department', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

    # ## Bar chart method 1 - categorical variables
    # ## Raw distribution of department
    # plt.figure(figsize=(14,14),dpi=100)
    # plt.subplot(2,2,1)
    # df['department'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('department name '); plt.ylabel('Amount of  employee number')
    # plt.title('emp Group')
    # ## Distribution of department when the target equals 1
    # plt.subplot(2,2,2)
    # df[df['left'] == 1]['department'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('department name'); plt.ylabel('left number')
    # plt.title('left Group')
    # ## Probability of leaving for each department
    # department_groups = df.groupby('department').mean()
    # plt.subplot(2,2,3)
    # plt.bar(department_groups.index.astype(str), 100 * department_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('department'); plt.ylabel('left probability(%)')
    # plt.title('left probability to salary');
    #
    # ## Raw distribution of salary
    # plt.figure(figsize=(14,14),dpi=100)
    # plt.subplot(2,2,1)
    # df['salary'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('salary degree '); plt.ylabel('Amount of  employee number')
    # plt.title('emp Group')
    # ## Distribution of salary when the target equals 1
    # plt.subplot(2,2,2)
    # df[df['left'] == 1]['salary'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('salary degree'); plt.ylabel('left number')
    # plt.title('left Group')
    # ## Probability of leaving for each salary level
    # salary_groups  = df.groupby('salary').mean()
    # plt.subplot(2,2,3)
    # plt.bar(salary_groups.index.astype(str), 100 * salary_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('salary'); plt.ylabel('left probability(%)')
    # plt.title('left probability to salary');
    #
    # ## Bar chart method 2 - continuous variables
    # ## Shows the probability of the target within each value range of
    # ## an independent variable
    # df['average_monthly_hours2'] = pd.cut(df['average_monthly_hours'], bins = np.linspace(96, 310, num = 11))
    # age_groups  = df.groupby('average_monthly_hours2').mean()
    # #plt.figure(figsize = (8, 8))
    #
    # # Draw the bar chart
    # plt.bar(age_groups.index.astype(str), 100 * age_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('average_monthly_hours abandon'); plt.ylabel('left probability(%)')
    # plt.title('left probability to average_monthly_hours abandon');

    # Result is discarded when run as a script.
    df[df['department'] == 'sale']['left'].value_counts()

    # Calls to the plotting helpers.
    # plot_stats(df, 'left','department',label_rotation= True,horizontal_layout=True)
    plot_distribution(df, ['number_project', 'average_monthly_hours', 'time_spend_company'])

    plot_explore2_char(df, 'salary', 'left')
    plot_explore2_num(df, 'average_monthly_hours', 'left')
    # Neither plot_explore2_char nor plot_explore2_num calls plt.show(),
    # so show their figures here; otherwise a plain script run exits
    # without displaying them.
    plt.show()