A Classic Kaggle AI Project, Part 2: Data Cleaning


Contents

  • 1 Fixing data types
  • 2 Duplicate samples: drop duplicate rows
  • 3 Missing-value handling
  • 3.1 Rows and columns: drop anything with more than 40% missing
  • 3.2 Uniform fill for columns with less than 1% missing
  • 3.3 Business-logic fill for highly correlated continuous variables
  • 3.4 Business-logic fill for highly discriminative categorical variables
  • 3.5 Uniform fill for the remaining variables
  • 4 Outlier handling for continuous variables
  • 4.1 Handling the y variable: the key variable
  • 4.2 Uniform treatment of the remaining variables


Development environment: Jupyter Notebook

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings("ignore")  # suppress warnings

train = pd.read_csv("train.csv")
evaluate = pd.read_csv("test.csv")
evaluate.head()  # preview the first rows
evaluate.info()  # column dtypes and non-null counts

1 Fixing data types

clear_dtypes = train.drop('SalePrice', axis=1).dtypes    # dtypes of the feature columns (target excluded)
train['GarageQual'].value_counts()
train.dtypes['GarageQual']

# Map ordinal string variables to numeric codes
train = train.replace({
    "BsmtCond":     {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "BsmtExposure": {"No": 0, "Mn": 1, "Av": 2, "Gd": 3},
    "BsmtFinType1": {"No": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4,
                     "ALQ": 5, "GLQ": 6},
    "BsmtFinType2": {"No": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4,
                     "ALQ": 5, "GLQ": 6},
    "BsmtQual":     {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "ExterCond":    {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "ExterQual":    {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "FireplaceQu":  {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "Functional":   {"Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4, "Mod": 5,
                     "Min2": 6, "Min1": 7, "Typ": 8},
    "GarageCond":   {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "GarageQual":   {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "HeatingQC":    {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "KitchenQual":  {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "LandSlope":    {"Sev": 1, "Mod": 2, "Gtl": 3},
    "LotShape":     {"IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4},
    "PavedDrive":   {"N": 0, "P": 1, "Y": 2},
    "PoolQC":       {"No": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
    "Street":       {"Grvl": 1, "Pave": 2},
    "Utilities":    {"ELO": 1, "NoSeWa": 2, "NoSewr": 3, "AllPub": 4}
})

# Check dtypes and convert numeric codes that are really categorical
train['MSSubClass'].dtypes   # MSSubClass: type of dwelling sold -- categorical
train['YrSold'].dtypes       # year the house was sold
train['MoSold'].dtypes       # month the house was sold

train['MSSubClass'] = train['MSSubClass'].astype(str)   # cast to string so these are treated as categorical
train['YearBuilt'] = train['YearBuilt'].astype(str)
train['YrSold'] = train['YrSold'].astype(str)
train['MoSold'] = train['MoSold'].astype(str)
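To confirm the conversions took effect, a quick check can be run. This is only a minimal sketch; the ordinal_cols list below is a hand-picked subset of the remapped columns, chosen here for illustration.

# Sanity check (sketch): remapped ordinal columns should now be numeric,
# and the re-cast code/date columns should now be object (string) dtype.
ordinal_cols = ['BsmtQual', 'ExterQual', 'KitchenQual', 'GarageQual']   # a few of the remapped columns
print(train[ordinal_cols].dtypes)                                       # expected: int64 / float64
print(train[['MSSubClass', 'YearBuilt', 'YrSold', 'MoSold']].dtypes)    # expected: object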

2 Duplicate samples: drop duplicate rows

train.duplicated().sum()                                # count duplicate rows
train.drop_duplicates(inplace=True)                     # drop duplicate rows
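One caveat: because every row carries a unique Id, duplicated() over all columns will always report 0. A minimal sketch of a check that ignores the Id column (assuming the column is named 'Id', as it is later in this post):

# Count duplicates while ignoring the unique Id column; with Id included,
# no row can ever be flagged as a duplicate.
dup_mask = train.duplicated(subset=train.columns.drop('Id'))
print(dup_mask.sum())                                                   # duplicate rows, ignoring Id
train.drop_duplicates(subset=train.columns.drop('Id'), inplace=True)    # keep the first occurrence of each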

3 Missing-value handling

3.1 Rows and columns: drop anything with more than 40% missing

# Row-wise missing values: drop rows where more than 40% of the columns are missing

(train.isnull().sum(axis=1) / train.shape[1] > 0.4).sum()  # number of rows with >40% missing; train.shape[1] is the total column count

# indices of the rows where more than 40% of the values are missing
nans_del_indexs = train.index[train.isnull().sum(axis=1) / train.shape[1] > 0.4]
train.drop(labels=nans_del_indexs, axis=0, inplace=True)
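An equivalent way to express the same rule, shown here only as a sketch, is pandas' built-in dropna with a thresh argument (the minimum number of non-missing values a row must have to be kept):

# Keep rows with at least 60% non-missing values, i.e. drop rows with more than
# 40% missing; equivalent to the index-based drop above.
min_non_missing = int(np.ceil(0.6 * train.shape[1]))
train = train.dropna(axis=0, thresh=min_non_missing)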

# Column-wise missing values

def anova(train_cate_y, categorical, y):   # measure how well each categorical variable separates the target
    anv = pd.DataFrame(index=categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        print(c)
        samples = []
        for cls in train_cate_y[c].dropna().unique():            # distinct non-NaN levels of the column
            s = train_cate_y[train_cate_y[c] == cls][y].values   # SalePrice values for that level
            samples.append(s)                                    # one array of prices per level
        pval = stats.f_oneway(*samples)[1]   # one-way ANOVA: the smaller p is, the more the levels differ in mean price
        pvals.append(pval)
    anv['pval'] = pvals
    return anv.sort_values('pval')

categorical = [column for column in train.columns
               if train.dtypes[column] == 'object']   # the categorical (object-typed) columns
y = 'SalePrice'
corr_cate = anova(train, categorical, y)
corr_cate['disparity'] = np.log(20 * 1. / corr_cate['pval'].values) / np.log(20)   # discrimination score: log base 20 of 20/p

# Percentage of missing values in each column:
NAs = pd.concat([100 * train.isnull().sum(axis=0) / train.shape[0],
                 100 * evaluate.isnull().sum() / evaluate.shape[0]],
                axis=1, keys=['Train', 'evaluate'])
NAs['type'] = train.dtypes[NAs.index]

NAs['corr_conti_y'] = train.corr()['SalePrice']   # correlation with SalePrice for continuous variables
NAs['corr_cate_y'] = corr_cate['disparity']       # discrimination score for categorical variables

# keep only the columns that actually have missing values
NAs = NAs[NAs[['Train', 'evaluate']].sum(axis=1) > 0].sort_values(by='Train', ascending=False)
NAs.head()

                 Train   evaluate     type  corr_conti_y  corr_cate_y
PoolQC       99.520548  99.794380  float64      0.447936          NaN
MiscFeature  96.301370  96.504455   object           NaN     1.753202
Alley        93.767123  92.666210   object           NaN     6.618486
Fence        80.753425  80.123372   object           NaN     3.026003
FireplaceQu  47.260274  50.034270  float64      0.295794          NaN

# Drop the columns with more than 40% missing values

clear_nans_del_cols = NAs.index[((NAs[['Train', 'evaluate']]>40).sum(axis=1)>0).values]
clear_nans_del_cols

train.drop(labels=clear_nans_del_cols, axis=1, inplace=True)

3.2 Uniform fill for columns with less than 1% missing
  • continuous columns: median
  • categorical columns: mode

# Identify the numeric and the categorical columns among them

clear_nans_less1_cols = NAs.index[((NAs[['Train', 'evaluate']] < 1).sum(axis=1) == 2).values].values
clear_nans_less1_cols_num = train[clear_nans_less1_cols].select_dtypes(include=[np.number]).columns
clear_nans_less1_cols_class = train[clear_nans_less1_cols].select_dtypes(include=['object']).columns

# Pre-compute the medians and modes
clear_nans_less1_cols_num_median = train[clear_nans_less1_cols_num].median()
# .mode() returns a DataFrame (ties give several rows); keep the first row so
# fillna receives exactly one value per column
clear_nans_less1_cols_class_mode = train[clear_nans_less1_cols_class].mode().iloc[0]

# continuous columns: fill with the median
# categorical columns: fill with the mode
train[clear_nans_less1_cols_num] = train[clear_nans_less1_cols_num]\
    .fillna(clear_nans_less1_cols_num_median)
train[clear_nans_less1_cols_class] = train[clear_nans_less1_cols_class]\
    .fillna(clear_nans_less1_cols_class_mode)
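In practice the same fill is usually applied to the evaluate set as well, using the medians and modes computed on train so that no information from the test data leaks in. A minimal sketch, assuming the dtype fixes and ordinal mappings from section 1 have also been applied to evaluate:

# Reuse the train-derived statistics on the evaluate set; do not recompute them
# on the test data.
evaluate[clear_nans_less1_cols_num] = evaluate[clear_nans_less1_cols_num]\
    .fillna(clear_nans_less1_cols_num_median)
evaluate[clear_nans_less1_cols_class] = evaluate[clear_nans_less1_cols_class]\
    .fillna(clear_nans_less1_cols_class_mode)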

3.3 Business-logic fill for highly correlated continuous variables

""" 
GarageYrBlt 车库建成日期 GarageYrBlt:0.055479,0.053425 float64 0.486362
车库的五个变量GarageType、GarageYrBlt、GarageFinish、GarageQual、GarageCond是由于房子没有车库而缺失。
GarageYrBlt 为车库的年份,我们用房子的建造年份来替代

MasVnrArea 外墙装饰材料的面积,Na即面积为0 MasVnrArea,0.005479,0.010274,float64,0.477493
LotFrontage 房子与街道的距离,Na即距离为0 LotFrontage,0.177397,0.155479,float64,0.351799
"""
train.loc[train['GarageYrBlt'].isnull(), 'GarageYrBlt'] = \
    train.loc[train['GarageYrBlt'].isnull(), 'YearBuilt']

train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train['LotFrontage'] = train['LotFrontage'].fillna(0)

3.4 Business-logic fill for highly discriminative categorical variables

"""
BsmtQual 地下室质量 BsmtQual,0.025342,0.030137,object,NaN,143.208288
BsmtExposure、BsmtFinType2、BsmtQual、BsmtCond、BsmtFinType1地下室变量,房子没有地下室而缺失。
KitchenQual: String, //厨房质量, 这里用众数替换
"""

clear_KitchenQual_mode = train['KitchenQual'].mode()[0]  # mode() returns a Series; [0] extracts the most frequent value

train['BsmtQual'].fillna('None', inplace=True)
train['KitchenQual'].fillna(clear_KitchenQual_mode, inplace=True)  # fill with the mode

3.5 Uniform fill for the remaining variables
  • continuous columns: 0
  • categorical columns: 'None'

# Inspect the variables that still have missing values
remain_col = train.columns[train.isnull().sum(axis=0)>0]
NAs.loc[remain_col, ]

                  Train  evaluate     type  corr_conti_y  corr_cate_y
MasVnrType     0.547945  1.096642   object           NaN    50.437263
BsmtCond       2.534247  3.084304  float64      0.160658          NaN
BsmtExposure   2.602740  3.015764  float64      0.352958          NaN
BsmtFinType1   2.534247  2.878684  float64      0.277436          NaN
BsmtFinType2   2.602740  2.878684  float64     -0.040139          NaN
Electrical     0.068493  0.000000   object           NaN    14.665360
GarageType     5.547945  5.209047   object           NaN    51.655312
GarageFinish   5.547945  5.346127   object           NaN    72.421212
GarageQual     5.547945  5.346127  float64      0.156693          NaN
GarageCond     5.547945  5.346127  float64      0.125013          NaN

# Split the remaining columns into numeric and categorical, then fill with 0 / 'None'
clear_nans_remain_cols_num = train[remain_col].select_dtypes(include=[np.number]).columns
clear_nans_remain_cols_class = train[remain_col].select_dtypes(include=['object']).columns

train[clear_nans_remain_cols_num] = train[clear_nans_remain_cols_num].fillna(0)
train[clear_nans_remain_cols_class] = train[clear_nans_remain_cols_class].fillna('None')

# Check that no missing values are left (should print 0)
train.isnull().sum(axis=0).sum()

4 Outlier handling for continuous variables

train.SalePrice.plot(kind='box', sym='b*')               # boxplot of the target variable

# Boxplots of the variables highly correlated with the target
train_number = train.select_dtypes(include=[np.number]).drop(['Id'], axis=1)

corr = train.corr()
corr_threshvalue = 0.5   # correlation threshold
corr_cols = corr.loc[:, corr.loc['SalePrice', :].abs() > corr_threshvalue].columns  # columns whose |corr| with SalePrice exceeds the threshold

train_number[corr_cols].plot(sym='b*', kind='box', subplots=True, figsize=(16, 4))

[Figure: boxplots of the variables highly correlated with SalePrice]

# Find the lower and upper outlier cut-offs
clear_number_q = train_number.quantile(q=[0, 0.05, 0.25, 0.5, 0.75, 0.95, 1], axis=0)

# outlier_expand: multiplier applied to the IQR when defining outliers (the textbook value is 1.5)
lower_outlier_expand = 1.5
upper_outlier_expand = 3
clear_number_q.loc['lower_outlier', :] = clear_number_q.loc[0.25, :] \
    - (clear_number_q.loc[0.75, :] - clear_number_q.loc[0.25, :]) * lower_outlier_expand
clear_number_q.loc['upper_outlier', :] = clear_number_q.loc[0.75, :] \
    + (clear_number_q.loc[0.75, :] - clear_number_q.loc[0.25, :]) * upper_outlier_expand

# Count, for each column, how many values lie above the upper cut-off
uppper_cnt = (train_number > clear_number_q.loc['upper_outlier', :]).sum()
uppper_cnt

# Count, for each column, how many values lie below the lower cut-off
lower_cnt = (train_number < clear_number_q.loc['lower_outlier', :]).sum()
lower_cnt

# Summarise the number of outliers per column
pd.DataFrame({'uppper_cnt': uppper_cnt,
              'lower_cnt': lower_cnt,
              'all': uppper_cnt + lower_cnt,
              'all_rate': 100 * (uppper_cnt + lower_cnt) / train.shape[0],
              'corr': corr.loc['SalePrice', :]
              }).sort_values(by='corr', ascending=False)
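As a concrete illustration of the asymmetric IQR rule above, here is a small toy example with made-up values (not the project data): with Q1 and Q3 computed from the series, the lower cut-off is Q1 - 1.5 * IQR and the upper cut-off is Q3 + 3 * IQR, and anything outside that range is counted as an outlier.

import pandas as pd

# Toy example of the asymmetric IQR rule (made-up values).
s = pd.Series([90, 100, 120, 150, 180, 200, 210, 900])   # 900 is an obvious outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
lower_cutoff = q1 - 1.5 * iqr
upper_cutoff = q3 + 3 * iqr
print(lower_cutoff, upper_cutoff)
print(((s < lower_cutoff) | (s > upper_cutoff)).sum())    # number of outliers in the toy series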

4.1 Handling the y variable: the key variable

# Cap SalePrice values beyond the cut-offs at the upper / lower bound (.loc replaces the deprecated .ix)
train.loc[train['SalePrice'] > clear_number_q.loc['upper_outlier', 'SalePrice'], 'SalePrice'] = clear_number_q.loc['upper_outlier', 'SalePrice']
train.loc[train['SalePrice'] < clear_number_q.loc['lower_outlier', 'SalePrice'], 'SalePrice'] = clear_number_q.loc['lower_outlier', 'SalePrice']

# Bivariate analysis of SalePrice vs. GrLivArea -- a key variable
var = 'GrLivArea'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));

# Deleting the outlying points
train.sort_values(by='GrLivArea', ascending=False)[:2]

# Drop the rows with Id 1299 and 524
train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)

4.2 Uniform treatment of the remaining variables

clear_number_para = train_number.drop('SalePrice', axis=1).columns   # all numeric feature columns except the target

# Cap every numeric feature at its upper cut-off ...
train[clear_number_para] = train[clear_number_para]\
    .where(train[clear_number_para] < clear_number_q.loc['upper_outlier', :],
           clear_number_q.loc['upper_outlier', :], axis=1)

# ... and floor it at its lower cut-off
train[clear_number_para] = train[clear_number_para]\
    .where(train[clear_number_para] > clear_number_q.loc['lower_outlier', :],
           clear_number_q.loc['lower_outlier', :], axis=1)
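The two where calls cap each column from above and below. An equivalent formulation, shown here only as a sketch using the cut-off rows computed earlier, is DataFrame.clip, which may read more directly:

# Equivalent capping with clip: values below the lower cut-off are raised to it,
# values above the upper cut-off are lowered to it, column by column.
train[clear_number_para] = train[clear_number_para].clip(
    lower=clear_number_q.loc['lower_outlier', clear_number_para],
    upper=clear_number_q.loc['upper_outlier', clear_number_para],
    axis=1)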

train[corr_cols].plot(sym='b*', kind='box', subplots=True, figsize=(20, 8))

[Figure: boxplots of the high-correlation variables after outlier capping]

