缺失值分析处理
可视化&数据分析
代码
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- importwarnings
- warnings.filterwarnings('ignore')
- train = pd.read_csv("train.csv")
- test = pd.read_csv("test.csv")
- gender= pd.read_csv("gender.csv")
- print(train.head())
- print(test.head())
- print(gender.head())
- data=pd.concat([train,test],ignore_index=True)
- data
- print(data.info())
- print(data.isnull().sum())#Cabin缺失数据较多,可直接删除
- data.describe()#查看年龄统计数据,使用年龄的平均值填充空白值
- data.Age=data.Age.fillna(data.Age.mean())#用平均值填充年龄
- data
- data[data.Fare.isnull()]
- data.Fare=data.Fare.fillna(data[(data.Embarked =='S')&(data.Pclass ==3)].Fare.median())
- data[data.Embarked.isnull()]
- data.groupby(by=['Pclass','Embarked']).Fare.median()
- data.groupby(by=['Pclass','Embarked']).Survived.count()
- data.Embarked=data.Embarked.fillna('C')
- data.iloc[61]
- data=data.drop('Cabin',axis=1)#Cabin数据缺失将近70%,删除Cabin列。
- data
- print(data.isnull().sum())
- sns.barplot(x='Pclass',y='Survived',data=data)
- plt.subplots(figsize=(15,8))
- sns.kdeplot(data.loc[(data['Survived']==0),'Pclass'],shade=True,color='red',label='Not Survived')
- sns.kdeplot(data.loc[(data['Survived']==1),'Pclass'],shade=True,color='blue',label='Survived')
- labels=['1','2','3']
- plt.xticks(sorted(data.Pclass.unique()),labels)
- plt.show()
- sns.barplot(x='Sex',y='Survived',data=data)#柱状图显示的是Survived的均值(生存率),女性生存率高于男性
- sns.barplot(x='Parch',y='Survived',data=data)
- sns.barplot(x='SibSp',y='Survived',data=data)
- sns.barplot(x='Embarked',y='Survived',data=data)