会用到的库
import seaborn as sns
import matplotlib.pyplot as plt
数据相关性热力图
# Heatmap of the pairwise correlations between the columns of `data`.
plt.figure(figsize=(20, 20))
corr_matrix = data.corr()
sns.heatmap(
    corr_matrix,
    annot=False,   # no per-cell numbers are drawn
    cmap="BuPu",
    alpha=0.5,
    fmt=".4f",     # number format for annotations
    cbar=True,
)
plt.show()
查看某一特征的概率密度曲线
# Plot the distribution (histogram plus KDE curve) of CNT_INSTALMENT,
# ignoring missing values.
plt.figure(figsize=(12, 5))
plt.title("Distribution of CNT_INSTALMENT")
# sns.distplot is deprecated; histplot with kde=True and density
# normalisation is the replacement recommended by seaborn.
ax = sns.histplot(pos_5K.CNT_INSTALMENT.dropna(), kde=True, stat="density")
plt.show()
查看数据总体的缺失值情况
# Fraction of missing values per column of `train`, drawn as a line in
# ascending order with vertical x tick labels.
missing_value_train = train.isna().mean()
plt.subplots(figsize=(20, 15))
plt.xticks(rotation=90)
sorted_missing = missing_value_train.sort_values()
plt.plot(sorted_missing)
# Print the features whose missing rate is below 20% (candidates for imputation).
print('缺失率在百分之20以内的特征')
missing = df.isnull().mean()
# Turn the per-column missing rate into a two-column frame:
#   'id'   -> feature (column) name
#   'rate' -> fraction of missing values, sorted ascending
missing = missing.sort_values().rename_axis('id').reset_index(name='rate')
for feature in missing.loc[missing['rate'] < 0.2, 'id']:
    print(feature)
# Overview of the missing-value pattern of `view`, with column labels shown.
import missingno as msno

msno.matrix(
    view,
    labels=True,
)
横向柱状图高级设置(pyecharts)
from pyecharts import options as opts
from pyecharts.charts import Bar
%matplotlib notebook
# Horizontal bar chart: one bar per `a.id`, bar length taken from `a.rate`.
l1 = list(map(str, a.id))
l2 = a.rate.tolist()

bar = Bar(init_opts=opts.InitOpts(width="500px", height="2000px"))  # canvas width / height
bar.add_xaxis(l1)
bar.add_yaxis("l2", l2)
bar.reversal_axis()  # swap the x and y axes so the bars run horizontally

# Value labels on the bars are hidden.
hidden_value_labels = opts.LabelOpts(position="right", is_show=False)
bar.set_series_opts(label_opts=hidden_value_labels)

# NOTE(review): rotate=500 looks unusually large for a label angle — confirm it is intended.
x_axis_options = opts.AxisOpts(
    axislabel_opts=opts.LabelOpts(rotate=500, font_size=20),  # label font size
    interval=0.1,
)
y_axis_options = opts.AxisOpts(
    axislabel_opts=opts.LabelOpts(rotate=20, font_size=10),  # label font size
    interval=0.01,
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="横向柱状图"),
    xaxis_opts=x_axis_options,
    yaxis_opts=y_axis_options,
)
bar.render_notebook()
plt图像保存
# Save the current matplotlib figure to the relative path 'name.png'.
# NOTE(review): presumably this must run before plt.show(), otherwise the
# saved image may be blank — confirm against the backend in use.
plt.savefig('name.png')
查看测试集与训练集中的特征分布
# Feature distributions: overlay the train and test KDE of every feature.
# Tag each row with the set it came from, then stack both sets.
train["oringin"] = "train"
test["oringin"] = "test"
data = pd.concat([train, test], axis=0, ignore_index=True)

is_train_row = data["oringin"] == "train"
is_test_row = data["oringin"] == "test"

# The final column is skipped.
# NOTE(review): assumes the final column is the "oringin" tag — confirm that
# test has no columns that train lacks.
for column in data.columns[0:-1]:
    # Both curves go on the same axes; fill= replaces the deprecated shade=.
    g = sns.kdeplot(data.loc[is_train_row, column], color="Red", fill=True)
    g = sns.kdeplot(data.loc[is_test_row, column], ax=g, color="Green", fill=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g.legend(["train", "test"])
    # One figure per feature.
    plt.show()
查看缺失值与伪缺失值(例如 NaN 与 -999)
多图像同时输出 & 解决中文显示为方框(口)的乱码问题
import seaborn as sns
import matplotlib.pyplot as plt
# Four side-by-side KDE plots of per-user record counts: the test month
# followed by three training months.

# Font settings so CJK text and the minus sign render instead of empty boxes.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

fig, ax = plt.subplots(1, 4, figsize=(16, 6))  # one row, four panels

# Only rows with count < 12 are plotted; filter once and reuse.
low_cnt = admin_cnt[admin_cnt['count'] < 12]

# Panel 0: index key (0, '2021-10-01'), identified by a title.
sns.kdeplot(low_cnt.loc[(0, '2021-10-01')]['count'].values, ax=ax[0]).set_title('测试集--十月用户记录数')
ax[0].set(xlabel='用户记录数')

# Panels 1-3: index keys (1, <month>), each identified by a legend entry.
# The legend is created after the curve is drawn so it has an artist to
# attach to, and each label names the month actually plotted.
train_panels = [
    ('2021-09-01', '训练集九月'),
    ('2021-08-01', '训练集八月'),
    ('2021-07-01', '训练集七月'),
]
for axis, (month, label) in zip(ax[1:], train_panels):
    sns.kdeplot(low_cnt.loc[(1, month)]['count'].values, ax=axis)
    axis.set(xlabel='用户记录数')
    axis.legend(labels=[label], loc="upper right")
按数据集标记(is_train)和时间(a3)分组,统计某一列(或多列)特征各取值出现的次数,以及该次数在所属分组内所占的比例
def groupby_cnt_ratio(df, col, group_keys=('is_train', 'a3')):
    """Count rows per (group_keys + col) combination and their share per group.

    Args:
        df: DataFrame containing the ``group_keys`` columns and ``col``.
        col: A column name, or an iterable of column names, to count within
            each group.
        group_keys: Columns that define the outer groups. Ratios are computed
            relative to the total row count of each outer group. Defaults to
            ``('is_train', 'a3')``.

    Returns:
        DataFrame indexed by ``group_keys + col`` with two columns:
        ``count`` (rows in that combination) and ``count_ratio``
        (``count`` divided by the total ``count`` of its outer group),
        sorted by ``count`` in descending order.
    """
    extra_keys = [col] if isinstance(col, str) else list(col)
    outer_keys = list(group_keys)
    key = outer_keys + extra_keys

    # Row count for every full key combination.
    cnt_stat = df.groupby(key).size().to_frame('count')

    # Total count of the outer group, broadcast back to every row, so the
    # ratio is a plain element-wise division (no merge needed).
    group_total = cnt_stat.groupby(level=outer_keys)['count'].transform('sum')
    cnt_stat['count_ratio'] = cnt_stat['count'] / group_total

    return cnt_stat.sort_values(by=['count'], ascending=False)
# Example
admin_cnt = groupby_cnt_ratio(df, 'core_cust_id')# per (is_train, a3) group of df: how many records each core_cust_id has and its share of the group (a3 is presumably a time/month key — confirm)