0
点赞
收藏
分享

微信扫一扫

连续型特征按固定区间/分位数分箱

连续型特征按固定区间/分位数分箱

# Bin prices into fixed, hand-picked intervals
def price_cut_bins(s):
    """Map each price to the label of the fixed interval it falls into.

    A Series with 10 or fewer distinct values (NaN counted as one value) is
    returned unchanged. Otherwise ``-1`` is replaced with NaN and the values
    are cut into right-closed intervals (the first interval also includes 0).

    Args:
        s: pandas Series of prices.

    Returns:
        ``s`` itself when it has at most 10 distinct values; otherwise a
        numeric Series holding the bin label of each value. Values outside
        every interval, and missing values, are NaN (they are not re-encoded
        as -1).
    """
    # Low-cardinality input is passed through untouched.
    if s.nunique(dropna=False) <= 10:
        return s

    edges = [0, 100, 200, 300, 500, 700, 1000, 1500, 2000, 2500, 99999999]
    # One label per interval; all but the last equal the interval's upper edge.
    bin_labels = [100, 200, 300, 500, 700, 1000, 1500, 2000, 2500, 99999]

    cleaned = s.replace(-1, np.nan)
    binned = pd.cut(cleaned, bins=edges, labels=bin_labels, include_lowest=True)
    # pd.cut yields a categorical; convert the labels back to plain numbers.
    return pd.to_numeric(binned)

# Bin by quantiles of the data itself
def perc_cut_bins(s):
    """Bin a numeric Series at its own quantiles, labelling bins by left edge.

    A Series with 21 or fewer distinct values (NaN counted as one value) is
    returned unchanged. Otherwise ``-1`` is replaced with NaN, bin edges are
    taken from a fixed list of quantile levels of the remaining values
    (rounded to 6 decimals and de-duplicated), and each value is mapped to
    the left edge of the right-closed bin it falls into.

    Args:
        s: pandas Series of numeric values.

    Returns:
        ``s`` itself when it has at most 21 distinct values; otherwise a
        numeric Series holding the left edge of each value's bin, with NaN
        for missing inputs.
    """
    import logging

    # Low-cardinality input is passed through untouched.
    if s.nunique(dropna=False) <= 21:
        return s

    cleaned = s.replace(-1, np.nan)

    # Level 0 (the minimum) must be an edge: without it every value below the
    # 2.5% quantile falls outside all bins and is silently turned into NaN.
    quantile_levels = [
        0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.25, 0.3,
        0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.8, 0.95, 1,
    ]
    # The large sentinel keeps at least two distinct edges even when ties
    # collapse the quantiles (as long as the data stay below the sentinel).
    raw_edges = list(cleaned.quantile(quantile_levels)) + [999999999]
    edges = sorted({round(edge, 6) for edge in raw_edges})
    # Each bin is labelled with its (already rounded) left edge.
    labels = edges[:-1]
    logging.getLogger(__name__).debug("perc_cut_bins edges: %s", edges)

    binned = pd.cut(cleaned, bins=edges, labels=labels, include_lowest=True)
    # pd.cut yields a categorical; convert the labels back to plain numbers.
    return pd.to_numeric(binned)

举报

相关推荐

0 条评论