Contents
1. Data Preparation
2. Modeling Without Resampling (Baseline)
3. Modeling with under_sampling
3.1 under_sampling: NearMiss
3.2 under_sampling: RandomUnderSampler
4. Modeling with over_sampling
4.1 over_sampling: SMOTETomek
4.2 over_sampling: RandomOverSampler
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import OneClassSVM
from pylab import rcParams
from sklearn.metrics import precision_score
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]
1. Data Preparation
data = pd.read_csv('creditcard.csv',sep=',')
data.info()
'''Result:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
'''
Y = data['Class']
Y
'''Result:
0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64
'''
X = data.drop('Class',axis=1,inplace=False)
print(X.shape)
print(Y.shape)
# Result: (284807, 30)
# Result: (284807,)
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind = 'bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency")
## Get the Fraud and the normal dataset
fraud = data[data['Class']==1]  # fraud transactions
normal = data[data['Class']==0]
print(fraud.shape,normal.shape)
# Result: (492, 31) (284315, 31)
# The two classes are extremely imbalanced.
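For reference, the degree of imbalance can be quantified directly from the two subsets above; a minimal sketch reusing the data, fraud and normal objects defined earlier:
# Share of fraudulent transactions and the normal:fraud ratio
fraud_ratio = len(fraud) / len(data)
print("Fraud cases: %d (%.3f%% of all transactions)" % (len(fraud), fraud_ratio * 100))
print("Imbalance ratio (normal : fraud) = %.0f : 1" % (len(normal) / len(fraud)))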
2. Modeling Without Resampling (Baseline)
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Result: Accuracy: 99.95%
confusion_matrix(y_true=y_test, y_pred=y_pred)
'''Result:
array([[85297,    11],
       [   31,   104]], dtype=int64)
'''
precision_score(y_test, y_pred)
# Result: 0.9043478260869565
# Predictions for class 0 are fairly accurate, but the class we actually need to catch is 1, the fraud cases.
# Without any resampling the model is accurate on class 0 and noticeably weaker on class 1.
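Since recall on the fraud class is the metric that really matters here, the classification_report already imported at the top gives a fuller picture than accuracy alone; a short sketch using the baseline predictions above:
from sklearn.metrics import recall_score
print(classification_report(y_test, y_pred, target_names=LABELS))
print("Fraud recall: %.4f" % recall_score(y_test, y_pred))  # fraction of actual fraud cases that are caught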
from matplotlib import pyplot as plt
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)
labels = ['Class 0', 'Class 1']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('Expected')
plt.show()
3. Modeling with under_sampling
3.1 under_sampling: NearMiss
from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res,y_res=nm.fit_resample(X,Y)
X_res.shape,y_res.shape
# Result: ((984, 30), (984,))
from collections import Counter
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))
'''Result:
Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 492, 1: 492})
Under-sampling handles the imbalance by shrinking the majority class down to the size of the minority class.
'''
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=1)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Result: Accuracy: 95.61%
confusion_matrix(y_true=y_test, y_pred=y_pred)
'''Result:
array([[140,   2],
       [ 11, 143]], dtype=int64)
'''
precision_score(y_test, y_pred)
# Result: 0.986206896551724
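NearMiss actually implements three selection heuristics; version=1 is the default used above. The other versions can be tried the same way, as in this optional sketch (the resulting majority-class count may differ for version 3):
nm3 = NearMiss(version=3)  # version 3 uses a different neighbour-based rule for picking majority samples
X_res3, y_res3 = nm3.fit_resample(X, Y)
print('Resampled dataset shape {}'.format(Counter(y_res3)))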
3.2 under_sampling: RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_res, y_res = rus.fit_resample(X, Y)
X_res.shape,y_res.shape
# Result: ((984, 30), (984,))
from collections import Counter
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))
'''Result:
Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 492, 1: 492})
'''
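The samplers also accept a sampling_strategy argument, so the classes do not have to be forced to an exact 1:1 ratio; for a binary problem a float gives the desired minority-to-majority ratio after resampling. A hedged sketch, not run in the output above:
rus_half = RandomUnderSampler(sampling_strategy=0.5, random_state=0)  # keep two normal samples per fraud sample
X_half, y_half = rus_half.fit_resample(X, Y)
print('Resampled dataset shape {}'.format(Counter(y_half)))  # should give Counter({0: 984, 1: 492})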
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=1)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Result: Accuracy: 94.93%
confusion_matrix(y_true=y_test, y_pred=y_pred)
'''Result:
array([[139,   3],
       [ 12, 142]], dtype=int64)
'''
precision_score(y_test, y_pred)
# Result: 0.9793103448275862
Evaluation: on this dataset the two under-sampling methods behave similarly (NearMiss: 95.61% accuracy, 0.986 precision; RandomUnderSampler: 94.93% accuracy, 0.979 precision). Both keep only 984 of the 284,807 transactions, however, so most of the information about normal transactions is discarded.
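One caveat with the workflow in this section is that resampling is applied to the full dataset before the train/test split, so the test data no longer reflect the original class distribution. A leakage-safe alternative, sketched below under the assumption that the original X and Y are still in memory, is imblearn's Pipeline, which re-fits the sampler on the training folds only:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

under_pipe = Pipeline([
    ('undersample', RandomUnderSampler(random_state=0)),
    ('clf', XGBClassifier()),
])
# 5-fold cross-validated recall on the fraud class, computed on the original imbalanced data;
# the sampler only touches each training fold, so every validation fold keeps its true distribution.
scores = cross_val_score(under_pipe, X, Y, scoring='recall', cv=5)
print("Mean fraud recall: %.4f" % scores.mean())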
4. Modeling with over_sampling
4.1 over_sampling: SMOTETomek
from imblearn.combine import SMOTETomek
# Implementing over-sampling to handle the imbalanced data
smk = SMOTETomek(random_state=42)
X_res,y_res=smk.fit_resample(X,Y)
X_res.shape,y_res.shape
# Result: ((567562, 30), (567562,))
from collections import Counter
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))
'''Result:
Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 283781, 1: 283781})
Over-sampling grows the minority (fraud) class until it matches the majority class; the Tomek-links cleaning step in SMOTETomek then removes a small number of samples from both classes, which is why both counts end up at 283781 rather than 284315.
'''
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=1)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Result: Accuracy: 99.98%
confusion_matrix(y_true=y_test, y_pred=y_pred)
'''Result:
array([[85257,    36],
       [    0, 84976]], dtype=int64)
'''
precision_score(y_test, y_pred)
# Result: 0.9995765303721827
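If the Tomek-links cleaning step is not wanted, plain SMOTE from imblearn can be used on its own; a short optional sketch (not run above), which should leave the majority class untouched:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, Y)
print('Resampled dataset shape {}'.format(Counter(y_sm)))  # expected Counter({0: 284315, 1: 284315})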
4.2 over_sampling: RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=10)  # 'ros' avoids shadowing the built-in os module
X_res, y_res = ros.fit_resample(X, Y)
X_res.shape,y_res.shape
# Result: ((568630, 30), (568630,))
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))
'''Result:
Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 284315, 1: 284315})
'''
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=1)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Result: Accuracy: 99.99%
confusion_matrix(y_true=y_test, y_pred=y_pred)
'''Result:
array([[85412,    16],
       [    0, 85161]], dtype=int64)
'''
precision_score(y_test, y_pred)
# Result: 0.9998121558636721
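A final caution: the near-perfect scores in sections 4.1 and 4.2 are partly an artefact of over-sampling before the split, because duplicated (or interpolated) fraud samples can land in both the training and the test set. A more honest check, sketched here with the names used earlier, is to split first and resample only the training portion:
from sklearn.metrics import recall_score
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)
X_train_res, y_train_res = RandomOverSampler(random_state=10).fit_resample(X_train, y_train)
model = XGBClassifier()
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)  # evaluated on an untouched, still-imbalanced test set
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Precision: %.4f  Recall: %.4f" % (precision_score(y_test, y_pred), recall_score(y_test, y_pred)))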