1 前置知识
- 逻辑回归解决二分类问题。
- sigmoid函数:$\sigma(z)=\dfrac{1}{1+e^{-z}}$,是非线性函数,值域为(0,1),其输出可解释为样本属于正类的概率值。


2 逻辑回归原理


3 癌症分类案例
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Load the data. The CSV is expected to have a header row containing a
# 'Class' column (it is selected by name below).
data = pd.read_csv('breast-cancer-wisconsin.csv')
# Basic cleaning: treat '?' entries as missing and drop the affected rows.
# np.nan is used instead of np.NAN because that alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
data = data.dropna()
# Features: every column except the first and the last one
# (presumably an id column and the label column -- confirm against the CSV).
x = data.iloc[:, 1:-1]
# Target: the 'Class' column.
y = data['Class']
# Split into train / test sets; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
# Feature engineering (standardization): fit the scaler on the training set only,
# then apply the same transformation to the test set.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# Train the logistic regression model.
LR = LogisticRegression()
LR.fit(x_train, y_train)
# Model evaluation: predicted labels and mean accuracy on the test set.
y_predict = LR.predict(x_test)
print(y_predict)
acc = LR.score(x_test, y_test)
print(acc)

4 分类问题评估
4.1 混淆矩阵

4.2 精确率&召回率



from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
# The alias must be `pd`: the DataFrame construction below refers to `pd`.
import pandas as pd
# Ground-truth labels
y_true = ['恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '良性', '良性', '良性', '良性']
# Predictions of two models, A and B
y_predict_A = ['恶性', '恶性', '恶性', '良性', '良性', '良性', '良性', '良性', '良性', '良性']
y_predict_B = ['恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '良性']
labels = ['恶性', '良性']
# Build the confusion matrices (rows: true label, columns: predicted label,
# both ordered as in `labels`).
m_A = confusion_matrix(y_true, y_predict_A, labels=labels)
m_B = confusion_matrix(y_true, y_predict_B, labels=labels)
df_A = pd.DataFrame(data=m_A, columns=labels, index=labels)
df_B = pd.DataFrame(data=m_B, columns=labels, index=labels)
# Confusion matrices
print(df_A)
print(df_B)
print('*' * 20)
# Accuracy
print(accuracy_score(y_true, y_predict_A))
print(accuracy_score(y_true, y_predict_B))
print('*' * 20)
# Precision, with '恶性' as the positive class
print(precision_score(y_true, y_predict_A, pos_label='恶性'))
print(precision_score(y_true, y_predict_B, pos_label='恶性'))
print('*' * 20)
# Recall, with '恶性' as the positive class
print(recall_score(y_true, y_predict_A, pos_label='恶性'))
print(recall_score(y_true, y_predict_B, pos_label='恶性'))
# F1 score, with '恶性' as the positive class
print('*' * 20)
print(f1_score(y_true, y_predict_A, pos_label='恶性'))
print(f1_score(y_true, y_predict_B, pos_label='恶性'))
