逻辑回归—kaggle糖尿病预测
import warnings

import numpy as np

# Suppress all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Load the CSV as a 2-D float array, skipping the header row.
# The builtin ``float`` replaces the ``np.float`` alias, which was deprecated
# in NumPy 1.20 and removed in 1.24 (it was always just the builtin float).
data = np.loadtxt(r"C:\践\pima-indians-diabetes.data.csv", delimiter=",", skiprows=1, dtype=float)
data
array([[ 6. , 148. , 72. , ..., 0.627, 50. , 1. ],
[ 1. , 85. , 66. , ..., 0.351, 31. , 0. ],
[ 8. , 183. , 64. , ..., 0.672, 32. , 1. ],
...,
[ 5. , 121. , 72. , ..., 0.245, 30. , 0. ],
[ 1. , 126. , 60. , ..., 0.349, 47. , 1. ],
[ 1. , 93. , 70. , ..., 0.315, 23. , 0. ]])
# Split the table: every column except the last is a feature, the last
# column is the class label.
X, y = data[:, :-1], data[:, -1]
# Standardize each feature column (subtract its mean, divide by its std).
mu, std = X.mean(axis=0), X.std(axis=0)
X = (X - mu) / std
# Append a column of ones as the last feature (bias / intercept term).
x_ones = np.ones((len(X), 1))
X = np.concatenate((X, x_ones), axis=1)
# Hold out 30% of the samples as a test set; the fixed random_state makes
# the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=8
)
# Convert both label arrays into single-column (n, 1) arrays and show the
# resulting shapes.
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
print(y_train.shape, y_test.shape)
(537, 1) (231, 1)
# Initialise the weight vector to all ones: one row per column of X_train.
theta = np.full((X_train.shape[1], 1), 1.0)
theta
array([[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.]])
# Step size (learning rate) for the gradient-descent updates.
alpha = 1e-3
# Logistic (sigmoid) function.
def sigmoid(z):
    """Return the element-wise logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: A scalar, NumPy array, or array-like (e.g. a list) of numbers.

    Returns:
        The sigmoid of ``z``, between 0 and 1, with the same shape as ``z``.
    """
    # asarray lets plain Python lists be negated; arrays pass through as-is.
    z = np.asarray(z)
    s = 1.0 / (1 + np.exp(-z))
    return s
# Batch gradient descent for logistic regression.
num_iters = 10000
# NOTE(review): m divides the summed gradient, so it normally equals the
# number of training rows. It is hard-coded to 200 while the training set
# printed earlier has 537 rows, which effectively just rescales alpha.
# Kept as-is so the recorded results stay reproducible; consider
# X_train.shape[0] instead.
m = 200
for i in range(num_iters):
    # Predicted probabilities under the current weights.
    h = sigmoid(np.dot(X_train, theta))
    # Step against the gradient X^T (h - y), scaled by alpha / m.
    theta = theta - alpha * np.dot(X_train.T, (h - y_train)) / m
# Show the learned weights once training has finished.
print(theta)
[[ 0.39210287]
[ 1.10657783]
[-0.24092243]
[ 0.0223229 ]
[-0.17137676]
[ 0.61819121]
[ 0.45880179]
[ 0.12971106]
[-0.84498429]]
# Predicted probabilities for the test samples.
pred_y = sigmoid(X_test @ theta)
# Threshold at 0.5: strictly greater becomes class 1, otherwise class 0.
is_positive = pred_y > 0.5
is_negative = pred_y <= 0.5
pred_y[is_positive] = 1
pred_y[is_negative] = 0
# Print the predictions as a single row.
print(pred_y.reshape((1, -1)))
[[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]]
# Print the true test labels as a single row.
print(np.reshape(y_test, (1, -1)))
[[0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1.
0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0.
1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1.
1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1.]]
# Accuracy: fraction of test samples whose thresholded prediction equals the
# true label (both arrays are (n, 1) column vectors here).
print("预测准确率为:", np.mean(pred_y == y_test))
预测准确率为: 0.7878787878787878
sklearn 实现逻辑回归
导入数据、分离特征与标签、标准化以及拆分训练集和测试集的步骤同上。
from sklearn.linear_model import LogisticRegression

# Create the estimator with scikit-learn's default settings.
logist = LogisticRegression()
# Train the model. fit() expects 1-D labels of shape (n_samples,), so flatten
# y_train in case it is an (n, 1) column vector; ravel is a no-op for 1-D input.
logist.fit(X_train, np.ravel(y_train))
# Predict class labels for the test samples.
y_predict = logist.predict(X_test)
print(y_predict)
[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]
# Compute the model accuracy. predict() returns a 1-D array, so y_test is
# flattened first: comparing a (n,) array with a (n, 1) column vector would
# broadcast to an (n, n) matrix and give a meaningless ratio.
print("准确率:", np.mean(y_predict == np.ravel(y_test)))
准确率: 0.7792207792207793