利用贝叶斯分类器对邮件进行垃圾邮件分类(email 文件夹),并利用简单交叉验证法计算模型的错误率(划分数据集时 20% 作为测试集)
import numpy as np
def createVocabList(dataSet):
    """Return the deduplicated vocabulary (as a list) across all documents.

    dataSet -- iterable of documents, each a list of word tokens.
    An empty dataSet yields an empty vocabulary.
    """
    # Union of every document's token set in one call; order of the
    # resulting list is unspecified (it comes from a set), as before.
    return list(set().union(*dataSet))
#myvocabList=createVocabList(listOPosts)
#print('特征属性:',myvocabList)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words 0/1 feature vector.

    vocabList -- the vocabulary (list of words); defines vector positions.
    inputSet  -- the document's tokens.
    Returns a list of len(vocabList) where position i is 1 iff vocabList[i]
    occurs in inputSet.  Unknown words are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    # Fix: build a word->index map once (O(V + D)) instead of calling
    # list.index() inside the loop, which made this O(V * D).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print("the word:%s is not in my Vocabulary!" % word)
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes parameters from training vectors.

    trainMatrix   -- list/array of 0/1 set-of-words vectors.
    trainCategory -- matching labels (1 = abusive/spam, 0 = normal).
    Returns (p0Vect, p1Vect, pAbusive): per-word log conditional
    probabilities for each class and the prior P(class=1).
    """
    docCount = len(trainMatrix)
    pAbusive = sum(trainCategory) / float(docCount)
    matrix = np.asarray(trainMatrix)
    labels = np.asarray(trainCategory)
    spamRows = matrix[labels == 1]
    hamRows = matrix[labels != 1]
    # Laplace smoothing: word counts start at 1, denominators at 2,
    # so unseen words never produce a zero probability; log() keeps the
    # later products from underflowing.
    p1Vect = np.log((1 + spamRows.sum(axis=0)) / (2 + spamRows.sum()))
    p0Vect = np.log((1 + hamRows.sum(axis=0)) / (2 + hamRows.sum()))
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one set-of-words vector with the trained NB parameters.

    vec2Classify -- 0/1 word vector for the document to classify.
    p0Vec, p1Vec -- per-word LOG probabilities from trainNB0.
    pClass1      -- prior probability of class 1.
    Returns 1 if the class-1 log posterior wins, else 0.

    Bug fix: the prior must enter the score in log space.  The original
    added the raw probability (pClass1 / 1.0-pClass1) to sums of
    log-likelihoods, mixing linear and log scales and biasing the decision.
    """
    # NOTE(review): pClass1 of exactly 0 or 1 yields log(0) = -inf, which
    # still orders correctly; upstream priors are strictly inside (0, 1).
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
from sklearn.model_selection import train_test_split#实现随即划分训练集以及测试集
#预处理-分词(分割文本字符串)
def splitWords(filename):
    """Read a UTF-8 text file and tokenize it.

    Splits on runs of non-word characters, drops tokens of length <= 2,
    and lower-cases the rest.  Returns the list of tokens.
    """
    import re
    # Fix: 'with' guarantees the file handle is closed (the original
    # leaked it by calling open().read() without closing).
    with open(filename, 'r', encoding='utf8') as fh:
        bigString = fh.read()
    # Fix: raw string r'\W+' avoids the invalid-escape-sequence warning.
    # Separator: any run (length >= 1) of non-alphanumeric characters.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
#训练模型以及利用交叉验证法测试模型效果
def spamTest():
    """Train the naive-Bayes spam filter on the email/ corpus and report
    the hold-out error rate (20% of the mails form the test set).

    NOTE(review): assumes email/spam and email/ham contain the same
    number of files, named 1.txt .. m.txt -- confirm against the data
    directory before relying on this.
    """
    # Preprocessing: collect every mail's token list and its label.
    from os import listdir
    m=len(listdir('email/spam'))  # number of spam files
    #print(m)
    docList=[];classList=[]
    for i in range(1,m+1):
        wordList=splitWords('email/spam/%d.txt'%i)  # tokens of one mail
        docList.append(wordList)
        classList.append(1)  # label 1 = spam
        wordList=splitWords('email/ham/%d.txt'%i)  # tokens of one mail
        docList.append(wordList)
        classList.append(0)  # label 0 = ham (normal mail)
    # Feature attributes: the vocabulary over all mails.
    vocabList=createVocabList(docList)
    print("特征:",len(vocabList))
    # Turn every mail into a 0/1 set-of-words vector.
    train_test_matrix=[]
    for WList in docList:
        print(WList)
        train_test_matrix.append(setOfWords2Vec(vocabList,WList))
    print("词汇向量:",len(train_test_matrix))
    # Hold-out split: train_test_split(data, labels, test_size=0.2)
    # randomly keeps 20% of the mails aside as the test set.
    x_train,x_test,y_train,y_test=train_test_split(train_test_matrix,classList,test_size=0.2)
    # Estimate the model parameters p0, p1, pA from the training split.
    p0Vect,p1Vect,pAbusive=trainNB0(x_train,y_train)  # word vectors, labels
    print("p0Vect:",p0Vect,"p1Vect:",p1Vect,"pAbusive:",pAbusive)
    # Classify each held-out mail with the NB classifier.
    i=0;errcount=0
    for testVec in x_test:
        Clabel=classifyNB(testVec,p0Vect,p1Vect,pAbusive)  # predicted label
        print(Clabel,y_test[i])
        # Count misclassifications for the error rate, and show which
        # (deduplicated) words the misclassified mail contained.
        if Clabel!=y_test[i]:
            errcount+=1
            print('去重后:',[vec for vec in vocabList if testVec[vocabList.index(vec)]!=0])
        #l=[i for i in range(50) if train_test_matrix[i] == testvec]
        #for j in l:
        #    print("没有去重后的:",docList[j])
        i+=1
    print('错误率:',errcount/len(x_test))  # len(x_test) = number of test mails
# Script entry point: run the full train/evaluate pipeline on import.
spamTest()
输出如下: