# 利用贝叶斯分类器对模型进行垃圾邮件分类(email文件)以及利用简单交叉法计算模型的错误率(划分数据集时20%作为测试集)
# (Naive Bayes spam classification over the email/ corpus; error rate measured
#  by a simple hold-out split with 20% of the data used as the test set.)
import numpy as np
def createVocabList(dataSet):
    """Build the deduplicated vocabulary list over all documents.

    dataSet is a list of documents, each a list of tokens; the return
    value is a list of every distinct token seen (order unspecified).
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
#myvocabList=createVocabList(listOPosts)
#print('特征属性:',myvocabList)
def setOfWords2Vec(vocabList, inputSet):
    """Encode inputSet as a binary presence vector over vocabList.

    Returns a list of 0/1 flags, one per vocabulary word; words of
    inputSet missing from the vocabulary are reported on stdout.
    """
    vec = [0] * len(vocabList)
    for token in inputSet:
        try:
            vec[vocabList.index(token)] = 1
        except ValueError:
            # token not in the vocabulary
            print("the word:%s is not in my Vocabulary!" % token)
    return vec
def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes model from binary word vectors.

    trainMatrix: list of equal-length 0/1 word vectors (one per document).
    trainCategory: list of labels, 1 = abusive/spam, 0 = normal.

    Returns (p0Vect, p1Vect, pAbusive):
    p0Vect/p1Vect are per-word LOG conditional probabilities for class
    0/1, pAbusive is the prior probability of class 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior of the abusive class: fraction of documents labelled 1.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace (add-one) smoothing: counts start at 1 and denominators at 2
    # so an unseen word never yields probability 0 (and log(0) = -inf).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    # Work in log space so that multiplying many small word probabilities
    # during classification does not underflow to 0.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one binary word vector with the trained NB model.

    p0Vec/p1Vec are LOG word probabilities from trainNB0 and pClass1 is
    the class-1 prior. Returns 1 (spam) if class 1 scores higher, else 0.

    Bug fix: the prior must be added in LOG space to match the log word
    probabilities; the original added the raw prior (pClass1 and
    1.0-pClass1), which skews the decision boundary.
    """
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0

# Bug fix: in the original source this import was fused onto the
# `return 0` line above, which is a syntax error. It is used by
# spamTest() below for the random train/test split.
from sklearn.model_selection import train_test_split
 # Preprocessing - tokenization (split the raw text string)
def splitWords(filename):
    bigString=open(filename,'r',encoding='utf8').read()
    import re
    regEx=re.compile('\W+')#分隔符是除字母和数字以外的长度不小于1的字符
    listOfTokens=regEx.split(bigString)#字符列表
    return [tok.lower() for tok in listOfTokens if len(tok)>2]
#训练模型以及利用交叉验证法测试模型效果
def spamTest():
    """Train the naive Bayes spam classifier on the email/ corpus and
    print the hold-out error rate (20% of the data is the test set).

    Reads email/spam/1.txt..m.txt (label 1) and email/ham/1.txt..m.txt
    (label 0); assumes both directories hold the same number of files.
    """
    # Preprocessing: collect every mail's token list and its label.
    from os import listdir
    m=len(listdir('email/spam'))# number of spam files
    #print(m)
    docList=[];classList=[]
    for i in range(1,m+1):
        wordList=splitWords('email/spam/%d.txt'%i)# tokens contained in one mail
        docList.append(wordList)
        classList.append(1)# label 1 = spam mail
        wordList=splitWords('email/ham/%d.txt'%i)# tokens contained in one mail
        docList.append(wordList)
        classList.append(0)# label 0 = normal (ham) mail
    # Build the feature attributes (vocabulary) over all mails.
    vocabList=createVocabList(docList)
    print("特征:",len(vocabList))
    # Convert every document into a binary word-presence vector.
    train_test_matrix=[]
    for WList in docList:
        print(WList)
        train_test_matrix.append(setOfWords2Vec(vocabList,WList))
    print("词汇向量:",len(train_test_matrix))
    
    # Hold-out validation: randomly split vectors/labels into train and test.
    # Usage: train_test_split(data, labels, test_size=0.2) -> 20% test set.
    x_train,x_test,y_train,y_test=train_test_split(train_test_matrix,classList,test_size=0.2)
    
    # Estimate the probabilities p0, p1 and the class prior pA.
    p0Vect,p1Vect,pAbusive=trainNB0(x_train,y_train)# word vectors, labels
    print("p0Vect:",p0Vect,"p1Vect:",p1Vect,"pAbusive:",pAbusive)
    
    # Classify each test mail with the trained NB classifier.
    i=0;errcount=0
    for testVec in x_test:
        Clabel=classifyNB(testVec,p0Vect,p1Vect,pAbusive)# label predicted by the classifier
        print(Clabel,y_test[i])
        # Count misclassifications for the error rate; on an error, list
        # which (deduplicated) vocabulary words the mail contained.
        if Clabel!=y_test[i]:
            errcount+=1
            print('去重后:',[vec for vec in vocabList if testVec[vocabList.index(vec)]!=0])
            #l=[i for i in range(50) if train_test_matrix[i] == testvec]
            #for j in l:
            #        print("没有去重后的:",docList[j])
        i+=1    
    print('错误率:',errcount/len(x_test))#len(x_test) = number of test mails
spamTest()
# 输出如下 (the output follows when run against the email/ corpus)



















