Machine Learning in Action, Chapter 4: Naive Bayes

Author: 异想派 | Published 2017-03-18 17:52
  • Building word vectors from text
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1   # set-of-words model: record presence only
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

if __name__=='__main__':
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)
    a=setofwords2vec(myvocablist,listoposts[3])
    print(myvocablist)   # word order varies between runs because sets are unordered
    print(a)
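As a quick sanity check, here is a minimal usage sketch with a small hand-picked vocabulary (hypothetical, not from the book) so that the output is deterministic:

vocab=['cute','garbage','love','stupid']
print(setofwords2vec(vocab,['love','my','dalmation']))
# prints a warning for 'my' and for 'dalmation', then [0, 0, 1, 0]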

  • Computing probabilities from word vectors
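The function trainnb0 below estimates the quantities in Bayes' rule, which for a word vector w and class c_i reads

    p(c_i | w) = p(w | c_i) p(c_i) / p(w)

The "naive" conditional-independence assumption factors p(w | c_i) into a product of per-word probabilities p(w_0 | c_i) p(w_1 | c_i) ... p(w_N | c_i), so training reduces to counting: how often each word occurs within each class (p0vect, p1vect) and how often each class occurs overall (pabusive).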
from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1   # set-of-words model: record presence only
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   # number of training documents
    numwords=len(trainmatrix[0])    # vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   # prior probability p(c=1)
    p0num=zeros(numwords)
    p1num=zeros(numwords)
    p0denom=0.0 ; p1denom=0.0
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]           # per-word counts in abusive documents
            p1denom+=sum(trainmatrix[i])    # total words in abusive documents
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=p1num/p1denom   # p(w|c=1) for every word in the vocabulary
    p0vect=p0num/p0denom   # p(w|c=0)
    return p0vect,p1vect,pabusive

if __name__=='__main__':
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   # unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc))   # 0/1 vector per document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    print(myvocablist)
    print(trainmat)
    print(pab)
    print(p0v)
    print(p1v)
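Two practical problems motivate the changes in the next section. First, if a word never occurs in one class, its estimated probability is 0 and it wipes out the whole product; initializing the counts to 1 and the denominators to 2 (Laplace smoothing) prevents this. Second, multiplying many small probabilities underflows to 0.0 in floating point, which taking logarithms avoids. A minimal demonstration of the underflow:

from numpy import *
probs=full(200,0.01)      # 200 small per-word probabilities
print(prod(probs))        # 0.0: the product underflows
print(sum(log(probs)))    # about -921.0: finite and still comparable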

  • Testing the algorithm and modifying the classifier for real-world conditions
'''
Created on March, 2017

@author: yang
'''

from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1   # set-of-words model: record presence only
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   # number of training documents
    numwords=len(trainmatrix[0])    # vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   # prior probability p(c=1)
    p0num=ones(numwords)            # initialize counts to 1 ...
    p1num=ones(numwords)
    p0denom=2.0 ; p1denom=2.0       # ... and denominators to 2 (Laplace smoothing)
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=log(p1num/p1denom)   # log probabilities avoid floating-point underflow
    p0vect=log(p0num/p0denom)
    return p0vect,p1vect,pabusive

def classifynb(vec2classify,p0vec,p1vec,pclass1):
    p1=sum(vec2classify*p1vec)+log(pclass1)     # log p(w|c=1) + log p(c=1)
    p0=sum(vec2classify*p0vec)+log(1-pclass1)   # log p(w|c=0) + log p(c=0)
    if p1>p0:
        return 1
    else:
        return 0

def testingnb():
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   # unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc))   # 0/1 vector per document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    testentry=['love','my','dalmation']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))
    testentry=['stupid','garbage']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))

if __name__=='__main__':
    testingnb()
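Note that classifynb compares the two posteriors entirely in log space. Since log(a*b) = log(a) + log(b), the product form of naive Bayes becomes a sum:

    log p(c_1 | w) ∝ sum_j( w_j * log p(w_j | c_1) ) + log p(c_1)

which is exactly sum(vec2classify*p1vec)+log(pclass1). The shared denominator p(w) is the same for both classes, so it drops out of the comparison.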

  • Testing the algorithm: cross-validation with naive Bayes
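This last script parses raw email text into token lists and evaluates the classifier with a random hold-out split over 25 spam and 25 ham messages. The tokenizer splits on runs of non-word characters and keeps lowercased tokens longer than two characters; a quick illustration of the regex on a made-up string:

import re
tokens=re.split(r'\W+','Hi Peter, visit http://example.com NOW!!')
print([tok.lower() for tok in tokens if len(tok)>2])
# ['peter', 'visit', 'http', 'example', 'com', 'now']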
from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]+=1   # bag-of-words model: count occurrences (name kept from the set-of-words version)
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   # number of training documents
    numwords=len(trainmatrix[0])    # vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   # prior probability p(c=1)
    p0num=ones(numwords)            # initialize counts to 1 ...
    p1num=ones(numwords)
    p0denom=2.0 ; p1denom=2.0       # ... and denominators to 2 (Laplace smoothing)
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=log(p1num/p1denom)   # log probabilities avoid floating-point underflow
    p0vect=log(p0num/p0denom)
    return p0vect,p1vect,pabusive

def classifynb(vec2classify,p0vec,p1vec,pclass1):
    p1=sum(vec2classify*p1vec)+log(pclass1)     # log p(w|c=1) + log p(c=1)
    p0=sum(vec2classify*p0vec)+log(1-pclass1)   # log p(w|c=0) + log p(c=0)
    if p1>p0:
        return 1
    else:
        return 0

def testingnb():
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   # unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc))   # word-count vector per document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    testentry=['love','my','dalmation']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))
    testentry=['stupid','garbage']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))

def textparse(bigstring):
    import re
    listoftokens=re.split(r'\W+',bigstring)   # split on runs of non-word characters ('\W*' would also split on empty matches)
    return [tok.lower() for tok in listoftokens if len(tok)>2]   # drop very short tokens, lowercase the rest

def spamtest():
    doclist=[]
    classlist=[]
    fulltext=[]
    for i in range(1,26):   # load 25 spam and 25 ham emails
        wordlist=textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)   # 1 = spam
        wordlist=textparse(open("/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/ham/%d.txt" % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)   # 0 = ham
    vocablist=createvocablist(doclist)
    trainingset=list(range(50))   # a list, so indices can be deleted below
    testset=[]
    for i in range(10):   # hold out 10 emails at random for testing
        randindex=int(random.uniform(0,len(trainingset)))   # numpy's random.uniform
        testset.append(trainingset[randindex])
        del trainingset[randindex]
    trainmat=[]
    trainclasses=[]
    for docindex in trainingset:   # train on the remaining 40 emails
        trainmat.append(setofwords2vec(vocablist,doclist[docindex]))
        trainclasses.append(classlist[docindex])
    p0v,p1v,pspam=trainnb0(array(trainmat),array(trainclasses))
    errorcount=0
    for docindex in testset:   # classify the held-out emails
        wordvector=setofwords2vec(vocablist,doclist[docindex])
        if classifynb(array(wordvector),p0v,p1v,pspam)!=classlist[docindex]:
            errorcount+=1
    print('the error rate is: ',float(errorcount)/len(testset))
    return float(errorcount)/len(testset)   # returned so callers can average over runs

if __name__=='__main__':
    spamtest()
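Because the 10 test emails are drawn at random, the printed error rate fluctuates from run to run. A small sketch (using the return value added to spamtest above) averages over ten random splits for a more stable estimate, e.g. by replacing the call in the main block with:

    rates=[spamtest() for _ in range(10)]
    print('mean error rate:',sum(rates)/len(rates))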
