Machine Learning in Action, Chapter 4: Naive Bayes

Author: 异想派 | Published 2017-03-18 17:52
  • Building word vectors from text
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1   # set-of-words model: record presence only
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

if __name__=='__main__':
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)
    a=setofwords2vec(myvocablist,listoposts[3])
    print(myvocablist)   # word order varies between runs because sets are unordered
    print(a)
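As a quick sanity check, here is a minimal usage sketch with a small hand-picked vocabulary (hypothetical, not from the book) so that the output is deterministic:

vocab=['cute','garbage','love','stupid']
print(setofwords2vec(vocab,['love','my','dalmation']))
# prints a warning for 'my' and for 'dalmation', then [0, 0, 1, 0]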

  • Computing probabilities from word vectors
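The function trainnb0 below estimates the quantities in Bayes' rule, which for a word vector w and class c_i reads

    p(c_i | w) = p(w | c_i) p(c_i) / p(w)

The "naive" conditional-independence assumption factors p(w | c_i) into a product of per-word probabilities p(w_0 | c_i) p(w_1 | c_i) ... p(w_N | c_i), so training reduces to counting: how often each word occurs within each class (p0vect, p1vect) and how often each class occurs overall (pabusive).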
from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1   # set-of-words model: record presence only
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   # number of training documents
    numwords=len(trainmatrix[0])    # vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   # prior probability p(c=1)
    p0num=zeros(numwords)
    p1num=zeros(numwords)
    p0denom=0.0 ; p1denom=0.0
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]           # per-word counts in abusive documents
            p1denom+=sum(trainmatrix[i])    # total words in abusive documents
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=p1num/p1denom   # p(w|c=1) for every word in the vocabulary
    p0vect=p0num/p0denom   # p(w|c=0)
    return p0vect,p1vect,pabusive

if __name__=='__main__':
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   # unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc))   # 0/1 vector per document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    print(myvocablist)
    print(trainmat)
    print(pab)
    print(p0v)
    print(p1v)
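Two practical problems motivate the changes in the next section. First, if a word never occurs in one class, its estimated probability is 0 and it wipes out the whole product; initializing the counts to 1 and the denominators to 2 (Laplace smoothing) prevents this. Second, multiplying many small probabilities underflows to 0.0 in floating point, which taking logarithms avoids. A minimal demonstration of the underflow:

from numpy import *
probs=full(200,0.01)      # 200 small per-word probabilities
print(prod(probs))        # 0.0: the product underflows
print(sum(log(probs)))    # about -921.0: finite and still comparable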

  • Testing the algorithm and modifying the classifier for real-world conditions
'''
Created on March, 2017

@author: yang
'''

from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]=1   # set-of-words model: record presence only
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   # number of training documents
    numwords=len(trainmatrix[0])    # vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   # prior probability p(c=1)
    p0num=ones(numwords)            # initialize counts to 1 ...
    p1num=ones(numwords)
    p0denom=2.0 ; p1denom=2.0       # ... and denominators to 2 (Laplace smoothing)
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=log(p1num/p1denom)   # log probabilities avoid floating-point underflow
    p0vect=log(p0num/p0denom)
    return p0vect,p1vect,pabusive

def classifynb(vec2classify,p0vec,p1vec,pclass1):
    p1=sum(vec2classify*p1vec)+log(pclass1)     # log p(w|c=1) + log p(c=1)
    p0=sum(vec2classify*p0vec)+log(1-pclass1)   # log p(w|c=0) + log p(c=0)
    if p1>p0:
        return 1
    else:
        return 0

def testingnb():
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   # unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc))   # 0/1 vector per document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    testentry=['love','my','dalmation']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))
    testentry=['stupid','garbage']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))

if __name__=='__main__':
    testingnb()
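Note that classifynb compares the two posteriors entirely in log space. Since log(a*b) = log(a) + log(b), the product form of naive Bayes becomes a sum:

    log p(c_1 | w) ∝ sum_j( w_j * log p(w_j | c_1) ) + log p(c_1)

which is exactly sum(vec2classify*p1vec)+log(pclass1). The shared denominator p(w) is the same for both classes, so it drops out of the comparison.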

  • Testing the algorithm: cross-validation with naive Bayes
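This last script parses raw email text into token lists and evaluates the classifier with a random hold-out split over 25 spam and 25 ham messages. The tokenizer splits on runs of non-word characters and keeps lowercased tokens longer than two characters; a quick illustration of the regex on a made-up string:

import re
tokens=re.split(r'\W+','Hi Peter, visit http://example.com NOW!!')
print([tok.lower() for tok in tokens if len(tok)>2])
# ['peter', 'visit', 'http', 'example', 'com', 'now']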
from numpy import *
def loaddataset():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec=[0,1,0,1,0,1]   # 1 = abusive post, 0 = not abusive
    return postinglist,classvec

def createvocablist(dataset):
    vocabset=set()
    for document in dataset:
        vocabset=vocabset|set(document)   # union of the two sets
    return list(vocabset)

def setofwords2vec(vocablist,inputset):
    returnvec=[0]*len(vocablist)   # create a vector of all zeros
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)]+=1   # bag-of-words model: count occurrences (name kept from the set-of-words version)
        else:
            print("the word: %s is not in my vocabulary" % word)
    return returnvec

def trainnb0(trainmatrix,traincategory):
    numtraindocs=len(trainmatrix)   # number of training documents
    numwords=len(trainmatrix[0])    # vocabulary size (length of each word vector)
    pabusive=sum(traincategory)/float(numtraindocs)   # prior probability p(c=1)
    p0num=ones(numwords)            # initialize counts to 1 ...
    p1num=ones(numwords)
    p0denom=2.0 ; p1denom=2.0       # ... and denominators to 2 (Laplace smoothing)
    for i in range(numtraindocs):
        if traincategory[i]==1:
            p1num+=trainmatrix[i]
            p1denom+=sum(trainmatrix[i])
        else:
            p0num+=trainmatrix[i]
            p0denom+=sum(trainmatrix[i])
    p1vect=log(p1num/p1denom)   # log probabilities avoid floating-point underflow
    p0vect=log(p0num/p0denom)
    return p0vect,p1vect,pabusive

def classifynb(vec2classify,p0vec,p1vec,pclass1):
    p1=sum(vec2classify*p1vec)+log(pclass1)     # log p(w|c=1) + log p(c=1)
    p0=sum(vec2classify*p0vec)+log(1-pclass1)   # log p(w|c=0) + log p(c=0)
    if p1>p0:
        return 1
    else:
        return 0

def testingnb():
    listoposts,listclasses=loaddataset()   # unpack the posts and their class labels
    myvocablist=createvocablist(listoposts)   # unique words across all documents
    trainmat=[]
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist,postindoc))   # word-count vector per document
    p0v,p1v,pab=trainnb0(trainmat,listclasses)
    testentry=['love','my','dalmation']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))
    testentry=['stupid','garbage']
    thisdoc=array(setofwords2vec(myvocablist,testentry))
    print(testentry,'classified as:',classifynb(thisdoc,p0v,p1v,pab))

def textparse(bigstring):
    import re
    listoftokens=re.split(r'\W+',bigstring)   # split on runs of non-word characters ('\W*' would also split on empty matches)
    return [tok.lower() for tok in listoftokens if len(tok)>2]   # drop very short tokens, lowercase the rest

def spamtest():
    doclist=[]
    classlist=[]
    fulltext=[]
    for i in range(1,26):   # load 25 spam and 25 ham emails
        wordlist=textparse(open('/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)   # 1 = spam
        wordlist=textparse(open("/Users/enniu/Desktop/jqxx/machinelearninginaction/Ch04/email/ham/%d.txt" % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)   # 0 = ham
    vocablist=createvocablist(doclist)
    trainingset=list(range(50))   # a list, so indices can be deleted below
    testset=[]
    for i in range(10):   # hold out 10 emails at random for testing
        randindex=int(random.uniform(0,len(trainingset)))   # numpy's random.uniform
        testset.append(trainingset[randindex])
        del trainingset[randindex]
    trainmat=[]
    trainclasses=[]
    for docindex in trainingset:   # train on the remaining 40 emails
        trainmat.append(setofwords2vec(vocablist,doclist[docindex]))
        trainclasses.append(classlist[docindex])
    p0v,p1v,pspam=trainnb0(array(trainmat),array(trainclasses))
    errorcount=0
    for docindex in testset:   # classify the held-out emails
        wordvector=setofwords2vec(vocablist,doclist[docindex])
        if classifynb(array(wordvector),p0v,p1v,pspam)!=classlist[docindex]:
            errorcount+=1
    print('the error rate is: ',float(errorcount)/len(testset))
    return float(errorcount)/len(testset)   # returned so callers can average over runs

if __name__=='__main__':
    spamtest()
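Because the 10 test emails are drawn at random, the printed error rate fluctuates from run to run. A small sketch (using the return value added to spamtest above) averages over ten random splits for a more stable estimate, e.g. by replacing the call in the main block with:

    rates=[spamtest() for _ in range(10)]
    print('mean error rate:',sum(rates)/len(rates))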
