-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathbayes.py
More file actions
90 lines (76 loc) · 2.44 KB
/
bayes.py
File metadata and controls
90 lines (76 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from numpy import *
# Naive Bayes
def loadData():
    """Load the spam/ham e-mail corpus and encode it as word-count vectors.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` and the matching
    ``email/ham/<n>.txt`` files (paths are relative to the current working
    directory), tokenizes every message, builds one vocabulary over all
    messages, and encodes each message as a list of per-word occurrence
    counts.

    Returns:
        tuple: ``(trainmat, trainclasses)``. ``trainmat`` is a list with one
        count vector per message; its columns follow the vocabulary in
        sorted order, so the encoding is reproducible between runs.
        ``trainclasses`` is the parallel list of labels (1 = spam, 0 = ham).
        Messages are stored alternating spam, ham, spam, ham, ...

    Raises:
        OSError: if one of the expected e-mail files cannot be opened.
    """
    import re

    def text_parse(big_string):
        """Split on non-word characters; keep lowercased tokens longer than 2 chars."""
        # Raw string: '\W' inside a plain literal is an invalid escape sequence.
        return [tok.lower() for tok in re.split(r'\W+', big_string) if len(tok) > 2]

    def read_tokens(path):
        """Read one message and tokenize it, closing the file afterwards."""
        with open(path) as handle:
            return text_parse(handle.read())

    def words_to_vec(word_index, words):
        """Count how often each vocabulary word occurs in ``words``."""
        vec = [0] * len(word_index)
        for word in words:
            col = word_index.get(word)
            if col is not None:
                vec[col] += 1
        return vec

    doclist = []
    classlist = []
    for i in range(1, 26):
        doclist.append(read_tokens('email/spam/%d.txt' % i))
        classlist.append(1)
        doclist.append(read_tokens('email/ham/%d.txt' % i))
        classlist.append(0)

    # Sorting fixes the column order; a bare list(set) order changes from run
    # to run because of string hash randomization.
    vocablist = sorted(set().union(*doclist))
    # word -> column lookup table replaces a linear list.index() scan per token.
    word_index = {word: col for col, word in enumerate(vocablist)}

    trainmat = [words_to_vec(word_index, doc) for doc in doclist]
    return trainmat, classlist
def trainBNO(trainmatrix, traincategory):
    """Fit naive Bayes word log-probabilities from labelled count vectors.

    Args:
        trainmatrix: sequence of equal-length count vectors, one per document.
        traincategory: sequence of labels parallel to ``trainmatrix``; a label
            equal to 1 selects class 1, anything else is treated as class 0.

    Returns:
        tuple: ``(p0vect, p1vect, prior1)`` where ``p0vect`` / ``p1vect`` are
        arrays of log word probabilities for class 0 / class 1 and ``prior1``
        is the fraction ``sum(traincategory) / len(trainmatrix)``.
    """
    doc_total = len(trainmatrix)
    vocab_size = len(trainmatrix[0])
    prior1 = sum(traincategory) / float(doc_total)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1. Word counts
    # start at 1 and the totals at 2.0, so no word ends up with log(0).
    word_counts = [ones(vocab_size), ones(vocab_size)]
    totals = [2.0, 2.0]
    for idx, row in enumerate(trainmatrix):
        cls = 1 if traincategory[idx] == 1 else 0
        word_counts[cls] += row
        totals[cls] += sum(row)

    log_p1 = log(word_counts[1] / totals[1])
    log_p0 = log(word_counts[0] / totals[0])
    return log_p0, log_p1, prior1
def classifynb(vec2classify, p0vec, p1vec, pclass1):
    """Pick the more likely class for one count vector.

    Args:
        vec2classify: array of word counts for the document to classify.
        p0vec: per-word log probabilities for class 0.
        p1vec: per-word log probabilities for class 1.
        pclass1: prior probability of class 1.

    Returns:
        int: 1 when the class-1 log score is strictly greater than the
        class-0 log score, otherwise 0 (ties go to class 0).
    """
    evidence1 = sum(vec2classify * p1vec)
    evidence0 = sum(vec2classify * p0vec)
    score1 = evidence1 + log(pclass1)
    score0 = evidence0 + log(1.0 - pclass1)
    return 1 if score1 > score0 else 0
def spamtest(num_test=10):
    """Train on the corpus minus a hold-out prefix and print the error rate.

    The first ``num_test`` documents returned by ``loadData`` are held out for
    evaluation; the classifier is trained on the remaining documents. The
    hold-out size is used for the training slice, the evaluation loop and the
    error-rate denominator alike, so the split stays consistent when the
    corpus size is not exactly 50.

    Args:
        num_test: number of leading documents to hold out (default 10, which
            reproduces the original split on the 50-document corpus).

    Raises:
        ValueError: if ``num_test`` does not leave at least one document on
            each side of the split.
    """
    trainingset, traininglabels = loadData()
    if not 0 < num_test < len(trainingset):
        raise ValueError(
            "num_test must be between 1 and %d, got %r"
            % (len(trainingset) - 1, num_test)
        )

    p0v, p1v, pab = trainBNO(
        array(trainingset[num_test:]), array(traininglabels[num_test:])
    )

    errorCount = 0
    for vec, label in zip(trainingset[:num_test], traininglabels[:num_test]):
        if classifynb(array(vec), p0v, p1v, pab) != label:
            errorCount += 1
    print("the error rate is : ", float(errorCount) / num_test)
# Run the spam-classification demo only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    spamtest()