Skip to content

Commit a5a91dc

Browse files
committed
bayes
1 parent 2959a0e commit a5a91dc

File tree

1 file changed

+115
-0
lines changed

1 file changed

+115
-0
lines changed

naive_bayes/naive_bayes.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#encoding=utf-8
2+
3+
import pandas as pd
4+
import numpy as np
5+
import cv2
6+
import random
7+
import time
8+
9+
from sklearn.cross_validation import train_test_split
10+
from sklearn.metrics import accuracy_score
11+
12+
# Binarization
def binaryzation(img):
    """Threshold an image into an inverted 0/1 ``uint8`` array.

    Mirrors OpenCV's inverted binary threshold with ``thresh=50`` and
    ``maxval=1``: after casting to ``uint8``, every pixel whose value is
    <= 50 becomes 1 and every other pixel becomes 0.

    Args:
        img: numpy array of pixel intensities (any shape).

    Returns:
        A new ``uint8`` array with the same shape as ``img`` holding only
        0 and 1. ``img`` itself is not modified.
    """
    cv_img = img.astype(np.uint8)
    # The previous implementation called cv2.threshold with
    # ``cv2.cv.CV_THRESH_BINARY_INV``; the ``cv2.cv`` namespace no longer
    # exists in OpenCV 3+, so that call raises AttributeError there.  The
    # comparison below is the same rule (src > thresh -> 0, else maxval)
    # expressed in plain NumPy, with no dependency on the OpenCV version.
    return (cv_img <= 50).astype(np.uint8)
17+
18+
def Train(trainset, train_labels):
    """Fit the naive Bayes count tables from binarized training images.

    Relies on the module-level ``class_num`` and ``feature_len`` for the
    table sizes and on ``binaryzation`` to turn each image into 0/1 pixels.

    Args:
        trainset: indexable collection of images; ``trainset[i]`` is the
            image belonging to ``train_labels[i]``.
        train_labels: sequence of integer class labels used as indices
            into the tables.

    Returns:
        A tuple ``(prior_probability, conditional_probability)``:

        * ``prior_probability`` -- array of shape ``(class_num,)`` holding
          the number of training samples per class (a raw count, not a
          normalized probability).
        * ``conditional_probability`` -- array of shape
          ``(class_num, feature_len, 2)``; entry ``[c][j][v]`` is the
          fraction of class-``c`` samples whose pixel ``j`` equals ``v``,
          scaled to ``fraction * 1000000 + 1`` (range [1, 1000001]).
    """
    prior_probability = np.zeros(class_num)  # per-class sample counts
    conditional_probability = np.zeros((class_num, feature_len, 2))
    feature_index = np.arange(feature_len)

    # Accumulate the class counts and the per-pixel value counts.
    for i, label in enumerate(train_labels):
        img = binaryzation(trainset[i])  # pixels become 0 or 1

        prior_probability[label] += 1

        # One vectorized update replaces the per-pixel Python loop: for each
        # pixel position j, bump the counter of the value observed there.
        # Positions are unique, so the in-place add counts each pixel once.
        conditional_probability[label, feature_index, img[:feature_len]] += 1

    # Rescale the counts into [1, 1000001].  Consumers truncate these values
    # with int() and multiply them together, so the 1000000 factor keeps
    # precision and the +1 keeps any single factor from zeroing the product.
    totals = conditional_probability.sum(axis=2, keepdims=True)
    # A class with no training samples has a zero total; dividing by 1
    # instead avoids the division by zero and leaves both entries at the
    # floor value 1.
    safe_totals = np.where(totals == 0, 1, totals)
    conditional_probability = (
        conditional_probability / safe_totals) * 1000000 + 1

    return prior_probability, conditional_probability
48+
49+
# Compute the class score
def calculate_probability(img, label):
    """Return the unnormalized integer score of ``img`` for class ``label``.

    Reads the module-level ``prior_probability`` and
    ``conditional_probability`` tables.  The score is the integer-truncated
    prior entry for ``label`` multiplied by the integer-truncated
    conditional entry selected by each pixel value of ``img``.

    Args:
        img: sequence of pixel values used as indices into the last axis of
            the conditional table.
        label: class index.

    Returns:
        The product as a Python integer (arbitrary precision, so the long
        product does not overflow).
    """
    class_table = conditional_probability[label]
    score = int(prior_probability[label])

    for position, pixel in enumerate(img):
        score = score * int(class_table[position][pixel])

    return score
57+
58+
def _class_score(img, label, prior_probability, conditional_probability):
    """Return the unnormalized integer score of ``img`` for class ``label``."""
    score = int(prior_probability[label])
    class_table = conditional_probability[label]

    for position, pixel in enumerate(img):
        # Exact big-integer arithmetic: the table entries are truncated to
        # ints so the long product neither underflows nor overflows.
        score *= int(class_table[position][pixel])

    return score


def Predict(testset, prior_probability, conditional_probability):
    """Predict a class label for every image in ``testset``.

    Each image is binarized with ``binaryzation`` and scored against every
    class using the tables passed in as arguments; the class with the
    highest score wins, with ties going to the lowest class index.

    Args:
        testset: iterable of images.
        prior_probability: per-class table indexed by class label; its
            length determines how many classes are scored.
        conditional_probability: table indexed as
            ``[label][pixel_position][pixel_value]``.

    Returns:
        numpy array with one predicted class index per image, in input
        order.
    """
    predict = []
    # Score against the tables actually supplied instead of a hard-coded
    # ten classes and module-level globals, so the arguments are honoured.
    class_count = len(prior_probability)

    for img in testset:
        # Binarize the image so its pixels can index the conditional table.
        img = binaryzation(img)

        max_label = 0
        max_probability = _class_score(
            img, 0, prior_probability, conditional_probability)

        for label in range(1, class_count):
            probability = _class_score(
                img, label, prior_probability, conditional_probability)

            # Strict comparison keeps the earliest class on ties.
            if max_probability < probability:
                max_label = label
                max_probability = probability

        predict.append(max_label)

    return np.array(predict)
79+
80+
81+
# Table sizes shared by Train: number of classes and pixels per image.
# NOTE(review): presumably the ten digit classes and 28x28 images of the
# train.csv loaded below -- confirm against the data file.
class_num = 10
feature_len = 784

if __name__ == '__main__':
    # Every print below passes one pre-formatted string in parentheses.
    # That form produces the same output as the former Python 2 print
    # statements (including their spacing) and is also valid on Python 3.

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    # Column 0 holds the label; the remaining columns are the pixels.
    imgs = data[0::, 1::]
    labels = data[::, 0]

    # Use roughly 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = \
        train_test_split(imgs, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n' % (time_2 - time_1))

    print('Start training')
    prior_probability, conditional_probability = Train(
        train_features, train_labels)
    time_3 = time.time()
    print('training cost  %s  second \n' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(
        test_features, prior_probability, conditional_probability)
    time_4 = time.time()
    print('predicting cost  %s  second \n' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accruacy socre is  %s' % score)

0 commit comments

Comments
 (0)