1
+ #encoding=utf-8
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import cv2
6
+ import random
7
+ import time
8
+
9
+ from sklearn .cross_validation import train_test_split
10
+ from sklearn .metrics import accuracy_score
11
+
12
# Binarization
def binaryzation(img):
    """Return an inverted binary copy of ``img``.

    The input is first cast to ``uint8``. Every element whose value is
    greater than 50 becomes 0 and every other element becomes 1, which is
    what ``cv2.threshold(..., 50, 1, THRESH_BINARY_INV)`` produces.

    The comparison is done with NumPy instead of OpenCV because the
    original constant lived in the legacy ``cv2.cv`` namespace, which no
    longer exists in OpenCV 3 and later.

    Args:
        img: NumPy array of pixel intensities.

    Returns:
        A new ``uint8`` array with the same shape as ``img`` holding only
        0 and 1. ``img`` itself is not modified.
    """
    cv_img = img.astype(np.uint8)
    return (cv_img <= 50).astype(np.uint8)
17
+
18
def Train(trainset, train_labels):
    """Estimate the naive Bayes tables from the training samples.

    Each sample is binarized with ``binaryzation`` and counted per class.
    The per-pixel counts are then turned into scaled relative frequencies
    ``count / total * 1000000 + 1`` so that every entry lies in
    [1, 1000001] and can later be truncated to a non-zero integer.

    Args:
        trainset: sequence of flattened images; the first ``feature_len``
            values of each image are used.
        train_labels: sequence of integer class labels in
            ``range(class_num)``, aligned with ``trainset``.

    Returns:
        A tuple ``(prior_probability, conditional_probability)``:
        ``prior_probability`` has shape ``(class_num,)`` and holds the raw
        number of samples per class (it is not normalised);
        ``conditional_probability`` has shape ``(class_num, feature_len, 2)``
        and holds the scaled frequencies of pixel values 0 and 1.
    """
    prior_probability = np.zeros(class_num)  # per-class sample counts
    conditional_probability = np.zeros((class_num, feature_len, 2))  # pixel-value counts

    # Count class occurrences and, for every pixel, how often it is 0 or 1.
    pixel_index = np.arange(feature_len)
    for features, label in zip(trainset, train_labels):
        img = binaryzation(features)  # binarize the image

        prior_probability[label] += 1

        # Every (pixel, value) pair occurs once per image, so the indexed
        # in-place add increments each selected cell exactly once.
        conditional_probability[label, pixel_index, img[:feature_len]] += 1

    # After binarization a pixel is either 0 or 1, so the two counts of a
    # pixel add up to the number of samples seen for that class.
    totals = conditional_probability.sum(axis=2, keepdims=True)

    # A class without any training sample has a total of 0; divide by 1
    # there instead of raising on 0/0. Its entries end up at the minimum
    # value 1, and its prior count of 0 already zeroes its score.
    safe_totals = np.where(totals == 0, 1.0, totals)

    # Scale the relative frequencies into [1, 1000001].
    conditional_probability = conditional_probability / safe_totals * 1000000 + 1

    return prior_probability, conditional_probability
48
+
49
# Compute the probability score
def calculate_probability(img, label):
    """Return the unnormalised integer score of ``img`` for class ``label``.

    The score is the integer-truncated ``prior_probability[label]``
    multiplied by the integer-truncated ``conditional_probability`` entry of
    every pixel of ``img``. Both tables are read from module scope. Python
    integers are used so that the long product neither overflows nor
    underflows.

    Args:
        img: binarized image, a sequence of 0/1 pixel values.
        label: class index to score.

    Returns:
        The score as a Python integer; only its ordering across labels is
        meaningful.
    """
    score = int(prior_probability[label])

    for pixel_index, pixel_value in enumerate(img):
        factor = conditional_probability[label][pixel_index][pixel_value]
        score = score * int(factor)

    return score
57
+
58
def _class_score(img, label, prior_probability, conditional_probability):
    """Return the unnormalised integer score of ``img`` for class ``label``."""
    score = int(prior_probability[label])
    class_table = conditional_probability[label]
    for pixel_index, pixel_value in enumerate(img):
        score *= int(class_table[pixel_index][pixel_value])
    return score


def Predict(testset, prior_probability, conditional_probability):
    """Predict a class label for every image in ``testset``.

    Each image is binarized with ``binaryzation`` and scored against every
    class using the tables passed as arguments; the class with the highest
    score is chosen. When several classes tie, the lowest label wins.

    Args:
        testset: iterable of flattened images.
        prior_probability: per-class counts as returned by ``Train``; its
            length determines how many classes are scored.
        conditional_probability: scaled per-pixel frequency table as
            returned by ``Train``.

    Returns:
        A NumPy array with one predicted label per image.
    """
    num_classes = len(prior_probability)
    predictions = []

    for raw_img in testset:
        # Binarize the image
        img = binaryzation(raw_img)

        best_label = 0
        best_score = _class_score(
            img, 0, prior_probability, conditional_probability)

        for label in range(1, num_classes):
            score = _class_score(
                img, label, prior_probability, conditional_probability)

            # Strict comparison keeps the earliest label on ties.
            if best_score < score:
                best_label = label
                best_score = score

        predictions.append(best_label)

    return np.array(predictions)
79
+
80
+
81
# Number of classes and number of pixel features per flattened image
# (presumably 28x28 digit images -- confirm against the data set).
class_num = 10
feature_len = 784

if __name__ == '__main__':
    # Every message is emitted through a single-argument print(...) call so
    # the script parses on Python 3 while printing the same text as the
    # former Python 2 print statements (including their separating spaces).
    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    # Column 0 holds the label, the remaining columns hold the pixel values.
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Use about 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n ' % (time_2 - time_1))

    print('Start training')
    prior_probability, conditional_probability = Train(train_features, train_labels)
    time_3 = time.time()
    print('training cost  %s  second \n ' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features, prior_probability, conditional_probability)
    time_4 = time.time()
    print('predicting cost  %s  second \n ' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accruacy socre is  %s' % score)
0 commit comments