# @Date: 05-11-16
# @Email: wendesi@foxmail.com
# @Last modified by: WenDesi
- # @Last modified time: 06-11-16
+ # @Last modified time: 09-11-16


+ import pandas as pd
+ import numpy as np
+
+ import time
import math
import random

from collections import defaultdict

+ from sklearn.cross_validation import train_test_split
+ from sklearn.metrics import accuracy_score
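# Note: sklearn.cross_validation matches the scikit-learn versions this script was
# written against; on scikit-learn 0.18+ that module is deprecated (and later removed),
# and the same helper is imported as
#     from sklearn.model_selection import train_test_split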


class MaxEnt(object):
@@ -21,9 +27,10 @@ def init_params(self, X, Y):

        self.cal_Pxy_Px(X, Y)

-        self.N = len(X)
-        self.n = len(self.Pxy)
-        self.M = 2.0
+        self.N = len(X)          # size of the training set
+        self.n = len(self.Pxy)   # number of (x, y) pairs (the n in the book)
+        self.M = 10000.0         # the M from page 91 of the book, though in practice that exact value is not used
+                                 # it can be treated as a learning rate

        self.build_dict()
        self.cal_EPxy()
@@ -49,29 +56,37 @@ def cal_Pxy_Px(self, X, Y):
            self.Px[x] += 1

    def cal_EPxy(self):
+        '''
+        Compute the expectation at the bottom of page 82 of the book,
+        i.e. the expectation of each feature under the empirical distribution P~(X, Y)
+        '''
        self.EPxy = defaultdict(float)
        for id in xrange(self.n):
            (x, y) = self.id2xy[id]
            self.EPxy[id] = float(self.Pxy[(x, y)]) / float(self.N)

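    # For reference, the expectation the docstring above points to is the empirical
    # expectation of each feature f_i under P~(X, Y):
    #     E_P~(f_i) = sum over (x, y) of P~(x, y) * f_i(x, y)
    # Because each f_i here is a 0/1 indicator of a single (x, y) pair, this reduces
    # to count(x, y) / N, which is exactly Pxy[(x, y)] / N above.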
-    def cal_pyx(self,X, y):
+    def cal_pyx(self, X, y):
        result = 0.0
        for x in X:
            if self.fxy(x, y):
                id = self.xy2id[(x, y)]
                result += self.w[id]
-        return (math.exp(result),y)
+        return (math.exp(result), y)

    def cal_probality(self, X):
-        Pyxs = [(self.cal_pyx(X,y)) for y in self.Y_]
+        '''
+        Compute formula 6.22 on page 85 of the book
+        '''
+        Pyxs = [(self.cal_pyx(X, y)) for y in self.Y_]
        Z = sum([prob for prob, y in Pyxs])
-        return [(prob / Z,y) for prob,y in Pyxs]
-
+        return [(prob / Z, y) for prob, y in Pyxs]

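    # For reference, the conditional model that cal_probality evaluates (the formula
    # 6.22 the docstring points to) is
    #     P_w(y | x) = exp( sum_i w_i * f_i(x, y) ) / Z_w(x)
    # where Z_w(x) = sum over y of exp( sum_i w_i * f_i(x, y) ).
    # cal_pyx returns the unnormalized numerator for one y, and Z above normalizes
    # over all labels in self.Y_.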

    def cal_EPx(self):
+        '''
+        Compute the expectation at the top of page 83 of the book,
+        i.e. the expectation of each feature under P~(x) and the model P(y|x)
+        '''
        self.EPx = [0.0 for i in xrange(self.n)]

-        for i,X in enumerate(self.X_):
+        for i, X in enumerate(self.X_):
            Pyxs = self.cal_probality(X)

            for x in X:
@@ -98,8 +113,8 @@ def train(self, X, Y):
                sigma = 1 / self.M * math.log(self.EPxy[i] / self.EPx[i])
                sigmas.append(sigma)

-            if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
-                break
+            # if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
+            #     break

            self.w = [self.w[i] + sigmas[i] for i in xrange(self.n)]

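    # A sketch of what the update above amounts to (reading it together with the
    # page-91 reference in init_params; the interpretation as a GIS-style step is an
    # assumption, since M is simply treated as a fixed constant here):
    #     delta_i = (1 / M) * log( E_P~(f_i) / E_P(f_i) )
    # where E_P~(f_i) is the empirical expectation from cal_EPxy and E_P(f_i) is the
    # model expectation from cal_EPx,
    #     E_P(f_i) = sum over x of P~(x) * sum over y of P_w(y | x) * f_i(x, y).
    # A smaller M gives larger steps, which is why the init_params comment calls M
    # a learning rate.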
@@ -111,67 +126,53 @@ def predict(self, testset):
        return results


-def build_dataset(label, original_posins, radius, size):
-    datasets = []
-    dim = len(original_posins)
-
-    for i in xrange(size):
-        dataset = [label]
-        for j in xrange(dim):
-            point = random.randint(0, 2 * radius) - radius + original_posins[j]
-            dataset.append(point)
-        datasets.append(dataset)
-
-    return datasets
-
-
-
def rebuild_features(features):
+    '''
+    Turn each original feature vector (a0, a1, a2, a3, a4, ...)
+    into the form (0_a0, 1_a1, 2_a2, 3_a3, 4_a4, ...)
+    '''
    new_features = []
    for feature in features:
        new_feature = []
-        for i,f in enumerate(feature):
-            new_feature.append(str(i)+'_'+str(f))
+        for i, f in enumerate(feature):
+            new_feature.append(str(i) + '_' + str(f))
        new_features.append(new_feature)
    return new_features

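# A quick illustration of rebuild_features (input values made up for the example):
#     rebuild_features([[5, 0, 13]])  ->  [['0_5', '1_0', '2_13']]
# Tagging each value with its position index keeps equal values in different columns
# distinct when they are later used as (feature, label) keys by MaxEnt.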
-
-
-
-
if __name__ == "__main__":

-    # build the training set
-    trainset1 = build_dataset(0, [0, 0], 10, 100)
-    trainset2 = build_dataset(1, [30, 30], 10, 100)
+    print 'Start read data'

-    trainset = trainset1
-    trainset.extend(trainset2)
-    random.shuffle(trainset)
+    time_1 = time.time()

-    trainset_features = rebuild_features(map(lambda x: x[1:], trainset))
-    trainset_labels = map(lambda x: x[0], trainset)
+    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
+    data = raw_data.values

-    # train
-    met = MaxEnt()
-    met.train(trainset_features, trainset_labels)
+    imgs = data[0::, 1::]
+    labels = data[::, 0]

-    # build the test set
-    testset1 = build_dataset(0, [0, 0], 15, 500)
-    testset2 = build_dataset(1, [30, 30], 15, 500)
+    # use 2/3 of the data as the training set and 1/3 as the test set
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        imgs, labels, test_size=0.33, random_state=23323)

-    testset = testset1
-    testset.extend(testset2)
-    random.shuffle(testset)
+    train_features = rebuild_features(train_features)
+    test_features = rebuild_features(test_features)

-    testset_features = rebuild_features(map(lambda x: x[1:], testset))
-    testset_labels = map(lambda x: x[0], testset)
+    time_2 = time.time()
+    print 'read data cost ', time_2 - time_1, ' second', '\n'

-    # test
-    testset_predicts = met.predict(testset_features)
-    accuracy_score = float(len(filter(lambda x: x == True, [testset_labels[i] == testset_predicts[i] for i in xrange(len(testset_predicts))]))) / float(len(testset_predicts))
-    print "The accruacy socre is ", accuracy_score
+    print 'Start training'
+    met = MaxEnt()
+    met.train(train_features, train_labels)

+    time_3 = time.time()
+    print 'training cost ', time_3 - time_2, ' second', '\n'

+    print 'Start predicting'
+    test_predict = met.predict(test_features)
+    time_4 = time.time()
+    print 'predicting cost ', time_4 - time_3, ' second', '\n'

+    score = accuracy_score(test_labels, test_predict)
+    print "The accuracy score is ", score