Skip to content

Commit 09d7601

Browse files
committed
final finish maxENT
1 parent d40d83b commit 09d7601

File tree

8 files changed

+282
-132
lines changed

8 files changed

+282
-132
lines changed

.idea/inspectionProfiles/Project_Default.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/inspectionProfiles/profiles_settings.xml

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/mnist_test.iml

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/workspace.xml

Lines changed: 204 additions & 72 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
@Date: 09-08-16
44
@Email: wendesi@foxmail.com
55
@Last modified by: WenDesi
6-
@Last modified time: 08-11-16
6+
@Last modified time: 09-11-16
77
-->
88

99

@@ -42,4 +42,5 @@
4242
<br>代码:[logistic_regression/logistic_regression.py](https://github.com/WenDesi/lihang_book_algorithm/blob/master/logistic_regression/logistic_regression.py)
4343

4444
### 第六章 最大熵模型
45-
正在施工中。。。
45+
博客:[李航《统计学习方法》第六章——用Python实现最大熵模型(MNIST数据集)](http://blog.csdn.net/wds2006sdo/article/details/53106579)
46+
<br>代码:[maxENT/maxENT.py](https://github.com/WenDesi/lihang_book_algorithm/blob/master/maxENT/maxENT.py)

maxENT/maxENT.py

Lines changed: 57 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,20 @@
33
# @Date: 05-11-16
44
# @Email: wendesi@foxmail.com
55
# @Last modified by: WenDesi
6-
# @Last modified time: 06-11-16
6+
# @Last modified time: 09-11-16
77

88

9+
import pandas as pd
10+
import numpy as np
11+
12+
import time
913
import math
1014
import random
1115

1216
from collections import defaultdict
1317

18+
from sklearn.cross_validation import train_test_split
19+
from sklearn.metrics import accuracy_score
1420

1521

1622
class MaxEnt(object):
@@ -21,9 +27,10 @@ def init_params(self, X, Y):
2127

2228
self.cal_Pxy_Px(X, Y)
2329

24-
self.N = len(X)
25-
self.n = len(self.Pxy)
26-
self.M = 2.0
30+
self.N = len(X) # 训练集大小
31+
self.n = len(self.Pxy) # 书中(x,y)对数
32+
self.M = 10000.0 # 书91页那个M,但实际操作中并没有用那个值
33+
# 可认为是学习速率
2734

2835
self.build_dict()
2936
self.cal_EPxy()
@@ -49,29 +56,37 @@ def cal_Pxy_Px(self, X, Y):
4956
self.Px[x] += 1
5057

5158
def cal_EPxy(self):
59+
'''
60+
计算书中82页最下面那个期望
61+
'''
5262
self.EPxy = defaultdict(float)
5363
for id in xrange(self.n):
5464
(x, y) = self.id2xy[id]
5565
self.EPxy[id] = float(self.Pxy[(x, y)]) / float(self.N)
5666

57-
def cal_pyx(self,X,y):
67+
def cal_pyx(self, X, y):
5868
result = 0.0
5969
for x in X:
6070
if self.fxy(x, y):
6171
id = self.xy2id[(x, y)]
6272
result += self.w[id]
63-
return (math.exp(result),y)
73+
return (math.exp(result), y)
6474

6575
def cal_probality(self, X):
66-
Pyxs = [(self.cal_pyx(X,y)) for y in self.Y_]
76+
'''
77+
计算书85页公式6.22
78+
'''
79+
Pyxs = [(self.cal_pyx(X, y)) for y in self.Y_]
6780
Z = sum([prob for prob, y in Pyxs])
68-
return [(prob/Z,y) for prob,y in Pyxs]
69-
81+
return [(prob / Z, y) for prob, y in Pyxs]
7082

7183
def cal_EPx(self):
84+
'''
85+
计算书83页最上面那个期望
86+
'''
7287
self.EPx = [0.0 for i in xrange(self.n)]
7388

74-
for i,X in enumerate(self.X_):
89+
for i, X in enumerate(self.X_):
7590
Pyxs = self.cal_probality(X)
7691

7792
for x in X:
@@ -98,8 +113,8 @@ def train(self, X, Y):
98113
sigma = 1 / self.M * math.log(self.EPxy[i] / self.EPx[i])
99114
sigmas.append(sigma)
100115

101-
if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
102-
break
116+
# if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
117+
# break
103118

104119
self.w = [self.w[i] + sigmas[i] for i in xrange(self.n)]
105120

@@ -111,67 +126,53 @@ def predict(self, testset):
111126
return results
112127

113128

114-
def build_dataset(label,original_posins,radius,size):
115-
datasets = []
116-
dim = len(original_posins)
117-
118-
for i in xrange(size):
119-
dataset = [label]
120-
for j in xrange(dim):
121-
point = random.randint(0,2*radius)-radius+original_posins[j]
122-
dataset.append(point)
123-
datasets.append(dataset)
124-
125-
return datasets
126-
127-
128-
129129
def rebuild_features(features):
130+
'''
131+
将原feature的(a0,a1,a2,a3,a4,...)
132+
变成 (0_a0,1_a1,2_a2,3_a3,4_a4,...)形式
133+
'''
130134
new_features = []
131135
for feature in features:
132136
new_feature = []
133-
for i,f in enumerate(feature):
134-
new_feature.append(str(i)+'_'+str(f))
137+
for i, f in enumerate(feature):
138+
new_feature.append(str(i) + '_' + str(f))
135139
new_features.append(new_feature)
136140
return new_features
137141

138142

139-
140-
141-
142-
143143
if __name__ == "__main__":
144144

145-
# 构建训练集
146-
trainset1 = build_dataset(0,[0,0],10,100)
147-
trainset2 = build_dataset(1,[30,30],10,100)
145+
print 'Start read data'
148146

149-
trainset = trainset1
150-
trainset.extend(trainset2)
151-
random.shuffle(trainset)
147+
time_1 = time.time()
152148

153-
trainset_features = rebuild_features(map(lambda x:x[1:], trainset))
154-
trainset_labels = map(lambda x:x[0], trainset)
149+
raw_data = pd.read_csv('../data/train_binary.csv', header=0)
150+
data = raw_data.values
155151

156-
# 训练
157-
met = MaxEnt()
158-
met.train(trainset_features,trainset_labels)
152+
imgs = data[0::, 1::]
153+
labels = data[::, 0]
159154

160-
# 构建测试集
161-
testset1 = build_dataset(0,[0,0],15,500)
162-
testset2 = build_dataset(1,[30,30],15,500)
155+
# 选取 2/3 数据作为训练集, 1/3 数据作为测试集
156+
train_features, test_features, train_labels, test_labels = train_test_split(
157+
imgs, labels, test_size=0.33, random_state=23323)
163158

164-
testset = testset1
165-
testset.extend(testset2)
166-
random.shuffle(testset)
159+
train_features = rebuild_features(train_features)
160+
test_features = rebuild_features(test_features)
167161

168-
testset_features = rebuild_features(map(lambda x:x[1:], testset))
169-
testset_labels = map(lambda x:x[0], testset)
162+
time_2 = time.time()
163+
print 'read data cost ', time_2 - time_1, ' second', '\n'
170164

171-
# 测试
172-
testset_predicts = met.predict(testset_features)
173-
accuracy_score = float(len(filter(lambda x:x==True,[testset_labels[i]==testset_predicts[i] for i in xrange(len(testset_predicts))])))/float(len(testset_predicts))
174-
print "The accruacy socre is ", accuracy_score
165+
print 'Start training'
166+
met = MaxEnt()
167+
met.train(train_features, train_labels)
175168

169+
time_3 = time.time()
170+
print 'training cost ', time_3 - time_2, ' second', '\n'
176171

172+
print 'Start predicting'
173+
test_predict = met.predict(test_features)
174+
time_4 = time.time()
175+
print 'predicting cost ', time_4 - time_3, ' second', '\n'
177176

177+
score = accuracy_score(test_labels, test_predict)
178+
print "The accruacy socre is ", score

0 commit comments

Comments
 (0)