from pickle import load
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def load_doc(filename):
    """Read a text file into a string.

    Args:
        filename: path of the text file

    Returns:
        string, the contents of the text file
    """
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


def to_list(captions):
    """Flatten a dict (key: file name, value: list of image captions) into one list of captions.

    Args:
        captions: a dict whose keys are file names and whose values are lists of image captions

    Returns:
        list of all image captions
    """
    all_desc = list()
    # iterate over every image
    for key in captions.keys():
        all_desc.extend(captions[key])
    return all_desc


def get_max_length(captions):
    """Compute the length of the longest caption in a captions dict.

    Args:
        captions: a dict whose keys are file names (without the .jpg suffix) and whose values are lists of image captions

    Returns:
        length, in words, of the longest caption
    """
    lines = to_list(captions)
    return max(len(d.split()) for d in lines)
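
# A minimal usage sketch for to_list / get_max_length; the captions dict below
# is made up for illustration:
#
#   captions = {'1235345': ['startseq one bird on tree endseq'],
#               '1234546': ['startseq one boy run across water endseq']}
#   to_list(captions)         # -> both caption strings in one flat list
#   get_max_length(captions)  # -> 7 (the second caption has seven words)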


def load_set(filename):
    """Load a set of image names from a text file.

    Args:
        filename: a text file in which every line holds one image file name (with the .jpg suffix)

    Returns:
        set of file names with the .jpg suffix removed
    """
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)
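
# Illustrative call for load_set, assuming a split file whose lines look like
# Flickr8k entries such as '2513260012_03d33305cf.jpg':
#
#   load_set('Flickr_8k.trainImages.txt')
#   # -> {'2513260012_03d33305cf', ...}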


def load_image_names(filename):
    """Load a set of image names from a text file (equivalent to load_set above).

    :param filename: a text file in which every line holds one image file name (with the .jpg suffix)
    :return: set of file names with the .jpg suffix removed
    """
    doc = load_doc(filename)
    data_set = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        data_set.append(identifier)
    return set(data_set)


def load_clean_captions(filename, dataset):
    """Wrap every image caption with 'startseq' and 'endseq', marking where caption generation starts and stops.

    Args:
        filename: a text file in which every line holds an image name followed by its already-cleaned caption
        dataset: list of image names

    Returns:
        dict, key is the image name, value is the list of its captions wrapped with 'startseq' and 'endseq'
    """
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # skip empty lines, which would otherwise raise an IndexError below
        if len(tokens) < 1:
            continue
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions
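
# Illustrative input/output for load_clean_captions, assuming descriptions.txt
# holds lines of the form '<image_id> <cleaned caption>':
#
#   # descriptions.txt contains the line: 1235345 one bird on tree
#   load_clean_captions('descriptions.txt', ['1235345'])
#   # -> {'1235345': ['startseq one bird on tree endseq']}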


def load_photo_features(filename, dataset):
    """Load the features of the given images from a pickled feature file.

    Args:
        filename: name of the file holding the image features; once loaded it is a dict
            whose keys are 'Flicker8k_Dataset/' + file name and whose values are the
            features of the corresponding images
        dataset: list of image file names

    Returns:
        dict of image features, key is the file name,
        value is the features of the corresponding image
    """
    # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features
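
# Illustrative usage for load_photo_features, assuming 'features.pkl' pickles a
# {image_name: feature_array} dict (e.g. CNN encoder outputs of shape (1, 4096)):
#
#   features = load_photo_features('features.pkl', ['1235345'])
#   features['1235345'].shape  # -> (1, 4096)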


# Train the model on the data
# Read a set of image ids (same behavior as load_set above)
def load_ids(fn):
    doc = load_doc(fn)
    ret = list()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        image_id = line.split('.')[0]
        ret.append(image_id)
    return set(ret)


def create_sequences(tokenizer, max_length, descriptions, photos_features, vocab_size):
    """Build one set of LSTM training inputs from caption lists and image features.

    Args:
        tokenizer: keras.preprocessing.text.Tokenizer for converting between English words and integers
        max_length: length of the longest caption in the training set
        descriptions: dict, key is the image name (without the .jpg suffix), value is a list of several different captions for the image
        photos_features: dict, key is the image name (without the .jpg suffix), value is a numpy array of image features
        vocab_size: number of words in the training-set vocabulary

    Returns:
        tuple:
            the first element is a numpy array whose elements are image features, each itself a numpy.array
            the second element is a numpy array whose elements are caption prefixes, each itself a numpy.array
            the third element is a numpy array whose elements are the next word of a caption (to be predicted from the image features and the caption prefix), each also a numpy.array

    Examples:
        from pickle import load
        tokenizer = load(open('tokenizer.pkl', 'rb'))
        max_length = 6
        descriptions = {'1235345':['startseq one bird on tree endseq', "startseq red bird on tree endseq"],
                        '1234546':['startseq one boy play water endseq', "startseq one boy run across water endseq"]}
        photo_features = {'1235345':[ 0.434, 0.534, 0.212, 0.98 ],
                          '1234546':[ 0.534, 0.634, 0.712, 0.28 ]}
        vocab_size = 7378
        print(create_sequences(tokenizer, max_length, descriptions, photo_features, vocab_size))
        (array([[ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ]]),
         array([[ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 59],
                [ 0, 0, 0, 2, 59, 254],
                [ 0, 0, 2, 59, 254, 6],
                [ 0, 2, 59, 254, 6, 134],
                [ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 26],
                [ 0, 0, 0, 2, 26, 254],
                [ 0, 0, 2, 26, 254, 6],
                [ 0, 2, 26, 254, 6, 134],
                [ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 59],
                [ 0, 0, 0, 2, 59, 16],
                [ 0, 0, 2, 59, 16, 82],
                [ 0, 2, 59, 16, 82, 24],
                [ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 59],
                [ 0, 0, 0, 2, 59, 16],
                [ 0, 0, 2, 59, 16, 165],
                [ 0, 2, 59, 16, 165, 127],
                [ 2, 59, 16, 165, 127, 24]]),
         array([[ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.],
                ...,
                [ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.]]))
    """
    X1, X2, y = list(), list(), list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            # encode the caption into a sequence of integer word ids
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                # split into an input prefix and the next word to predict
                in_seq, out_seq = seq[:i], seq[i]
                # pad in_seq so that its length is max_length
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # one-hot encode the output word over the vocabulary
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # each stored feature carries a leading axis of length 1; [0] drops it
                X1.append(photos_features[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return array(X1), array(X2), array(y)
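
# End-to-end wiring sketch for this module; the file names are assumptions
# about the local layout (compare the __main__ block below):
#
#   train = load_image_names('Flickr_8k.trainImages.txt')
#   descriptions = load_clean_captions('descriptions.txt', train)
#   features = load_photo_features('features.pkl', train)
#   tokenizer = load(open('tokenizer.pkl', 'rb'))      # as in the docstring example
#   vocab_size = len(tokenizer.word_index) + 1
#   max_length = get_max_length(descriptions)
#   X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features, vocab_size)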


if __name__ == '__main__':
    path = 'D:/learningDoc/CVSource/CVSource/cv_action_codes/Homework/homework2/'
    train_image_names = load_image_names(path + 'task4/Flickr_8k.trainImages.txt')
    print("train_image_names=%d" % len(train_image_names))
    descriptions = load_clean_captions(path + 'task5/descriptions.txt', train_image_names)
    print(descriptions)