@@ -64,10 +64,15 @@ def read_rating(self, path):

     # Generate the word embedding matrix
     def read_pretrained_word2vec(self, path, vocab, dim):
+        parent_path = '/'.join(path.split('/')[:-1]) + '/'
+        if os.path.isfile(parent_path + 'preW.all'):
+            print('Load pretrained_word2vec from preW.all')
+            W = pickle.load(open(parent_path + 'preW.all', 'rb'))
+            return W
         if os.path.isfile(path):
             raw_word2vec = open(path, 'r')
         else:
-            print("Path (word2vec) is wrong!")
+            print("Path (word2vec) is wrong!")
             sys.exit()

         word2vec_dic = {}
@@ -79,7 +84,7 @@ def read_pretrained_word2vec(self, path, vocab, dim):
             _word = tmp[0]
             _vec = np.array(tmp[1:], dtype=float)
             if _vec.shape[0] != dim:
-                print("Mismatch the dimension of pre-trained word vector with word embedding dimension!")
+                print("Mismatch the dimension of pre-trained word vector with word embedding dimension!")
                 sys.exit()
             word2vec_dic[_word] = _vec
             mean = mean + _vec
@@ -96,7 +101,10 @@ def read_pretrained_word2vec(self, path, vocab, dim):
            else:
                W[i + 1] = np.random.normal(mean, 0.1, size=dim)

-        print("%d words exist in the given pretrained model" % count)
+        print("%d words exist in the given pretrained model" % count)
+        print('Saving preW.all file.')
+        pickle.dump(W, open(parent_path + 'preW.all', 'wb'))
+        print('Done')

        return W

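The two hunks above add a load-or-build cache around the embedding matrix: if preW.all exists next to the word2vec file it is unpickled and returned immediately, otherwise the matrix is built and pickled for the next run. A minimal sketch of the same pattern, assuming a hypothetical build_fn that constructs the matrix; unlike the inline open() calls in the diff, it uses context managers so the file handles are closed:

import os
import pickle

def cached_matrix(cache_path, build_fn):
    # Reuse a previously pickled matrix when one exists on disk ...
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    # ... otherwise build it once and persist it for later runs.
    W = build_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(W, f)
    return W

# Usage sketch (hypothetical builder):
# W = cached_matrix(parent_path + 'preW.all', build_embedding_matrix)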
@@ -108,8 +116,10 @@ def split_data(self, ratio, R):
            np.random.shuffle(user_rating)
            train.append((i, user_rating[0]))

+        # "*train" unpacks the list of (user, item) pairs for zip
        remain_item = set(range(R.shape[1])) - set(list(zip(*train))[1])

+        # make sure the training set contains at least one rating for every user and item
        for j in remain_item:
            item_rating = R.tocsc().T[j].nonzero()[1]
            np.random.shuffle(item_rating)
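The two added comments document the coverage pass in split_data: the first loop seeds the training set with one random rating per user, and zip(*train) unpacks the (user, item) pairs so the second loop can find items that still have no training rating and move one in. A standalone sketch of that guarantee, assuming R is a scipy.sparse rating matrix in CSR form and every user and item has at least one rating:

import numpy as np
from scipy.sparse import csr_matrix

def seed_train_pairs(R):
    train = []
    for i in range(R.shape[0]):
        user_items = R[i].nonzero()[1]    # items rated by user i
        np.random.shuffle(user_items)
        train.append((i, user_items[0]))  # one rating per user
    # items missed by the first pass get one of their ratings added,
    # so no user row or item column is empty in the training set
    for j in set(range(R.shape[1])) - set(list(zip(*train))[1]):
        item_users = R.tocsc().T[j].nonzero()[1]  # users who rated item j
        np.random.shuffle(item_users)
        train.append((item_users[0], j))
    return train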
@@ -135,10 +145,10 @@ def split_data(self, ratio, R):
        trainset_u_idx = set(trainset_u_idx)
        trainset_i_idx = set(trainset_i_idx)
        if len(trainset_u_idx) != R.shape[0] or len(trainset_i_idx) != R.shape[1]:
-            print("Fatal error in split function. Check your data again or contact authors")
+            print("Fatal error in split function. Check your data again or contact authors")
            sys.exit()

-        print("Finish constructing training set and test set")
+        print("Finish constructing training set and test set")
        return train, valid, test

    def generate_train_valid_test_file_from_R(self, path, R, ratio):
@@ -153,7 +163,7 @@ def generate_train_valid_test_file_from_R(self, path, R, ratio):
        - ratio: (1-ratio), ratio/2 and ratio/2 of the entire dataset (R) will be training, valid and test set, respectively
        '''
        train, valid, test = self.split_data(ratio, R)
-        print("Save training set and test set to %s..." % path)
+        print("Save training set and test set to %s..." % path)
        if not os.path.exists(path):
            os.makedirs(path)

@@ -248,7 +258,7 @@ def generate_train_valid_test_file_from_R(self, path, R, ratio):
        formatted_item_test = []

        for j in range(R.shape[1]):
-            if i in item_ratings_train:
+            if j in item_ratings_train:
                formatted = [str(len(item_ratings_train[j]))]
                formatted.extend(["%d:%.1f" % (i, R_lil[i, j])
                                  for i in sorted(item_ratings_train[j])])
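The substantive fix in this hunk is the dictionary key: the loop iterates items with j, so the membership test must be j in item_ratings_train; testing i reused whatever value i held from an earlier loop and keyed the check on a user index. A toy illustration of the failure, with hypothetical data:

# item_ratings_train maps item index -> user indices (toy data)
item_ratings_train = {0: [1, 3], 2: [0]}
i = 7  # stale value left over from a previous user loop

for j in range(3):
    buggy = i in item_ratings_train    # always False: 7 is not an item key
    fixed = j in item_ratings_train    # True for items 0 and 2
    print(j, buggy, fixed)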
@@ -279,9 +289,9 @@ def generate_train_valid_test_file_from_R(self, path, R, ratio):
        f_train_item.close()
        f_valid_item.close()
        f_test_item.close()
-        print("\t train_item.dat, valid_item.dat, test_item.dat files are generated.")
+        print("\t train_item.dat, valid_item.dat, test_item.dat files are generated.")

-        print("Done!")
+        print("Done!")

    def generate_CTRCDLformat_content_file_from_D_all(self, path, D_all):
        '''
@@ -378,6 +388,7 @@ def preprocess(self, path_rating, path_itemtext, min_rating,
        item = []
        rating = []

+        # collect user/item/rating triplets; converted to a CSR sparse matrix later
        for line in all_line:
            tmp = line.split('::')
            u = tmp[0]
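The added comment anticipates what happens after this loop: the user, item and rating lists are coordinate triplets that later become a sparse rating matrix. A minimal sketch of that packing step with scipy (toy indices; in the real code the ids come from the '::'-separated rating file):

import numpy as np
from scipy.sparse import csr_matrix

user = [0, 0, 1, 2]              # row indices
item = [1, 2, 0, 2]              # column indices
rating = [4.0, 3.0, 5.0, 1.0]    # values

# csr_matrix((data, (rows, cols))) stores only the nonzero entries
R = csr_matrix((rating, (user, item)), shape=(3, 3))
print(R.toarray())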
@@ -433,7 +444,7 @@ def preprocess(self, path_rating, path_itemtext, min_rating,

        # Make vocabulary by document
        vectorizer = TfidfVectorizer(max_df=_max_df, stop_words={
-            'english'}, max_features=_vocab_size)
+            'english'}, max_features=_vocab_size)
        Raw_X = [map_idtoplot[i] for i in range(R.shape[1])]
        vectorizer.fit(Raw_X)
        vocab = vectorizer.vocabulary_
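A caveat on the lines this last hunk only re-wraps: scikit-learn's TfidfVectorizer expects stop_words='english' (a string) to activate its built-in English stop list; the set {'english'} in the code is instead treated as a custom stop list whose only entry is the literal token "english". A minimal sketch of the vocabulary step using the string form:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["a plot about a heist", "a plot about romance"]  # stand-in documents

# the string 'english' selects the built-in stop-word list; terms appearing
# in more than max_df of the documents are dropped as corpus-specific noise
vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', max_features=8000)
vectorizer.fit(docs)
vocab = vectorizer.vocabulary_   # dict mapping term -> column index
print(sorted(vocab))             # ['heist', 'romance']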