32
32
33
33
34
34
def load_svmlight_file (f , n_features = None , dtype = np .float64 ,
35
- multilabel = False , zero_based = "auto" , query_id = False ):
35
+ multilabel = False , zero_based = "auto" , query_id = False ,
36
+ offset = 0 , length = - 1 ):
36
37
"""Load datasets in the svmlight / libsvm format into sparse CSR matrix
37
38
38
39
This format is a text-based format, with one sample per line. It does
@@ -77,6 +78,8 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64,
77
78
bigger sliced dataset: each subset might not have examples of
78
79
every feature, hence the inferred shape might vary from one
79
80
slice to another.
81
+ n_features is only required if ``offset`` or ``length`` are passed a
82
+ non-default value.
80
83
81
84
multilabel : boolean, optional, default False
82
85
Samples may have several labels each (see
@@ -89,7 +92,10 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64,
89
92
If set to "auto", a heuristic check is applied to determine this from
90
93
the file contents. Both kinds of files occur "in the wild", but they
91
94
are unfortunately not self-identifying. Using "auto" or True should
92
- always be safe.
95
+ always be safe when no ``offset`` or ``length`` is passed.
96
+ If ``offset`` or ``length`` are passed, the "auto" mode falls back
97
+ to ``zero_based=True`` to avoid having the heuristic check yield
98
+ inconsistent results on different segments of the file.
93
99
94
100
query_id : boolean, default False
95
101
If True, will return the query_id array for each file.
@@ -98,6 +104,15 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64,
98
104
Data type of dataset to be loaded. This will be the data type of the
99
105
output numpy arrays ``X`` and ``y``.
100
106
107
+ offset : integer, optional, default 0
108
+ Ignore the first ``offset`` bytes by seeking forward, then
109
+ discarding the following bytes up until the next new line
110
+ character.
111
+
112
+ length : integer, optional, default -1
113
+ If strictly positive, stop reading any new line of data once the
114
+ position in the file has reached the (offset + length) bytes threshold.
115
+
101
116
Returns
102
117
-------
103
118
X : scipy.sparse matrix of shape (n_samples, n_features)
@@ -130,7 +145,7 @@ def get_data():
130
145
X, y = get_data()
131
146
"""
132
147
return tuple (load_svmlight_files ([f ], n_features , dtype , multilabel ,
133
- zero_based , query_id ))
148
+ zero_based , query_id , offset , length ))
134
149
135
150
136
151
def _gen_open (f ):
@@ -150,15 +165,18 @@ def _gen_open(f):
150
165
return open (f , "rb" )
151
166
152
167
153
- def _open_and_load (f , dtype , multilabel , zero_based , query_id ):
168
+ def _open_and_load (f , dtype , multilabel , zero_based , query_id ,
169
+ offset = 0 , length = - 1 ):
154
170
if hasattr (f , "read" ):
155
171
actual_dtype , data , ind , indptr , labels , query = \
156
- _load_svmlight_file (f , dtype , multilabel , zero_based , query_id )
172
+ _load_svmlight_file (f , dtype , multilabel , zero_based , query_id ,
173
+ offset , length )
157
174
# XXX remove closing when Python 2.7+/3.1+ required
158
175
else :
159
176
with closing (_gen_open (f )) as f :
160
177
actual_dtype , data , ind , indptr , labels , query = \
161
- _load_svmlight_file (f , dtype , multilabel , zero_based , query_id )
178
+ _load_svmlight_file (f , dtype , multilabel , zero_based , query_id ,
179
+ offset , length )
162
180
163
181
# convert from array.array, give data the right dtype
164
182
if not multilabel :
@@ -173,7 +191,8 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id):
173
191
174
192
175
193
def load_svmlight_files (files , n_features = None , dtype = np .float64 ,
176
- multilabel = False , zero_based = "auto" , query_id = False ):
194
+ multilabel = False , zero_based = "auto" , query_id = False ,
195
+ offset = 0 , length = - 1 ):
177
196
"""Load dataset from multiple files in SVMlight format
178
197
179
198
This function is equivalent to mapping load_svmlight_file over a list of
@@ -217,7 +236,10 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
217
236
If set to "auto", a heuristic check is applied to determine this from
218
237
the file contents. Both kinds of files occur "in the wild", but they
219
238
are unfortunately not self-identifying. Using "auto" or True should
220
- always be safe.
239
+ always be safe when no ``offset`` or ``length`` is passed.
240
+ If ``offset`` or ``length`` are passed, the "auto" mode falls back
241
+ to ``zero_based=True`` to avoid having the heuristic check yield
242
+ inconsistent results on different segments of the file.
221
243
222
244
query_id : boolean, defaults to False
223
245
If True, will return the query_id array for each file.
@@ -226,6 +248,15 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
226
248
Data type of dataset to be loaded. This will be the data type of the
227
249
output numpy arrays ``X`` and ``y``.
228
250
251
+ offset : integer, optional, default 0
252
+ Ignore the first ``offset`` bytes by seeking forward, then
253
+ discarding the following bytes up until the next new line
254
+ character.
255
+
256
+ length : integer, optional, default -1
257
+ If strictly positive, stop reading any new line of data once the
258
+ position in the file has reached the (offset + length) bytes threshold.
259
+
229
260
Returns
230
261
-------
231
262
[X1, y1, ..., Xn, yn]
@@ -246,16 +277,27 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
246
277
--------
247
278
load_svmlight_file
248
279
"""
249
- r = [_open_and_load (f , dtype , multilabel , bool (zero_based ), bool (query_id ))
280
+ if (offset != 0 or length > 0 ) and zero_based == "auto" :
281
+ # disable heuristic search to avoid getting inconsistent results on
282
+ # different segments of the file
283
+ zero_based = True
284
+
285
+ if (offset != 0 or length > 0 ) and n_features is None :
286
+ raise ValueError (
287
+ "n_features is required when offset or length is specified." )
288
+
289
+ r = [_open_and_load (f , dtype , multilabel , bool (zero_based ), bool (query_id ),
290
+ offset = offset , length = length )
250
291
for f in files ]
251
292
252
- if (zero_based is False
253
- or zero_based == "auto" and all (np .min (tmp [1 ]) > 0 for tmp in r )):
254
- for ind in r :
255
- indices = ind [ 1 ]
293
+ if (zero_based is False or
294
+ zero_based == "auto" and all (len ( tmp [ 1 ]) and np .min (tmp [1 ]) > 0
295
+ for tmp in r )) :
296
+ for _ , indices , _ , _ , _ in r :
256
297
indices -= 1
257
298
258
- n_f = max (ind [1 ].max () for ind in r ) + 1
299
+ n_f = max (ind [1 ].max () if len (ind [1 ]) else 0 for ind in r ) + 1
300
+
259
301
if n_features is None :
260
302
n_features = n_f
261
303
elif n_features < n_f :
0 commit comments