31
31
32
32
33
33
def load_svmlight_file(f, n_features=None, dtype=np.float64,
                       multilabel=False, zero_based="auto", query_id=False,
                       offset=0, length=-1):
    """Load datasets in the svmlight / libsvm format into sparse CSR matrix.

    This format is a text-based format, with one sample per line. It does
    not store zero valued features, hence it is suitable for sparse
    datasets.

    Parameters
    ----------
    f : str, file-like or int
        The file to load. If a path or descriptor is given it is opened
        and closed by this function; an already-open file object is read
        as-is.

    n_features : int or None, optional, default None
        The number of features to use. If None, it is inferred from the
        data. Beware that when loading one slice of a bigger dataset,
        each subset might not have examples of every feature, hence the
        inferred shape might vary from one slice to another.
        n_features is only required if ``offset`` or ``length`` are
        passed a non-default value.

    dtype : numpy data type, default np.float64
        Data type of dataset to be loaded. This will be the data type of
        the output numpy arrays ``X`` and ``y``.

    multilabel : boolean, optional, default False
        Samples may have several labels each.

    zero_based : boolean or "auto", optional, default "auto"
        Whether column indices in f are zero-based (True) or one-based
        (False). If set to "auto", a heuristic check is applied to
        determine this from the file contents. Both kinds of files occur
        "in the wild", but they are unfortunately not self-identifying.
        Using "auto" or True should always be safe when no ``offset`` or
        ``length`` is passed.
        If ``offset`` or ``length`` are passed, the "auto" mode falls
        back to ``zero_based=True`` to avoid having the heuristic check
        yield inconsistent results on different segments of the file.

    query_id : boolean, default False
        If True, will return the query_id array for each file.

    offset : integer, optional, default 0
        Ignore the offset first bytes by seeking forward, then
        discarding the following bytes up until the next new line
        character.

    length : integer, optional, default -1
        If strictly positive, stop reading any new line of data once the
        position in the file has reached the (offset + length) bytes
        threshold.

    Returns
    -------
    X : scipy.sparse matrix of shape (n_samples, n_features)

    y : ndarray of shape (n_samples,), or, in the multilabel case, a
        list of tuples of length n_samples.

    query_id : array of shape (n_samples,)
        The query_id for each sample. Only returned when ``query_id``
        is set to True.

    See also
    --------
    load_svmlight_files : similar function for loading multiple files in
        this format, enforcing the same number of features/columns on
        all of them.

    Examples
    --------
    To use joblib.Memory to cache the svmlight file::

        from sklearn.externals.joblib import Memory
        from sklearn.datasets import load_svmlight_file
        mem = Memory("./mycache")

        @mem.cache
        def get_data():
            data = load_svmlight_file("mysvmlightfile")
            return data[0], data[1]

        X, y = get_data()
    """
    # Thin wrapper: delegate all work to the multi-file loader with a
    # one-element file list, and unpack its result list into a tuple.
    return tuple(load_svmlight_files([f], n_features, dtype, multilabel,
                                     zero_based, query_id, offset, length))
133
148
134
149
135
150
def _gen_open (f ):
@@ -149,15 +164,18 @@ def _gen_open(f):
149
164
return open (f , "rb" )
150
165
151
166
152
- def _open_and_load (f , dtype , multilabel , zero_based , query_id ):
167
+ def _open_and_load (f , dtype , multilabel , zero_based , query_id ,
168
+ offset = 0 , length = - 1 ):
153
169
if hasattr (f , "read" ):
154
170
actual_dtype , data , ind , indptr , labels , query = \
155
- _load_svmlight_file (f , dtype , multilabel , zero_based , query_id )
171
+ _load_svmlight_file (f , dtype , multilabel , zero_based , query_id ,
172
+ offset , length )
156
173
# XXX remove closing when Python 2.7+/3.1+ required
157
174
else :
158
175
with closing (_gen_open (f )) as f :
159
176
actual_dtype , data , ind , indptr , labels , query = \
160
- _load_svmlight_file (f , dtype , multilabel , zero_based , query_id )
177
+ _load_svmlight_file (f , dtype , multilabel , zero_based , query_id ,
178
+ offset , length )
161
179
162
180
# convert from array.array, give data the right dtype
163
181
if not multilabel :
@@ -172,7 +190,8 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id):
172
190
173
191
174
192
def load_svmlight_files (files , n_features = None , dtype = np .float64 ,
175
- multilabel = False , zero_based = "auto" , query_id = False ):
193
+ multilabel = False , zero_based = "auto" , query_id = False ,
194
+ offset = 0 , length = - 1 ):
176
195
"""Load dataset from multiple files in SVMlight format
177
196
178
197
This function is equivalent to mapping load_svmlight_file over a list of
@@ -216,7 +235,10 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
216
235
If set to "auto", a heuristic check is applied to determine this from
217
236
the file contents. Both kinds of files occur "in the wild", but they
218
237
are unfortunately not self-identifying. Using "auto" or True should
219
- always be safe.
238
+ always be safe when no offset or length is passed.
239
+ If offset or length are passed, the "auto" mode falls back
240
+ to zero_based=True to avoid having the heuristic check yield
241
+ inconsistent results on different segments of the file.
220
242
221
243
query_id : boolean, defaults to False
222
244
If True, will return the query_id array for each file.
@@ -225,6 +247,15 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
225
247
Data type of dataset to be loaded. This will be the data type of the
226
248
output numpy arrays ``X`` and ``y``.
227
249
250
+ offset : integer, optional, default 0
251
+ Ignore the offset first bytes by seeking forward, then
252
+ discarding the following bytes up until the next new line
253
+ character.
254
+
255
+ length : integer, optional, default -1
256
+ If strictly positive, stop reading any new line of data once the
257
+ position in the file has reached the (offset + length) bytes threshold.
258
+
228
259
Returns
229
260
-------
230
261
[X1, y1, ..., Xn, yn]
@@ -245,16 +276,27 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
245
276
--------
246
277
load_svmlight_file
247
278
"""
248
- r = [_open_and_load (f , dtype , multilabel , bool (zero_based ), bool (query_id ))
279
+ if (offset != 0 or length > 0 ) and zero_based == "auto" :
280
+ # disable heuristic search to avoid getting inconsistent results on
281
+ # different segments of the file
282
+ zero_based = True
283
+
284
+ if (offset != 0 or length > 0 ) and n_features is None :
285
+ raise ValueError (
286
+ "n_features is required when offset or length is specified." )
287
+
288
+ r = [_open_and_load (f , dtype , multilabel , bool (zero_based ), bool (query_id ),
289
+ offset = offset , length = length )
249
290
for f in files ]
250
291
251
- if (zero_based is False
252
- or zero_based == "auto" and all (np .min (tmp [1 ]) > 0 for tmp in r )):
253
- for ind in r :
254
- indices = ind [ 1 ]
292
+ if (zero_based is False or
293
+ zero_based == "auto" and all (len ( tmp [ 1 ]) and np .min (tmp [1 ]) > 0
294
+ for tmp in r )) :
295
+ for _ , indices , _ , _ , _ in r :
255
296
indices -= 1
256
297
257
- n_f = max (ind [1 ].max () for ind in r ) + 1
298
+ n_f = max (ind [1 ].max () if len (ind [1 ]) else 0 for ind in r ) + 1
299
+
258
300
if n_features is None :
259
301
n_features = n_f
260
302
elif n_features < n_f :
0 commit comments