@@ -242,6 +242,122 @@ def load_files(container_path, description=None, categories=None,
242
242
DESCR = description )
243
243
244
244
245
def load_data(module_path, data_file_name):
    """Load a CSV dataset from ``module_path/data/data_file_name``.

    The first row of the file is a header: the number of samples, the number
    of features, and then the target names. Every following row holds the
    feature values of one sample followed by its integer target.

    Parameters
    ----------
    module_path : string
        Directory that contains the ``data`` sub-directory.

    data_file_name : string
        Name of the csv file to be loaded from
        ``module_path/data/data_file_name``. For example 'wine_data.csv'.

    Returns
    -------
    data : Numpy Array
        A 2D array with each row representing one sample and each column
        representing the features of a given sample.

    target : Numpy Array
        A 1D array holding target variables for all the samples in `data`.
        For example target[0] is the target variable for data[0].

    target_names : Numpy Array
        A 1D array containing the names of the classifications. For example
        target_names[0] is the name of the target[0] class.

    Notes
    -----
    ``data`` and ``target`` are allocated with ``np.empty`` from the counts
    in the header, so rows that are absent from the file are left
    uninitialised.
    """
    with open(join(module_path, 'data', data_file_name)) as csv_file:
        data_file = csv.reader(csv_file)
        header = next(data_file)
        n_samples = int(header[0])
        n_features = int(header[1])
        target_names = np.array(header[2:])
        data = np.empty((n_samples, n_features))
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin yields the same dtype.
        target = np.empty((n_samples,), dtype=int)

        for i, row in enumerate(data_file):
            # The last column is the target, everything before it a feature.
            data[i] = np.asarray(row[:-1], dtype=np.float64)
            target[i] = np.asarray(row[-1], dtype=int)

    return data, target, target_names
281
+
282
+
283
def load_wine(return_X_y=False):
    """Load and return the wine dataset (classification).

    .. versionadded:: 0.18

    The wine dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes             3
    Samples per class   [59,71,48]
    Samples total       178
    Dimensionality      13
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    return_X_y : boolean, default=False.
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'target_names', the meaning of the labels, 'feature_names', the
        meaning of the features, and 'DESCR', the
        full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

    Notes
    -----
    The bundled copy of the UCI ML Wine Data Set was downloaded and modified
    to fit the standard format from:
    https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

    Examples
    --------
    Let's say you are interested in the samples 10, 80, and 140, and want to
    know their class name.

    >>> from sklearn.datasets import load_wine
    >>> data = load_wine()
    >>> data.target[[10, 80, 140]]
    array([0, 1, 2])
    >>> list(data.target_names)
    ['class_0', 'class_1', 'class_2']
    """
    base_dir = dirname(__file__)
    data, target, target_names = load_data(base_dir, 'wine_data.csv')

    # The description is read before the ``return_X_y`` check, so it is
    # loaded on both return paths.
    descr_path = join(base_dir, 'descr', 'wine_data.rst')
    with open(descr_path) as descr_file:
        description = descr_file.read()

    if return_X_y:
        return data, target

    feature_names = [
        'alcohol',
        'malic_acid',
        'ash',
        'alcalinity_of_ash',
        'magnesium',
        'total_phenols',
        'flavanoids',
        'nonflavanoid_phenols',
        'proanthocyanins',
        'color_intensity',
        'hue',
        'od280/od315_of_diluted_wines',
        'proline',
    ]
    return Bunch(data=data, target=target,
                 target_names=target_names,
                 DESCR=description,
                 feature_names=feature_names)
359
+
360
+
245
361
def load_iris (return_X_y = False ):
246
362
"""Load and return the iris dataset (classification).
247
363
@@ -292,18 +408,7 @@ def load_iris(return_X_y=False):
292
408
['setosa', 'versicolor', 'virginica']
293
409
"""
294
410
module_path = dirname (__file__ )
295
- with open (join (module_path , 'data' , 'iris.csv' )) as csv_file :
296
- data_file = csv .reader (csv_file )
297
- temp = next (data_file )
298
- n_samples = int (temp [0 ])
299
- n_features = int (temp [1 ])
300
- target_names = np .array (temp [2 :])
301
- data = np .empty ((n_samples , n_features ))
302
- target = np .empty ((n_samples ,), dtype = np .int )
303
-
304
- for i , ir in enumerate (data_file ):
305
- data [i ] = np .asarray (ir [:- 1 ], dtype = np .float64 )
306
- target [i ] = np .asarray (ir [- 1 ], dtype = np .int )
411
+ data , target , target_names = load_data (module_path , 'iris.csv' )
307
412
308
413
with open (join (module_path , 'descr' , 'iris.rst' )) as rst_file :
309
414
fdescr = rst_file .read ()
@@ -370,18 +475,7 @@ def load_breast_cancer(return_X_y=False):
370
475
['malignant', 'benign']
371
476
"""
372
477
module_path = dirname (__file__ )
373
- with open (join (module_path , 'data' , 'breast_cancer.csv' )) as csv_file :
374
- data_file = csv .reader (csv_file )
375
- first_line = next (data_file )
376
- n_samples = int (first_line [0 ])
377
- n_features = int (first_line [1 ])
378
- target_names = np .array (first_line [2 :4 ])
379
- data = np .empty ((n_samples , n_features ))
380
- target = np .empty ((n_samples ,), dtype = np .int )
381
-
382
- for count , value in enumerate (data_file ):
383
- data [count ] = np .asarray (value [:- 1 ], dtype = np .float64 )
384
- target [count ] = np .asarray (value [- 1 ], dtype = np .int )
478
+ data , target , target_names = load_data (module_path , 'breast_cancer.csv' )
385
479
386
480
with open (join (module_path , 'descr' , 'breast_cancer.rst' )) as rst_file :
387
481
fdescr = rst_file .read ()
@@ -517,12 +611,12 @@ def load_diabetes(return_X_y=False):
517
611
518
612
(data, target) : tuple if ``return_X_y`` is True
519
613
520
- .. versionadded:: 0.18
614
+ .. versionadded:: 0.18
521
615
"""
522
616
base_dir = join (dirname (__file__ ), 'data' )
523
617
data = np .loadtxt (join (base_dir , 'diabetes_data.csv.gz' ))
524
618
target = np .loadtxt (join (base_dir , 'diabetes_target.csv.gz' ))
525
-
619
+
526
620
if return_X_y :
527
621
return data , target
528
622
@@ -554,7 +648,7 @@ def load_linnerud(return_X_y=False):
554
648
'targets', the two multivariate datasets, with 'data' corresponding to
555
649
the exercise and 'targets' corresponding to the physiological
556
650
measurements, as well as 'feature_names' and 'target_names'.
557
-
651
+
558
652
(data, target) : tuple if ``return_X_y`` is True
559
653
560
654
.. versionadded:: 0.18
@@ -608,7 +702,7 @@ def load_boston(return_X_y=False):
608
702
609
703
(data, target) : tuple if ``return_X_y`` is True
610
704
611
- .. versionadded:: 0.18
705
+ .. versionadded:: 0.18
612
706
613
707
Examples
614
708
--------
0 commit comments