[MRG+1] Issue #7779 Fixed with new function (datasets.load_wine) added. #7912
Changes from all commits
@@ -0,0 +1,131 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
=========================================================
Importance of Feature Scaling
=========================================================

Feature scaling through standardization (or Z-score normalization)
can be an important preprocessing step for many machine learning
algorithms. Standardization involves rescaling the features such
that they have the properties of a standard normal distribution
with a mean of zero and a standard deviation of one.

While many algorithms (such as SVM, K-nearest neighbors, and logistic
regression) require features to be normalized, intuitively we can
think of Principal Component Analysis (PCA) as being a prime example
of when normalization is important. In PCA we are interested in the
components that maximize the variance. If one component (e.g. human
height) varies less than another (e.g. weight) because of their
respective scales (meters vs. kilos), PCA might determine that the
direction of maximal variance more closely corresponds with the
'weight' axis, if those features are not scaled. As a change in
height of one meter can be considered much more important than a
change in weight of one kilogram, this is clearly incorrect.

To illustrate this, PCA is performed comparing the use of data with
:class:`StandardScaler <sklearn.preprocessing.StandardScaler>` applied,
to unscaled data. The results are visualized and a clear difference noted.
In the unscaled set, the 1st principal component is dominated by
feature #13, whose loading is a whole two orders of magnitude above
those of the other features. This is contrasted with the principal
component for the scaled version of the data, where the orders of
magnitude are roughly the same across all the features.

The dataset used is the Wine Dataset available at UCI. This dataset
has continuous features that are heterogeneous in scale due to the
differing properties that they measure (e.g. alcohol content and
malic acid).

The transformed data is then used to train a naive Bayes classifier, and a
clear difference in prediction accuracies is observed, wherein the dataset
which is scaled before PCA vastly outperforms the unscaled version.
"""
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
print(__doc__)

# Code source: Tyler Lanigan <tylerlanigan@gmail.com>
#              Sebastian Raschka <mail@sebastianraschka.com>

# License: BSD 3 clause

RANDOM_STATE = 42
FIG_SIZE = (10, 7)


features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)

# Fit to data and predict using pipelined PCA and GNB.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, PCA and GNB.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies on scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

# Extract PCA from pipeline
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']

# Show first principal components
print('\nPC 1 without scaling:\n', pca.components_[0])
print('\nPC 1 with scaling:\n', pca_std.components_[0])

# Scale and use PCA on X_train data for visualization.
scaler = std_clf.named_steps['standardscaler']
X_train_std = pca_std.transform(scaler.transform(X_train))

# Visualize standardized vs. untouched dataset with PCA performed
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)


for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
    ax1.scatter(X_train[y_train == l, 0], X_train[y_train == l, 1],
                color=c,
                label='class %s' % l,
                alpha=0.5,
                marker=m
                )

for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
    ax2.scatter(X_train_std[y_train == l, 0], X_train_std[y_train == l, 1],
                color=c,
                label='class %s' % l,
                alpha=0.5,
                marker=m
                )

ax1.set_title('Training dataset after PCA')
ax2.set_title('Standardized training dataset after PCA')

for ax in (ax1, ax2):
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()

plt.show()
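As a quick check of the docstring's two-orders-of-magnitude claim about
feature #13, a minimal sketch (illustration only, not part of this PR's
diff) can inspect the unscaled first principal component directly:

# Sketch: compare loading magnitudes on PC 1 fitted to unscaled wine data.
import numpy as np
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA

X, _ = load_wine(return_X_y=True)
loadings = np.abs(PCA(n_components=2).fit(X).components_[0])
print('dominant feature index:', loadings.argmax())
print('max / median loading ratio: {:.0f}'.format(loadings.max() /
                                                  np.median(loadings)))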
@@ -242,6 +242,122 @@ def load_files(container_path, description=None, categories=None,
                 DESCR=description)

def load_data(module_path, data_file_name):
    """Loads data from module_path/data/data_file_name.

    Parameters
    ----------
    module_path : string
        The module path.

    data_file_name : string
        Name of csv file to be loaded from
        module_path/data/data_file_name. For example 'wine_data.csv'.

    Returns
    -------
    data : Numpy array
        A 2D array with each row representing one sample and each column
        representing the features of a given sample.

    target : Numpy array
        A 1D array holding target variables for all the samples in `data`.
        For example target[0] is the target variable for data[0].

    target_names : Numpy array
        A 1D array containing the names of the classifications. For example
        target_names[0] is the name of the target[0] class.
    """
    with open(join(module_path, 'data', data_file_name)) as csv_file:
        data_file = csv.reader(csv_file)
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        target_names = np.array(temp[2:])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,), dtype=np.int)

        for i, ir in enumerate(data_file):
            data[i] = np.asarray(ir[:-1], dtype=np.float64)
            target[i] = np.asarray(ir[-1], dtype=np.int)

    return data, target, target_names
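# Illustrative aside, not part of this diff: load_data relies on a header
# convention where the first csv row carries n_samples, n_features and the
# class names, and every following row ends with the integer-encoded target.
# A hypothetical first two lines for a wine-like dataset with 178 samples,
# 13 features and 3 classes:
#
#     178,13,class_0,class_1,class_2
#     14.23,1.71,2.43,...,0    <- 13 feature values, then the target label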
def load_wine(return_X_y=False):
    """Load and return the wine dataset (classification).

    .. versionadded:: 0.18

    The wine dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes                          3
    Samples per class       [59,71,48]
    Samples total                  178
    Dimensionality                  13
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    return_X_y : boolean, default=False.
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` objects.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'target_names', the meaning of the labels, 'feature_names', the
        meaning of the features, and 'DESCR', the full description of
        the dataset.

    (data, target) : tuple if ``return_X_y`` is True

    The copy of the UCI ML Wine Data Set is downloaded and modified to fit
    the standard format from:
    https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

Review comment: Have you modified the class target labels as mentioned in
the comments? If yes you may want to mention that.

Review comment: Needs a References section citing the original PARVUS.

Review comment: I'm also not sure whether we're obliged to cite UCI... Or
indeed whether we're licenced to copy the UCI data. Nothing at
https://archive.ics.uci.edu/ml/datasets/Wine suggests that we are.

Review comment: @jnothman The load_breast_cancer function is also from UCI...

    Examples
    --------
    Let's say you are interested in the samples 10, 80, and 140, and want to
    know their class name.

    >>> from sklearn.datasets import load_wine
    >>> data = load_wine()
    >>> data.target[[10, 80, 140]]
    array([0, 1, 2])
    >>> list(data.target_names)
    ['class_0', 'class_1', 'class_2']
    """
    module_path = dirname(__file__)
    data, target, target_names = load_data(module_path, 'wine_data.csv')

    with open(join(module_path, 'descr', 'wine_data.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return data, target

    return Bunch(data=data, target=target,
                 target_names=target_names,
                 DESCR=fdescr,
                 feature_names=['alcohol',
                                'malic_acid',
                                'ash',
                                'alcalinity_of_ash',
                                'magnesium',
                                'total_phenols',
                                'flavanoids',
                                'nonflavanoid_phenols',
                                'proanthocyanins',
                                'color_intensity',
                                'hue',
                                'od280/od315_of_diluted_wines',
                                'proline'])
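# Usage sketch (illustration only, not part of this diff): with
# return_X_y=True the loader returns the plain (data, target) tuple,
# so per the shape figures in the docstring above:
#
#     >>> from sklearn.datasets import load_wine
#     >>> X, y = load_wine(return_X_y=True)
#     >>> X.shape, y.shape
#     ((178, 13), (178,))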

def load_iris(return_X_y=False):
    """Load and return the iris dataset (classification).

@@ -292,18 +408,7 @@ def load_iris(return_X_y=False):
     ['setosa', 'versicolor', 'virginica']
     """
     module_path = dirname(__file__)
-    with open(join(module_path, 'data', 'iris.csv')) as csv_file:
-        data_file = csv.reader(csv_file)
-        temp = next(data_file)
-        n_samples = int(temp[0])
-        n_features = int(temp[1])
-        target_names = np.array(temp[2:])
-        data = np.empty((n_samples, n_features))
-        target = np.empty((n_samples,), dtype=np.int)
-
-        for i, ir in enumerate(data_file):
-            data[i] = np.asarray(ir[:-1], dtype=np.float64)
-            target[i] = np.asarray(ir[-1], dtype=np.int)
+    data, target, target_names = load_data(module_path, 'iris.csv')

     with open(join(module_path, 'descr', 'iris.rst')) as rst_file:
         fdescr = rst_file.read()
@@ -370,18 +475,7 @@ def load_breast_cancer(return_X_y=False):
     ['malignant', 'benign']
     """
     module_path = dirname(__file__)
-    with open(join(module_path, 'data', 'breast_cancer.csv')) as csv_file:
-        data_file = csv.reader(csv_file)
-        first_line = next(data_file)
-        n_samples = int(first_line[0])
-        n_features = int(first_line[1])
-        target_names = np.array(first_line[2:4])
-        data = np.empty((n_samples, n_features))
-        target = np.empty((n_samples,), dtype=np.int)
-
-        for count, value in enumerate(data_file):
-            data[count] = np.asarray(value[:-1], dtype=np.float64)
-            target[count] = np.asarray(value[-1], dtype=np.int)
+    data, target, target_names = load_data(module_path, 'breast_cancer.csv')

     with open(join(module_path, 'descr', 'breast_cancer.rst')) as rst_file:
         fdescr = rst_file.read()
@@ -517,12 +611,12 @@ def load_diabetes(return_X_y=False):

     (data, target) : tuple if ``return_X_y`` is True

-    .. versionadded:: 0.18
+        .. versionadded:: 0.18
     """
     base_dir = join(dirname(__file__), 'data')
     data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz'))
     target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz'))

     if return_X_y:
         return data, target
@@ -554,7 +648,7 @@ def load_linnerud(return_X_y=False):
     'targets', the two multivariate datasets, with 'data' corresponding to
     the exercise and 'targets' corresponding to the physiological
     measurements, as well as 'feature_names' and 'target_names'.

     (data, target) : tuple if ``return_X_y`` is True

     .. versionadded:: 0.18
@@ -608,7 +702,7 @@ def load_boston(return_X_y=False):

     (data, target) : tuple if ``return_X_y`` is True

-    .. versionadded:: 0.18
+        .. versionadded:: 0.18

     Examples
     --------
Review comment: Any reason why you did not use the np.genfromtxt utility
directly? I use it to load CSVs. I feel this entire block could just be a
call to np.genfromtxt (or something similar).

Review comment: Whether or not rightly, we've used the header line to encode
extra information.

Review comment: Don't know if that is a valid excuse. The target_names can
be hard-coded.
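For illustration, a minimal sketch of the reviewer's np.genfromtxt suggestion
(not code from this PR, and assuming the same header convention, with the
class names still read from line one):

# Sketch only: an np.genfromtxt-based alternative to load_data.
import numpy as np
from os.path import join

def load_data_genfromtxt(module_path, data_file_name):
    path = join(module_path, 'data', data_file_name)
    with open(path) as f:
        header = f.readline().strip().split(',')
    target_names = np.array(header[2:])          # class names from line one
    body = np.genfromtxt(path, delimiter=',', skip_header=1)
    data, target = body[:, :-1], body[:, -1].astype(int)
    return data, target, target_names

One design note: with this approach the sample and feature counts stored in
the header become redundant, since np.genfromtxt infers the array shape from
the file itself.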