From f10ec8ed8db5e12de003248a091dcacebf4cdbc3 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Thu, 17 Nov 2016 19:33:53 -0800 Subject: [PATCH 01/26] added plot_scaling_importance --- .../preprocessing/plot_scaling_importance.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 examples/preprocessing/plot_scaling_importance.py diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py new file mode 100644 index 0000000000000..602a002d66eb0 --- /dev/null +++ b/examples/preprocessing/plot_scaling_importance.py @@ -0,0 +1,144 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +========================================================= +Importance of Feature Scaling +========================================================= + +Features scaling though standardization (or Z-score normalization) +can be an importance preprocessing step for many machine learning +algorithms. Standardization involves rescaling the features such +that they’ll have the properties of a standard normal distribution +with a mean of zero and a standard deviation of one. + +While many algorithms (such as SVM, K-nearest neighbors and logistic +regression) require features to be normalized, intuitively we can +think of Principle Component Analysis (PCA) as being a prime example +of when normalization is important. In PCA we are interested in the +components that maximize the variance. If there exists components +(e.g human height) that vary less then other components (e.g human +weight) because of their respective scales (meters vs. kilos) it can +be seen how not scaling the features would cause PCA to determine that +the direction of maximal variance more closely corresponds with the +‘weight’ axis. As a change in height of one meter can be considered much +more important than the change in weight of one kilogram, it is easily +seen that this determination is incorrect. In the case of PCA, scaling +features using normalization is preferred over using min-max scaling as +the primary components are computed using the correlation matrix as opposed +to the covariance matrix. + +In order to illustrate this in an example, PCA will be performed on a dataset +which has been standardized using :class:`StandardScaler `, +and a copy which has remained untouched. The results with be visualized and +a clear difference noted. + +The results will then be used to train a naive Bayes classifier, and a clear +difference the prediction accuracies will be observed. + +""" +from __future__ import print_function +print(__doc__) + + +# Code source: Tyler Lanigan +# Sebastian Raschka + +# License: BSD 3 clause + +import pandas as pd +from sklearn.cross_validation import train_test_split +from sklearn import preprocessing +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler +from sklearn.naive_bayes import GaussianNB +from sklearn import metrics +import matplotlib.pyplot as plt + +# Contants +RAN_STATE = 42 +FIG_SIZE = (10, 7) + +# Read in Data +import pandas as pd + +df = pd.read_csv( + 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', + header = None + ) + +# Assign features and target. Target is a class label with 3 classes. 
+features = df.values[:, 1:] +target = df.values[:, 0] + +# Make a train/test split using 30% test size +X_train, X_test, y_train, y_test = train_test_split(features, target, + test_size=0.30, random_state=RAN_STATE) + +# Apply Scaling to X_train and X_test +std_scale = preprocessing.StandardScaler().fit(X_train) +X_train_std = std_scale.transform(X_train) +X_test_std = std_scale.transform(X_test) + +# Perform PCA on non-standardized data +pca = PCA(n_components=2).fit(X_train) +X_train = pca.transform(X_train) +X_test = pca.transform(X_test) + +# Perform PCA on standardized data +pca_std = PCA(n_components=2).fit(X_train_std) +X_train_std = pca_std.transform(X_train_std) +X_test_std = pca_std.transform(X_test_std) + +# Fit GaussianNB on standard and non-standardized data +clf = GaussianNB() +fit = clf.fit(X_train, y_train) +clf_std = GaussianNB() +fit_std = clf_std.fit(X_train_std, y_train) + +# Make predictions for standard and non standardized data. +pred_train = clf.predict(X_train) +pred_test = clf.predict(X_test) +pred_train_std = clf_std.predict(X_train_std) +pred_test_std = clf_std.predict(X_test_std) + +print('\nPrediction accuracy for the normal test dataset with PCA') +print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test))) + +print('\nPrediction accuracy for the standardized test dataset with PCA') +print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std))) + + +# visualize standardized vs. untouched dataset with PCA performed + +fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE) + + +for l, c, m in zip(range(1, 4), ('blue', 'red', 'green'), ('^', 's', 'o')): + ax1.scatter(X_train[y_train == l, 0], X_train[y_train == l, 1], + color=c, + label='class %s' % l, + alpha=0.5, + marker=m + ) + +for l, c, m in zip(range(1, 4), ('blue', 'red', 'green'), ('^', 's', 'o')): + ax2.scatter(X_train_std[y_train == l, 0], X_train_std[y_train == l, 1], + color=c, + label='class %s' % l, + alpha=0.5, + marker=m + ) + +ax1.set_title('Training dataset after PCA') +ax2.set_title('Standardized training dataset after PCA') + +for ax in (ax1, ax2): + + ax.set_xlabel('1st principal component') + ax.set_ylabel('2nd principal component') + ax.legend(loc='upper right') + ax.grid() +plt.tight_layout() + +plt.show() From 4d1e03fcaf8b404f011be9ee54be44efe5526c83 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 13:31:22 -0800 Subject: [PATCH 02/26] added wine_data.rst file --- sklearn/datasets/descr/wine_data.rst | 85 ++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 sklearn/datasets/descr/wine_data.rst diff --git a/sklearn/datasets/descr/wine_data.rst b/sklearn/datasets/descr/wine_data.rst new file mode 100644 index 0000000000000..de9f8a318af31 --- /dev/null +++ b/sklearn/datasets/descr/wine_data.rst @@ -0,0 +1,85 @@ +Wine Data Database +==================== + +Notes +----- +Data Set Characteristics: + :Number of Instances: 178 (50 in each of three classes) + :Number of Attributes: 13 numeric, predictive attributes and the class + :Attribute Information: + - 1) Alcohol + - 2) Malic acid + - 3) Ash + - 4) Alcalinity of ash + - 5) Magnesium + - 6) Total phenols + - 7) Flavanoids + - 8) Nonflavanoid phenols + - 9) Proanthocyanins + - 10)Color intensity + - 11)Hue + - 12)OD280/OD315 of diluted wines + - 13)Proline + - class: + - class_0 + - class_1 + - class_2 + :Summary Statistics: + + ============== ==== ==== ======= ===== ==================== + Min Max Mean SD + ============== ==== ==== ======= ===== ==================== + 
Alcohol: 11.0 14.8 13.0 0.811 + Malic Acid: 0.74 5.80 2.34 1.12 + Ash: 1.36 3.23 2.36 0.274 + Alcalinity of Ash: 10.6 30.0 19.5 3.34 + Magnesium: 70.0 162.0 99.7 14.28 + Total Phenols: 0.980 3.88 2.29 0.626 + Flavanoids: 0.340 5.08 2.03 .999 + Nonflavanoid Phenols: 0.130 0.66 0.362 0.124 + Proanthocyanins: 0.410 3.580 1.590 0.572 + Colour Intensity: 1.28 13.0 5.06 2.318 + Hue: 0.480 1.71 0.957 0.226 + OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.710 + Proline: 278 1680 746 315 + ============== ==== ==== ======= ===== ==================== + + :Missing Attribute Values: None + :Class Distribution: class_0 (59), class_1 (71), class_2 (48) + :Creator: R.A. Fisher + :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) + :Date: July, 1988 + +This is a copy of UCI ML Wine recognition datasets. +https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data + +These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines. + +Original Owners: + +Forina, M. et al, PARVUS - +An Extendible Package for Data Exploration, Classification and Correlation. +Institute of Pharmaceutical and Food Analysis and Technologies, Via Brigata Salerno, +16147 Genoa, Italy. + +Relevant Papers +---------- +(1) +S. Aeberhard, D. Coomans and O. de Vel, +Comparison of Classifiers in High Dimensional Settings, +Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of +Mathematics and Statistics, James Cook University of North Queensland. +(Also submitted to Technometrics). + +The data was used with many others for comparing various +classifiers. The classes are separable, though only RDA +has achieved 100% correct classification. +(RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) +(All results using the leave-one-out technique) + +(2) +S. Aeberhard, D. Coomans and O. de Vel, +"THE CLASSIFICATION PERFORMANCE OF RDA" +Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of +Mathematics and Statistics, James Cook University of North Queensland. +(Also submitted to Journal of Chemometrics). From b4b9913d78c4dd7b10b0587b73d0879c9d8a90a9 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 13:36:13 -0800 Subject: [PATCH 03/26] adjusted smal for pep8 --- examples/preprocessing/plot_scaling_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index 602a002d66eb0..625f0fa7e0ce2 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -64,7 +64,7 @@ df = pd.read_csv( 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', - header = None + header=None ) # Assign features and target. Target is a class label with 3 classes. 
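The docstring introduced in patch 01 argues that unscaled PCA latches onto whichever feature has the largest numeric scale. A minimal standalone sketch of that claim (separate from the patches themselves; the height/weight numbers are invented for illustration, and standardization here means z = (x - mean) / std):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(42)
height_m = rng.normal(1.70, 0.10, 100)    # spread ~0.1 (meters)
weight_kg = rng.normal(70.0, 10.0, 100)   # spread ~10 (kilograms)
X = np.column_stack([height_m, weight_kg])

# Without scaling, the first component points almost entirely along
# the large-scale 'weight' axis, since var(weight) >> var(height).
print(PCA(n_components=1).fit(X).components_[0])

# After standardization both features contribute comparably.
X_std = StandardScaler().fit_transform(X)
print(PCA(n_components=1).fit(X_std).components_[0])

With standardized columns the sample covariance matrix has a unit diagonal, so no feature can dominate the leading component purely through its units.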
From 5482907fa2deccb19e5d74d6bd6e8e32263dc3f7 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 13:42:33 -0800 Subject: [PATCH 04/26] added load_wine() to __init__.py --- sklearn/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index c38e99acd3d5b..53fbbeed1cbc5 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -7,6 +7,7 @@ from .base import load_diabetes from .base import load_digits from .base import load_files +from .base import load_wine from .base import load_iris from .base import load_breast_cancer from .base import load_linnerud @@ -66,6 +67,7 @@ 'fetch_rcv1', 'fetch_kddcup99', 'get_data_home', + 'load_wine', 'load_boston', 'load_diabetes', 'load_digits', From 7663f5766a6cae5b4beff43298be39d581323bae Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 13:43:02 -0800 Subject: [PATCH 05/26] added load_wine to base.py --- sklearn/datasets/base.py | 95 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index b83f9d4985e46..8d6a36d0d9d6c 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -242,6 +242,101 @@ def load_files(container_path, description=None, categories=None, DESCR=description) +def load_wine(return_X_y=False): + """Load and return the wine dataset (classification). + + The wine dataset is a classic and very easy multi-class classification + dataset. + + ================= ============== + Classes 3 + Samples per class [59,71,48] + Samples total 178 + Dimensionality 13 + Features real, positive + ================= ============== + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : boolean, default=False. + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + Returns + ------- + data : Bunch + Dictionary-like object, the interesting attributes are: + 'data', the data to learn, 'target', the classification labels, + 'target_names', the meaning of the labels, 'feature_names', the + meaning of the features, and 'DESCR', the + full description of the dataset. + + (data, target) : tuple if ``return_X_y`` is True + + .. versionadded:: 0.18 + + The copy of UCI ML Wine Data Set dataset is + downloaded from: + https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data + + Examples + -------- + Let's say you are interested in the samples 10, 25, and 50, and want to + know their class name. 
+ + >>> from sklearn.datasets import load_wine + >>> data = load_wine() + >>> data.target[[10, 25, 50]] + array([0, 0, 1]) + >>> list(data.target_names) + ['class_0', 'class_1', 'class_2'] + """ + module_path = dirname(__file__) + with open(join(module_path, 'data', 'wine_data.csv')) as csv_file: + data_file = csv.reader(csv_file) + temp = next(data_file) + n_samples = int(temp[0]) + n_features = int(temp[1]) + target_names = np.array(temp[2:]) + data = np.empty((n_samples, n_features)) + target = np.empty((n_samples,), dtype=np.int) + + for i, ir in enumerate(data_file): + data[i] = np.asarray(ir[:-1], dtype=np.float64) + target[i] = np.asarray(ir[-1], dtype=np.int) + + with open(join(module_path, 'descr', 'wine_data.rst')) as rst_file: + fdescr = rst_file.read() + + if return_X_y: + return data, target + + return Bunch(data=data, target=target, + target_names=target_names, + DESCR=fdescr, + feature_names=['alcohol', + 'malic_acid', + 'ash', + 'alcalinity_of_ash', + 'magnesium', + 'total_phenols', + 'flavanoids', + 'nonflavanoid_phenols', + 'proanthocyanins', + 'color_intensity', + 'hue', + 'od280/od315_of_diluted_wines', + 'proline']) + + + + + + def load_iris(return_X_y=False): """Load and return the iris dataset (classification). From d135c5697261eb1591a4bf86047c23746846d512 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 14:13:11 -0800 Subject: [PATCH 06/26] fixed wine_data to have int for class and to pass docstring test --- sklearn/datasets/data/wine_data.csv | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 sklearn/datasets/data/wine_data.csv diff --git a/sklearn/datasets/data/wine_data.csv b/sklearn/datasets/data/wine_data.csv new file mode 100644 index 0000000000000..6c7fe81952aa6 --- /dev/null +++ b/sklearn/datasets/data/wine_data.csv @@ -0,0 +1,179 @@ +178,13,class_0,class_1,class_2 +14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0 +13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0 +13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0 +14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,0 +13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,0 +14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450,0 +14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290,0 +14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295,0 +14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045,0 +13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045,0 +14.1,2.16,2.3,18,105,2.95,3.32,0.22,2.38,5.75,1.25,3.17,1510,0 +14.12,1.48,2.32,16.8,95,2.2,2.43,0.26,1.57,5,1.17,2.82,1280,0 +13.75,1.73,2.41,16,89,2.6,2.76,0.29,1.81,5.6,1.15,2.9,1320,0 +14.75,1.73,2.39,11.4,91,3.1,3.69,0.43,2.81,5.4,1.25,2.73,1150,0 +14.38,1.87,2.38,12,102,3.3,3.64,0.29,2.96,7.5,1.2,3,1547,0 +13.63,1.81,2.7,17.2,112,2.85,2.91,0.3,1.46,7.3,1.28,2.88,1310,0 +14.3,1.92,2.72,20,120,2.8,3.14,0.33,1.97,6.2,1.07,2.65,1280,0 +13.83,1.57,2.62,20,115,2.95,3.4,0.4,1.72,6.6,1.13,2.57,1130,0 +14.19,1.59,2.48,16.5,108,3.3,3.93,0.32,1.86,8.7,1.23,2.82,1680,0 +13.64,3.1,2.56,15.2,116,2.7,3.03,0.17,1.66,5.1,0.96,3.36,845,0 +14.06,1.63,2.28,16,126,3,3.17,0.24,2.1,5.65,1.09,3.71,780,0 +12.93,3.8,2.65,18.6,102,2.41,2.41,0.25,1.98,4.5,1.03,3.52,770,0 +13.71,1.86,2.36,16.6,101,2.61,2.88,0.27,1.69,3.8,1.11,4,1035,0 +12.85,1.6,2.52,17.8,95,2.48,2.37,0.26,1.46,3.93,1.09,3.63,1015,0 +13.5,1.81,2.61,20,96,2.53,2.61,0.28,1.66,3.52,1.12,3.82,845,0 
+13.05,2.05,3.22,25,124,2.63,2.68,0.47,1.92,3.58,1.13,3.2,830,0 +13.39,1.77,2.62,16.1,93,2.85,2.94,0.34,1.45,4.8,0.92,3.22,1195,0 +13.3,1.72,2.14,17,94,2.4,2.19,0.27,1.35,3.95,1.02,2.77,1285,0 +13.87,1.9,2.8,19.4,107,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915,0 +14.02,1.68,2.21,16,96,2.65,2.33,0.26,1.98,4.7,1.04,3.59,1035,0 +13.73,1.5,2.7,22.5,101,3,3.25,0.29,2.38,5.7,1.19,2.71,1285,0 +13.58,1.66,2.36,19.1,106,2.86,3.19,0.22,1.95,6.9,1.09,2.88,1515,0 +13.68,1.83,2.36,17.2,104,2.42,2.69,0.42,1.97,3.84,1.23,2.87,990,0 +13.76,1.53,2.7,19.5,132,2.95,2.74,0.5,1.35,5.4,1.25,3,1235,0 +13.51,1.8,2.65,19,110,2.35,2.53,0.29,1.54,4.2,1.1,2.87,1095,0 +13.48,1.81,2.41,20.5,100,2.7,2.98,0.26,1.86,5.1,1.04,3.47,920,0 +13.28,1.64,2.84,15.5,110,2.6,2.68,0.34,1.36,4.6,1.09,2.78,880,0 +13.05,1.65,2.55,18,98,2.45,2.43,0.29,1.44,4.25,1.12,2.51,1105,0 +13.07,1.5,2.1,15.5,98,2.4,2.64,0.28,1.37,3.7,1.18,2.69,1020,0 +14.22,3.99,2.51,13.2,128,3,3.04,0.2,2.08,5.1,0.89,3.53,760,0 +13.56,1.71,2.31,16.2,117,3.15,3.29,0.34,2.34,6.13,0.95,3.38,795,0 +13.41,3.84,2.12,18.8,90,2.45,2.68,0.27,1.48,4.28,0.91,3,1035,0 +13.88,1.89,2.59,15,101,3.25,3.56,0.17,1.7,5.43,0.88,3.56,1095,0 +13.24,3.98,2.29,17.5,103,2.64,2.63,0.32,1.66,4.36,0.82,3,680,0 +13.05,1.77,2.1,17,107,3,3,0.28,2.03,5.04,0.88,3.35,885,0 +14.21,4.04,2.44,18.9,111,2.85,2.65,0.3,1.25,5.24,0.87,3.33,1080,0 +14.38,3.59,2.28,16,102,3.25,3.17,0.27,2.19,4.9,1.04,3.44,1065,0 +13.9,1.68,2.12,16,101,3.1,3.39,0.21,2.14,6.1,0.91,3.33,985,0 +14.1,2.02,2.4,18.8,103,2.75,2.92,0.32,2.38,6.2,1.07,2.75,1060,0 +13.94,1.73,2.27,17.4,108,2.88,3.54,0.32,2.08,8.9,1.12,3.1,1260,0 +13.05,1.73,2.04,12.4,92,2.72,3.27,0.17,2.91,7.2,1.12,2.91,1150,0 +13.83,1.65,2.6,17.2,94,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265,0 +13.82,1.75,2.42,14,111,3.88,3.74,0.32,1.87,7.05,1.01,3.26,1190,0 +13.77,1.9,2.68,17.1,115,3,2.79,0.39,1.68,6.3,1.13,2.93,1375,0 +13.74,1.67,2.25,16.4,118,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060,0 +13.56,1.73,2.46,20.5,116,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120,0 +14.22,1.7,2.3,16.3,118,3.2,3,0.26,2.03,6.38,0.94,3.31,970,0 +13.29,1.97,2.68,16.8,102,3,3.23,0.31,1.66,6,1.07,2.84,1270,0 +13.72,1.43,2.5,16.7,108,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285,0 +12.37,0.94,1.36,10.6,88,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520,1 +12.33,1.1,2.28,16,101,2.05,1.09,0.63,0.41,3.27,1.25,1.67,680,1 +12.64,1.36,2.02,16.8,100,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450,1 +13.67,1.25,1.92,18,94,2.1,1.79,0.32,0.73,3.8,1.23,2.46,630,1 +12.37,1.13,2.16,19,87,3.5,3.1,0.19,1.87,4.45,1.22,2.87,420,1 +12.17,1.45,2.53,19,104,1.89,1.75,0.45,1.03,2.95,1.45,2.23,355,1 +12.37,1.21,2.56,18.1,98,2.42,2.65,0.37,2.08,4.6,1.19,2.3,678,1 +13.11,1.01,1.7,15,78,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502,1 +12.37,1.17,1.92,19.6,78,2.11,2,0.27,1.04,4.68,1.12,3.48,510,1 +13.34,0.94,2.36,17,110,2.53,1.3,0.55,0.42,3.17,1.02,1.93,750,1 +12.21,1.19,1.75,16.8,151,1.85,1.28,0.14,2.5,2.85,1.28,3.07,718,1 +12.29,1.61,2.21,20.4,103,1.1,1.02,0.37,1.46,3.05,0.906,1.82,870,1 +13.86,1.51,2.67,25,86,2.95,2.86,0.21,1.87,3.38,1.36,3.16,410,1 +13.49,1.66,2.24,24,87,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472,1 +12.99,1.67,2.6,30,139,3.3,2.89,0.21,1.96,3.35,1.31,3.5,985,1 +11.96,1.09,2.3,21,101,3.38,2.14,0.13,1.65,3.21,0.99,3.13,886,1 +11.66,1.88,1.92,16,97,1.61,1.57,0.34,1.15,3.8,1.23,2.14,428,1 +13.03,0.9,1.71,16,86,1.95,2.03,0.24,1.46,4.6,1.19,2.48,392,1 +11.84,2.89,2.23,18,112,1.72,1.32,0.43,0.95,2.65,0.96,2.52,500,1 +12.33,0.99,1.95,14.8,136,1.9,1.85,0.35,2.76,3.4,1.06,2.31,750,1 +12.7,3.87,2.4,23,101,2.83,2.55,0.43,1.95,2.57,1.19,3.13,463,1 
+12,0.92,2,19,86,2.42,2.26,0.3,1.43,2.5,1.38,3.12,278,1 +12.72,1.81,2.2,18.8,86,2.2,2.53,0.26,1.77,3.9,1.16,3.14,714,1 +12.08,1.13,2.51,24,78,2,1.58,0.4,1.4,2.2,1.31,2.72,630,1 +13.05,3.86,2.32,22.5,85,1.65,1.59,0.61,1.62,4.8,0.84,2.01,515,1 +11.84,0.89,2.58,18,94,2.2,2.21,0.22,2.35,3.05,0.79,3.08,520,1 +12.67,0.98,2.24,18,99,2.2,1.94,0.3,1.46,2.62,1.23,3.16,450,1 +12.16,1.61,2.31,22.8,90,1.78,1.69,0.43,1.56,2.45,1.33,2.26,495,1 +11.65,1.67,2.62,26,88,1.92,1.61,0.4,1.34,2.6,1.36,3.21,562,1 +11.64,2.06,2.46,21.6,84,1.95,1.69,0.48,1.35,2.8,1,2.75,680,1 +12.08,1.33,2.3,23.6,70,2.2,1.59,0.42,1.38,1.74,1.07,3.21,625,1 +12.08,1.83,2.32,18.5,81,1.6,1.5,0.52,1.64,2.4,1.08,2.27,480,1 +12,1.51,2.42,22,86,1.45,1.25,0.5,1.63,3.6,1.05,2.65,450,1 +12.69,1.53,2.26,20.7,80,1.38,1.46,0.58,1.62,3.05,0.96,2.06,495,1 +12.29,2.83,2.22,18,88,2.45,2.25,0.25,1.99,2.15,1.15,3.3,290,1 +11.62,1.99,2.28,18,98,3.02,2.26,0.17,1.35,3.25,1.16,2.96,345,1 +12.47,1.52,2.2,19,162,2.5,2.27,0.32,3.28,2.6,1.16,2.63,937,1 +11.81,2.12,2.74,21.5,134,1.6,0.99,0.14,1.56,2.5,0.95,2.26,625,1 +12.29,1.41,1.98,16,85,2.55,2.5,0.29,1.77,2.9,1.23,2.74,428,1 +12.37,1.07,2.1,18.5,88,3.52,3.75,0.24,1.95,4.5,1.04,2.77,660,1 +12.29,3.17,2.21,18,88,2.85,2.99,0.45,2.81,2.3,1.42,2.83,406,1 +12.08,2.08,1.7,17.5,97,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710,1 +12.6,1.34,1.9,18.5,88,1.45,1.36,0.29,1.35,2.45,1.04,2.77,562,1 +12.34,2.45,2.46,21,98,2.56,2.11,0.34,1.31,2.8,0.8,3.38,438,1 +11.82,1.72,1.88,19.5,86,2.5,1.64,0.37,1.42,2.06,0.94,2.44,415,1 +12.51,1.73,1.98,20.5,85,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672,1 +12.42,2.55,2.27,22,90,1.68,1.84,0.66,1.42,2.7,0.86,3.3,315,1 +12.25,1.73,2.12,19,80,1.65,2.03,0.37,1.63,3.4,1,3.17,510,1 +12.72,1.75,2.28,22.5,84,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488,1 +12.22,1.29,1.94,19,92,2.36,2.04,0.39,2.08,2.7,0.86,3.02,312,1 +11.61,1.35,2.7,20,94,2.74,2.92,0.29,2.49,2.65,0.96,3.26,680,1 +11.46,3.74,1.82,19.5,107,3.18,2.58,0.24,3.58,2.9,0.75,2.81,562,1 +12.52,2.43,2.17,21,88,2.55,2.27,0.26,1.22,2,0.9,2.78,325,1 +11.76,2.68,2.92,20,103,1.75,2.03,0.6,1.05,3.8,1.23,2.5,607,1 +11.41,0.74,2.5,21,88,2.48,2.01,0.42,1.44,3.08,1.1,2.31,434,1 +12.08,1.39,2.5,22.5,84,2.56,2.29,0.43,1.04,2.9,0.93,3.19,385,1 +11.03,1.51,2.2,21.5,85,2.46,2.17,0.52,2.01,1.9,1.71,2.87,407,1 +11.82,1.47,1.99,20.8,86,1.98,1.6,0.3,1.53,1.95,0.95,3.33,495,1 +12.42,1.61,2.19,22.5,108,2,2.09,0.34,1.61,2.06,1.06,2.96,345,1 +12.77,3.43,1.98,16,80,1.63,1.25,0.43,0.83,3.4,0.7,2.12,372,1 +12,3.43,2,19,87,2,1.64,0.37,1.87,1.28,0.93,3.05,564,1 +11.45,2.4,2.42,20,96,2.9,2.79,0.32,1.83,3.25,0.8,3.39,625,1 +11.56,2.05,3.23,28.5,119,3.18,5.08,0.47,1.87,6,0.93,3.69,465,1 +12.42,4.43,2.73,26.5,102,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365,1 +13.05,5.8,2.13,21.5,86,2.62,2.65,0.3,2.01,2.6,0.73,3.1,380,1 +11.87,4.31,2.39,21,82,2.86,3.03,0.21,2.91,2.8,0.75,3.64,380,1 +12.07,2.16,2.17,21,85,2.6,2.65,0.37,1.35,2.76,0.86,3.28,378,1 +12.43,1.53,2.29,21.5,86,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352,1 +11.79,2.13,2.78,28.5,92,2.13,2.24,0.58,1.76,3,0.97,2.44,466,1 +12.37,1.63,2.3,24.5,88,2.22,2.45,0.4,1.9,2.12,0.89,2.78,342,1 +12.04,4.3,2.38,22,80,2.1,1.75,0.42,1.35,2.6,0.79,2.57,580,1 +12.86,1.35,2.32,18,122,1.51,1.25,0.21,0.94,4.1,0.76,1.29,630,2 +12.88,2.99,2.4,20,104,1.3,1.22,0.24,0.83,5.4,0.74,1.42,530,2 +12.81,2.31,2.4,24,98,1.15,1.09,0.27,0.83,5.7,0.66,1.36,560,2 +12.7,3.55,2.36,21.5,106,1.7,1.2,0.17,0.84,5,0.78,1.29,600,2 +12.51,1.24,2.25,17.5,85,2,0.58,0.6,1.25,5.45,0.75,1.51,650,2 +12.6,2.46,2.2,18.5,94,1.62,0.66,0.63,0.94,7.1,0.73,1.58,695,2 
+12.25,4.72,2.54,21,89,1.38,0.47,0.53,0.8,3.85,0.75,1.27,720,2 +12.53,5.51,2.64,25,96,1.79,0.6,0.63,1.1,5,0.82,1.69,515,2 +13.49,3.59,2.19,19.5,88,1.62,0.48,0.58,0.88,5.7,0.81,1.82,580,2 +12.84,2.96,2.61,24,101,2.32,0.6,0.53,0.81,4.92,0.89,2.15,590,2 +12.93,2.81,2.7,21,96,1.54,0.5,0.53,0.75,4.6,0.77,2.31,600,2 +13.36,2.56,2.35,20,89,1.4,0.5,0.37,0.64,5.6,0.7,2.47,780,2 +13.52,3.17,2.72,23.5,97,1.55,0.52,0.5,0.55,4.35,0.89,2.06,520,2 +13.62,4.95,2.35,20,92,2,0.8,0.47,1.02,4.4,0.91,2.05,550,2 +12.25,3.88,2.2,18.5,112,1.38,0.78,0.29,1.14,8.21,0.65,2,855,2 +13.16,3.57,2.15,21,102,1.5,0.55,0.43,1.3,4,0.6,1.68,830,2 +13.88,5.04,2.23,20,80,0.98,0.34,0.4,0.68,4.9,0.58,1.33,415,2 +12.87,4.61,2.48,21.5,86,1.7,0.65,0.47,0.86,7.65,0.54,1.86,625,2 +13.32,3.24,2.38,21.5,92,1.93,0.76,0.45,1.25,8.42,0.55,1.62,650,2 +13.08,3.9,2.36,21.5,113,1.41,1.39,0.34,1.14,9.4,0.57,1.33,550,2 +13.5,3.12,2.62,24,123,1.4,1.57,0.22,1.25,8.6,0.59,1.3,500,2 +12.79,2.67,2.48,22,112,1.48,1.36,0.24,1.26,10.8,0.48,1.47,480,2 +13.11,1.9,2.75,25.5,116,2.2,1.28,0.26,1.56,7.1,0.61,1.33,425,2 +13.23,3.3,2.28,18.5,98,1.8,0.83,0.61,1.87,10.52,0.56,1.51,675,2 +12.58,1.29,2.1,20,103,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640,2 +13.17,5.19,2.32,22,93,1.74,0.63,0.61,1.55,7.9,0.6,1.48,725,2 +13.84,4.12,2.38,19.5,89,1.8,0.83,0.48,1.56,9.01,0.57,1.64,480,2 +12.45,3.03,2.64,27,97,1.9,0.58,0.63,1.14,7.5,0.67,1.73,880,2 +14.34,1.68,2.7,25,98,2.8,1.31,0.53,2.7,13,0.57,1.96,660,2 +13.48,1.67,2.64,22.5,89,2.6,1.1,0.52,2.29,11.75,0.57,1.78,620,2 +12.36,3.83,2.38,21,88,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520,2 +13.69,3.26,2.54,20,107,1.83,0.56,0.5,0.8,5.88,0.96,1.82,680,2 +12.85,3.27,2.58,22,106,1.65,0.6,0.6,0.96,5.58,0.87,2.11,570,2 +12.96,3.45,2.35,18.5,106,1.39,0.7,0.4,0.94,5.28,0.68,1.75,675,2 +13.78,2.76,2.3,22,90,1.35,0.68,0.41,1.03,9.58,0.7,1.68,615,2 +13.73,4.36,2.26,22.5,88,1.28,0.47,0.52,1.15,6.62,0.78,1.75,520,2 +13.45,3.7,2.6,23,111,1.7,0.92,0.43,1.46,10.68,0.85,1.56,695,2 +12.82,3.37,2.3,19.5,88,1.48,0.66,0.4,0.97,10.26,0.72,1.75,685,2 +13.58,2.58,2.69,24.5,105,1.55,0.84,0.39,1.54,8.66,0.74,1.8,750,2 +13.4,4.6,2.86,25,112,1.98,0.96,0.27,1.11,8.5,0.67,1.92,630,2 +12.2,3.03,2.32,19,96,1.25,0.49,0.4,0.73,5.5,0.66,1.83,510,2 +12.77,2.39,2.28,19.5,86,1.39,0.51,0.48,0.64,9.899999,0.57,1.63,470,2 +14.16,2.51,2.48,20,91,1.68,0.7,0.44,1.24,9.7,0.62,1.71,660,2 +13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740,2 +13.4,3.91,2.48,23,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750,2 +13.27,4.28,2.26,20,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835,2 +13.17,2.59,2.37,20,120,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840,2 +14.13,4.1,2.74,24.5,96,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560,2 From e0142c277d7b6d1a78aae3259fdcaa658235f639 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 14:19:39 -0800 Subject: [PATCH 07/26] fix base.py doc string error --- sklearn/datasets/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 8d6a36d0d9d6c..cca2763a77f05 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -290,8 +290,8 @@ def load_wine(return_X_y=False): >>> from sklearn.datasets import load_wine >>> data = load_wine() - >>> data.target[[10, 25, 50]] - array([0, 0, 1]) + >>> data.target[[10, 80, 140]] + array([0, 1, 2]) >>> list(data.target_names) ['class_0', 'class_1', 'class_2'] """ From 0ff40b9983d5b2ffa6ef8ea95966b2462dae810d Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 14:43:32 -0800 Subject: [PATCH 08/26] added load_wine() test to 
test_base.py --- sklearn/datasets/tests/test_base.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 92fe96fa10656..b86c09a7b9fdd 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -17,6 +17,7 @@ from sklearn.datasets import load_iris from sklearn.datasets import load_breast_cancer from sklearn.datasets import load_boston +from sklearn.datasets import load_wine from sklearn.datasets.base import Bunch from sklearn.externals.six import b, u @@ -209,6 +210,20 @@ def test_load_iris(): assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) +def test_load_wine(): + res = load_iris() + assert_equal(res.data.shape, (178, 13)) + assert_equal(res.target.size, 178) + assert_equal(res.target_names.size, 3) + assert_true(res.DESCR) + + # test return_X_y option + X_y_tuple = load_iris(return_X_y=True) + bunch = load_iris() + assert_true(isinstance(X_y_tuple, tuple)) + assert_array_equal(X_y_tuple[0], bunch.data) + assert_array_equal(X_y_tuple[1], bunch.target) + def test_load_breast_cancer(): res = load_breast_cancer() From b0c659187cff7d816e0d0a40d3221b928dbebfad Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 14:55:28 -0800 Subject: [PATCH 09/26] changed plot_scaling_importance.py to use load_wine() function --- .../preprocessing/plot_scaling_importance.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index 625f0fa7e0ce2..460df2c2ce9e7 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -46,7 +46,6 @@ # License: BSD 3 clause -import pandas as pd from sklearn.cross_validation import train_test_split from sklearn import preprocessing from sklearn.decomposition import PCA @@ -54,22 +53,14 @@ from sklearn.naive_bayes import GaussianNB from sklearn import metrics import matplotlib.pyplot as plt +from sklearn.datasets import load_wine # Contants RAN_STATE = 42 FIG_SIZE = (10, 7) -# Read in Data -import pandas as pd -df = pd.read_csv( - 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', - header=None - ) - -# Assign features and target. Target is a class label with 3 classes. 
-features = df.values[:, 1:] -target = df.values[:, 0] +features, target = load_wine(return_X_y=True) # Make a train/test split using 30% test size X_train, X_test, y_train, y_test = train_test_split(features, target, @@ -114,7 +105,7 @@ fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE) -for l, c, m in zip(range(1, 4), ('blue', 'red', 'green'), ('^', 's', 'o')): +for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')): ax1.scatter(X_train[y_train == l, 0], X_train[y_train == l, 1], color=c, label='class %s' % l, @@ -122,7 +113,7 @@ marker=m ) -for l, c, m in zip(range(1, 4), ('blue', 'red', 'green'), ('^', 's', 'o')): +for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')): ax2.scatter(X_train_std[y_train == l, 0], X_train_std[y_train == l, 1], color=c, label='class %s' % l, From 4ab096c8c54748a20615dc9c5ed46d1292296355 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sat, 19 Nov 2016 18:50:45 -0800 Subject: [PATCH 10/26] fixed test_base.py --- sklearn/datasets/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index b86c09a7b9fdd..3992101d7587f 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -211,7 +211,7 @@ def test_load_iris(): assert_array_equal(X_y_tuple[1], bunch.target) def test_load_wine(): - res = load_iris() + res = load_wine() assert_equal(res.data.shape, (178, 13)) assert_equal(res.target.size, 178) assert_equal(res.target_names.size, 3) From b615b9a36da7ff1533be14aead97bee438684ef7 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sun, 20 Nov 2016 18:34:46 -0800 Subject: [PATCH 11/26] fixed flake8 issues --- .../preprocessing/plot_scaling_importance.py | 25 +++++++++---------- sklearn/datasets/base.py | 2 +- sklearn/datasets/tests/test_base.py | 1 + 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index 460df2c2ce9e7..bc1e536e14a3d 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -1,6 +1,13 @@ #!/usr/bin/python # -*- coding: utf-8 -*- +from sklearn.cross_validation import train_test_split +from sklearn import preprocessing +from sklearn.decomposition import PCA +from sklearn.naive_bayes import GaussianNB +from sklearn import metrics +import matplotlib.pyplot as plt +from sklearn.datasets import load_wine """ ========================================================= Importance of Feature Scaling @@ -29,15 +36,14 @@ to the covariance matrix. In order to illustrate this in an example, PCA will be performed on a dataset -which has been standardized using :class:`StandardScaler `, -and a copy which has remained untouched. The results with be visualized and -a clear difference noted. +which has been standardized using StandardScalerand a copy which has remained +untouched. The results with be visualized and a clear difference noted. The results will then be used to train a naive Bayes classifier, and a clear difference the prediction accuracies will be observed. 
""" -from __future__ import print_function + print(__doc__) @@ -46,14 +52,6 @@ # License: BSD 3 clause -from sklearn.cross_validation import train_test_split -from sklearn import preprocessing -from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler -from sklearn.naive_bayes import GaussianNB -from sklearn import metrics -import matplotlib.pyplot as plt -from sklearn.datasets import load_wine # Contants RAN_STATE = 42 @@ -64,7 +62,8 @@ # Make a train/test split using 30% test size X_train, X_test, y_train, y_test = train_test_split(features, target, - test_size=0.30, random_state=RAN_STATE) + test_size=0.30, + random_state=RAN_STATE) # Apply Scaling to X_train and X_test std_scale = preprocessing.StandardScaler().fit(X_train) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index cca2763a77f05..cac5c9cc3fb60 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -318,7 +318,7 @@ def load_wine(return_X_y=False): return Bunch(data=data, target=target, target_names=target_names, DESCR=fdescr, - feature_names=['alcohol', + feature_names=['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 3992101d7587f..fade1c49d0b56 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -210,6 +210,7 @@ def test_load_iris(): assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) + def test_load_wine(): res = load_wine() assert_equal(res.data.shape, (178, 13)) From 3f4ddaed79423768f6d9d96e0bcecef2eb578972 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Sun, 20 Nov 2016 18:54:20 -0800 Subject: [PATCH 12/26] moved import below docstring --- .../preprocessing/plot_scaling_importance.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index bc1e536e14a3d..082669121b7fe 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -1,13 +1,5 @@ #!/usr/bin/python # -*- coding: utf-8 -*- - -from sklearn.cross_validation import train_test_split -from sklearn import preprocessing -from sklearn.decomposition import PCA -from sklearn.naive_bayes import GaussianNB -from sklearn import metrics -import matplotlib.pyplot as plt -from sklearn.datasets import load_wine """ ========================================================= Importance of Feature Scaling @@ -43,9 +35,17 @@ difference the prediction accuracies will be observed. 
""" - +from __future__ import print_function print(__doc__) +from sklearn.cross_validation import train_test_split +from sklearn import preprocessing +from sklearn.decomposition import PCA +from sklearn.naive_bayes import GaussianNB +from sklearn import metrics +import matplotlib.pyplot as plt +from sklearn.datasets import load_wine + # Code source: Tyler Lanigan # Sebastian Raschka From 2b2b45bafcfcb09eeb042c77fa9b2cdd033d973a Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Mon, 28 Nov 2016 13:07:05 -0800 Subject: [PATCH 13/26] moved print(__doc__) below import statements --- examples/preprocessing/plot_scaling_importance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index 082669121b7fe..a2b4bc077bb2c 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -36,8 +36,6 @@ """ from __future__ import print_function -print(__doc__) - from sklearn.cross_validation import train_test_split from sklearn import preprocessing from sklearn.decomposition import PCA @@ -45,7 +43,7 @@ from sklearn import metrics import matplotlib.pyplot as plt from sklearn.datasets import load_wine - +print(__doc__) # Code source: Tyler Lanigan # Sebastian Raschka From 36e35516680b4238c18705d82414d3ab7194b22c Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Mon, 28 Nov 2016 13:45:42 -0800 Subject: [PATCH 14/26] corrected spelling, ordering, and switched to using model_selection instead of cross_validation as per first pass comments from @lesteve --- .../preprocessing/plot_scaling_importance.py | 16 +++++++--------- sklearn/datasets/__init__.py | 13 ++++++------- sklearn/datasets/base.py | 13 ++++++++++--- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index a2b4bc077bb2c..a87836eb7290f 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -6,7 +6,7 @@ ========================================================= Features scaling though standardization (or Z-score normalization) -can be an importance preprocessing step for many machine learning +can be an important preprocessing step for many machine learning algorithms. Standardization involves rescaling the features such that they’ll have the properties of a standard normal distribution with a mean of zero and a standard deviation of one. @@ -20,7 +20,7 @@ weight) because of their respective scales (meters vs. kilos) it can be seen how not scaling the features would cause PCA to determine that the direction of maximal variance more closely corresponds with the -‘weight’ axis. As a change in height of one meter can be considered much +'weight' axis. As a change in height of one meter can be considered much more important than the change in weight of one kilogram, it is easily seen that this determination is incorrect. In the case of PCA, scaling features using normalization is preferred over using min-max scaling as @@ -28,7 +28,7 @@ to the covariance matrix. In order to illustrate this in an example, PCA will be performed on a dataset -which has been standardized using StandardScalerand a copy which has remained +which has been standardized using Standard Scaler and a copy which has remained untouched. The results with be visualized and a clear difference noted. 
The results will then be used to train a naive Bayes classifier, and a clear @@ -36,7 +36,7 @@ """ from __future__ import print_function -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn import preprocessing from sklearn.decomposition import PCA from sklearn.naive_bayes import GaussianNB @@ -50,9 +50,7 @@ # License: BSD 3 clause - -# Contants -RAN_STATE = 42 +RANDOM_STATE = 42 FIG_SIZE = (10, 7) @@ -61,7 +59,7 @@ # Make a train/test split using 30% test size X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.30, - random_state=RAN_STATE) + random_state=RANDOM_STATE) # Apply Scaling to X_train and X_test std_scale = preprocessing.StandardScaler().fit(X_train) @@ -122,11 +120,11 @@ ax2.set_title('Standardized training dataset after PCA') for ax in (ax1, ax2): - ax.set_xlabel('1st principal component') ax.set_ylabel('2nd principal component') ax.legend(loc='upper right') ax.grid() + plt.tight_layout() plt.show() diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index 53fbbeed1cbc5..c43c0c4758b10 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -3,19 +3,18 @@ including methods to load and fetch popular reference datasets. It also features some artificial data generators. """ - +from .base import load_breast_cancer +from .base import load_boston from .base import load_diabetes from .base import load_digits from .base import load_files -from .base import load_wine from .base import load_iris -from .base import load_breast_cancer from .base import load_linnerud -from .base import load_boston -from .base import get_data_home -from .base import clear_data_home from .base import load_sample_images from .base import load_sample_image +from .base import load_wine +from .base import get_data_home +from .base import clear_data_home from .covtype import fetch_covtype from .kddcup99 import fetch_kddcup99 from .mlcomp import load_mlcomp @@ -67,7 +66,6 @@ 'fetch_rcv1', 'fetch_kddcup99', 'get_data_home', - 'load_wine', 'load_boston', 'load_diabetes', 'load_digits', @@ -80,6 +78,7 @@ 'load_sample_images', 'load_svmlight_file', 'load_svmlight_files', + 'load_wine', 'make_biclusters', 'make_blobs', 'make_circles', diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index cac5c9cc3fb60..e66f4b59132f6 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -245,6 +245,8 @@ def load_files(container_path, description=None, categories=None, def load_wine(return_X_y=False): """Load and return the wine dataset (classification). + .. versionadded:: 0.18 + The wine dataset is a classic and very easy multi-class classification dataset. @@ -264,7 +266,6 @@ def load_wine(return_X_y=False): If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` object. - .. versionadded:: 0.18 Returns ------- @@ -277,15 +278,21 @@ def load_wine(return_X_y=False): (data, target) : tuple if ``return_X_y`` is True - .. versionadded:: 0.18 The copy of UCI ML Wine Data Set dataset is downloaded from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data + The file has been modified: + -to include class labels class_0, class_1 and class_2; + -to have the target classification as the last column; + -to rename target variables from 1, 2, and 3 to 0, 1 and 2; + -to include to amount of datapoints and class labels. 
+ + Examples -------- - Let's say you are interested in the samples 10, 25, and 50, and want to + Let's say you are interested in the samples 10, 80, and 140, and want to know their class name. >>> from sklearn.datasets import load_wine From ba2abfe98bb26ff7712d2d155c7f274f995eb076 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Mon, 28 Nov 2016 16:15:24 -0800 Subject: [PATCH 15/26] added principal component 1 comparison --- .../preprocessing/plot_scaling_importance.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index a87836eb7290f..b3dda5a8b2533 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -24,12 +24,18 @@ more important than the change in weight of one kilogram, it is easily seen that this determination is incorrect. In the case of PCA, scaling features using normalization is preferred over using min-max scaling as -the primary components are computed using the correlation matrix as opposed -to the covariance matrix. - -In order to illustrate this in an example, PCA will be performed on a dataset -which has been standardized using Standard Scaler and a copy which has remained -untouched. The results with be visualized and a clear difference noted. +the primary components are computed using the correlation matrix as +opposed to the covariance matrix. + +In order to illustrate this in an example, PCA will be performed on a +dataset which has been standardized using Standard Scaler and a copy +which has remained untouched. The results with be visualized and a clear +difference noted. The 1st principal component in the unscaled set is shown. +It can be seen that feature #13 dominates the direction, being a whole +two orders of magnitude above the other features. This is contrasted when +observing the prinicpal component for the scaled version of the data. In the +scaled version, the orders of magnitude is roughly the same across all the +features. The results will then be used to train a naive Bayes classifier, and a clear difference the prediction accuracies will be observed. @@ -94,9 +100,10 @@ print('\nPrediction accuracy for the standardized test dataset with PCA') print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std))) +print('\nPC 1 without scaling:\n', pca.components_[0]) +print('\nPC 1 with scaling:\n', pca_std.components_[0]) # visualize standardized vs. untouched dataset with PCA performed - fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE) From 494a3e3b5346b510a807eb347e73073e3aacb8e6 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Thu, 8 Dec 2016 19:16:28 -0800 Subject: [PATCH 16/26] corrected spacing in plot_scaling_importance.py and description for load_wine in base.py --- examples/preprocessing/plot_scaling_importance.py | 4 ++-- sklearn/datasets/base.py | 14 +++----------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index b3dda5a8b2533..aac9eeb490b34 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -16,7 +16,7 @@ think of Principle Component Analysis (PCA) as being a prime example of when normalization is important. In PCA we are interested in the components that maximize the variance. 
If there exists components -(e.g human height) that vary less then other components (e.g human +(e.g human height) that vary less than other components (e.g human weight) because of their respective scales (meters vs. kilos) it can be seen how not scaling the features would cause PCA to determine that the direction of maximal variance more closely corresponds with the @@ -38,7 +38,7 @@ features. The results will then be used to train a naive Bayes classifier, and a clear -difference the prediction accuracies will be observed. +difference in prediction accuracies will be observed. """ from __future__ import print_function diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index e66f4b59132f6..f51edd8d37322 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -252,7 +252,7 @@ def load_wine(return_X_y=False): ================= ============== Classes 3 - Samples per class [59,71,48] + Samples per class [59,71,48] Samples total 178 Dimensionality 13 Features real, positive @@ -266,7 +266,6 @@ def load_wine(return_X_y=False): If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` object. - Returns ------- data : Bunch @@ -278,18 +277,11 @@ def load_wine(return_X_y=False): (data, target) : tuple if ``return_X_y`` is True - The copy of UCI ML Wine Data Set dataset is - downloaded from: + downloaded and modified to fit standard format from: + https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data - The file has been modified: - -to include class labels class_0, class_1 and class_2; - -to have the target classification as the last column; - -to rename target variables from 1, 2, and 3 to 0, 1 and 2; - -to include to amount of datapoints and class labels. - - Examples -------- Let's say you are interested in the samples 10, 80, and 140, and want to From dbb495fdff7a8368abe019913e1cabaf35b1d9b3 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Thu, 8 Dec 2016 21:26:38 -0800 Subject: [PATCH 17/26] flake8 compliance --- sklearn/datasets/base.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index f51edd8d37322..4029da576b0ad 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -279,7 +279,6 @@ def load_wine(return_X_y=False): The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit standard format from: - https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data Examples @@ -332,10 +331,6 @@ def load_wine(return_X_y=False): 'proline']) - - - - def load_iris(return_X_y=False): """Load and return the iris dataset (classification). @@ -611,12 +606,12 @@ def load_diabetes(return_X_y=False): (data, target) : tuple if ``return_X_y`` is True - .. versionadded:: 0.18 + .. versionadded:: 0.18 """ base_dir = join(dirname(__file__), 'data') data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz')) target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz')) - + if return_X_y: return data, target @@ -648,7 +643,7 @@ def load_linnerud(return_X_y=False): 'targets', the two multivariate datasets, with 'data' corresponding to the exercise and 'targets' corresponding to the physiological measurements, as well as 'feature_names' and 'target_names'. - + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -702,7 +697,7 @@ def load_boston(return_X_y=False): (data, target) : tuple if ``return_X_y`` is True - .. versionadded:: 0.18 + .. 
versionadded:: 0.18 Examples -------- From 2d9b0fb3f550a1971361137c2df1f7718364e66e Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Wed, 14 Dec 2016 12:41:19 -0800 Subject: [PATCH 18/26] documentation changes from jnothman --- .../preprocessing/plot_scaling_importance.py | 54 +++++++++---------- sklearn/datasets/base.py | 2 +- sklearn/datasets/descr/wine_data.rst | 10 ++-- sklearn/datasets/tests/test_base.py | 6 ++- 4 files changed, 39 insertions(+), 33 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index aac9eeb490b34..c3582a718b377 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -8,37 +8,37 @@ Features scaling though standardization (or Z-score normalization) can be an important preprocessing step for many machine learning algorithms. Standardization involves rescaling the features such -that they’ll have the properties of a standard normal distribution +that they have the properties of a standard normal distribution with a mean of zero and a standard deviation of one. -While many algorithms (such as SVM, K-nearest neighbors and logistic +While many algorithms (such as SVM, K-nearest neighbors, and logistic regression) require features to be normalized, intuitively we can think of Principle Component Analysis (PCA) as being a prime example of when normalization is important. In PCA we are interested in the -components that maximize the variance. If there exists components -(e.g human height) that vary less than other components (e.g human -weight) because of their respective scales (meters vs. kilos) it can -be seen how not scaling the features would cause PCA to determine that -the direction of maximal variance more closely corresponds with the -'weight' axis. As a change in height of one meter can be considered much -more important than the change in weight of one kilogram, it is easily -seen that this determination is incorrect. In the case of PCA, scaling -features using normalization is preferred over using min-max scaling as -the primary components are computed using the correlation matrix as -opposed to the covariance matrix. - -In order to illustrate this in an example, PCA will be performed on a -dataset which has been standardized using Standard Scaler and a copy -which has remained untouched. The results with be visualized and a clear -difference noted. The 1st principal component in the unscaled set is shown. -It can be seen that feature #13 dominates the direction, being a whole -two orders of magnitude above the other features. This is contrasted when -observing the prinicpal component for the scaled version of the data. In the -scaled version, the orders of magnitude is roughly the same across all the -features. - -The results will then be used to train a naive Bayes classifier, and a clear -difference in prediction accuracies will be observed. +components that maximize the variance. If one component (e.g. human +height) varies less than another (e.g. weight) because of their +respective scales (meters vs. kilos), PCA might determine that the +direction of maximal variance more closely corresponds with the +'weight' axis, if those features are not scaled. As a change in +height of one meter can be considered much more important than the +change in weight of one kilogram, this is clearly incorrect. 
+ +To illustrate this, PCA is performed comparing the use of the unscaled +data against the same with :class:`preprocessing.StandardScaler` applied +The results are visualized and a clear difference noted. The 1st principal +component in the unscaled set can be seen. It can be seen that feature #13 +dominates the direction, being a whole two orders of magnitude above the +other features. This is contrasted when observing the principal component +for the scaled version of the data. In the scaled version, the orders of +magnitude are roughly the same across all the features. + +The dataset used is the Wine Dataset available at UCI. This dataset +has continuous features that are heterogeneous in scale due to differing +properties that they measure (i.e alcohol content, and malic acid). + +The results are then used to train a naive Bayes classifier, and a clear +difference in prediction accuracies will be observed wherein the dataset +which is scaled before PCA vastly outperforms the unscaled version. """ from __future__ import print_function @@ -52,7 +52,7 @@ print(__doc__) # Code source: Tyler Lanigan -# Sebastian Raschka +# Sebastian Raschka # License: BSD 3 clause diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 4029da576b0ad..c091c16996227 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -245,7 +245,7 @@ def load_files(container_path, description=None, categories=None, def load_wine(return_X_y=False): """Load and return the wine dataset (classification). - .. versionadded:: 0.18 + .. versionadded:: 0.18 The wine dataset is a classic and very easy multi-class classification dataset. diff --git a/sklearn/datasets/descr/wine_data.rst b/sklearn/datasets/descr/wine_data.rst index de9f8a318af31..abb1e3e35fcf8 100644 --- a/sklearn/datasets/descr/wine_data.rst +++ b/sklearn/datasets/descr/wine_data.rst @@ -53,16 +53,20 @@ Data Set Characteristics: This is a copy of UCI ML Wine recognition datasets. https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data -These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines. +The data is the results of a chemical analysis of wines grown in the same region in Italy by three different cultivators. There are thirteen different measurements taken for different constituents found in the three types of wine. Original Owners: Forina, M. et al, PARVUS - An Extendible Package for Data Exploration, Classification and Correlation. Institute of Pharmaceutical and Food Analysis and Technologies, Via Brigata Salerno, -16147 Genoa, Italy. +16147 Genoa, Italy. -Relevant Papers +Citation: + +Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. + +References ---------- (1) S. Aeberhard, D. Coomans and O. 
de Vel, diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index fade1c49d0b56..c0dd5101904d9 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -196,6 +196,7 @@ def test_load_linnerud(): assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) + def test_load_iris(): res = load_iris() assert_equal(res.data.shape, (150, 4)) @@ -219,8 +220,8 @@ def test_load_wine(): assert_true(res.DESCR) # test return_X_y option - X_y_tuple = load_iris(return_X_y=True) - bunch = load_iris() + X_y_tuple = load_wine(return_X_y=True) + bunch = load_wine() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) @@ -255,6 +256,7 @@ def test_load_boston(): assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) + def test_loads_dumps_bunch(): bunch = Bunch(x="x") bunch_from_pkl = loads(dumps(bunch)) From 3b61a0c42c65b2884c5bf871db4eb42f833fab87 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Wed, 21 Dec 2016 13:53:05 -0800 Subject: [PATCH 19/26] fixed link to StandardScaler --- .../preprocessing/plot_scaling_importance.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index c3582a718b377..4db6068d5595a 100644 --- a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -23,14 +23,16 @@ height of one meter can be considered much more important than the change in weight of one kilogram, this is clearly incorrect. -To illustrate this, PCA is performed comparing the use of the unscaled -data against the same with :class:`preprocessing.StandardScaler` applied -The results are visualized and a clear difference noted. The 1st principal -component in the unscaled set can be seen. It can be seen that feature #13 -dominates the direction, being a whole two orders of magnitude above the -other features. This is contrasted when observing the principal component -for the scaled version of the data. In the scaled version, the orders of -magnitude are roughly the same across all the features. +:class:`StandardScaler ` + +To illustrate this, PCA is performed comparing the use of data with +:class:`StandardScaler ` applied, +to unscaled data. The results are visualized and a clear difference noted. +The 1st principal component in the unscaled set can be seen.It can be seen +that feature #13 dominates the direction, being a whole two orders of +magnitude above the other features. This is contrasted when observing +the principal component for the scaled version of the data. In the scaled +version, the orders of magnitude are roughly the same across all the features. The dataset used is the Wine Dataset available at UCI. 
 has continuous features that are heterogeneous in scale due to differing

From caa949978793fb2a060ab9a7be8d43a513331d16 Mon Sep 17 00:00:00 2001
From: Tyler Lanigan
Date: Mon, 2 Jan 2017 12:46:36 -0800
Subject: [PATCH 20/26] slight mod to wine description file

---
 sklearn/datasets/descr/wine_data.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/datasets/descr/wine_data.rst b/sklearn/datasets/descr/wine_data.rst
index abb1e3e35fcf8..5a63dffd629ab 100644
--- a/sklearn/datasets/descr/wine_data.rst
+++ b/sklearn/datasets/descr/wine_data.rst
@@ -19,7 +19,7 @@ Data Set Characteristics:
 - 10)Color intensity
 - 11)Hue
 - 12)OD280/OD315 of diluted wines
- - 13)Proline
+ - 13)Proline
 - class:
 - class_0
 - class_1
@@ -27,7 +27,7 @@ Data Set Characteristics:
 :Summary Statistics:

 ============== ==== ==== ======= ===== ====================
- Min Max Mean SD
+ Min Max Mean SD
 ============== ==== ==== ======= ===== ====================
 Alcohol: 11.0 14.8 13.0 0.811
 Malic Acid: 0.74 5.80 2.34 1.12
@@ -36,7 +36,7 @@
 Magnesium: 70.0 162.0 99.7 14.28
 Total Phenols: 0.980 3.88 2.29 0.626
 Flavanoids: 0.340 5.08 2.03 .999
- Nonflavanoid Phenols: 0.130 0.66 0.362 0.124
+ Nonflavanoid Phenols: 0.130 0.66 0.362 0.124
 Proanthocyanins: 0.410 3.580 1.590 0.572
 Colour Intensity: 1.28 13.0 5.06 2.318
 Hue: 0.480 1.71 0.957 0.226

From 11f6d1107f1d0b5177a10654d647298cbdc81914 Mon Sep 17 00:00:00 2001
From: Tyler Lanigan
Date: Wed, 11 Jan 2017 12:14:31 -0800
Subject: [PATCH 21/26] added pipelining to plot_scaling... fixed spelling

---
 .../preprocessing/plot_scaling_importance.py | 58 ++++++++-----------
 sklearn/datasets/descr/wine_data.rst         | 13 +++--
 2 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py
index 4db6068d5595a..f1e599ed753c7 100644
--- a/examples/preprocessing/plot_scaling_importance.py
+++ b/examples/preprocessing/plot_scaling_importance.py
@@ -5,7 +5,7 @@
 Importance of Feature Scaling
 =========================================================

-Features scaling though standardization (or Z-score normalization)
+Feature scaling through standardization (or Z-score normalization)
 can be an important preprocessing step for many machine learning
 algorithms. Standardization involves rescaling the features such
 that they have the properties of a standard normal distribution
@@ -23,12 +23,10 @@
 height of one meter can be considered much more important than the change
 in weight of one kilogram, this is clearly incorrect.

-:class:`StandardScaler <sklearn.preprocessing.StandardScaler>`
-
 To illustrate this, PCA is performed comparing the use of data with
 :class:`StandardScaler <sklearn.preprocessing.StandardScaler>` applied,
 to unscaled data. The results are visualized and a clear difference noted.
-The 1st principal component in the unscaled set can be seen.It can be seen
+Inspecting the 1st principal component of the unscaled set reveals
 that feature #13 dominates the direction, being a whole two orders of
 magnitude above the other features. This is contrasted when observing
 the principal component for the scaled version of the data. In the scaled
 version, the orders of magnitude are roughly the same across all the features.

 The dataset used is the Wine Dataset available at UCI. This dataset
 has continuous features that are heterogeneous in scale due to differing
 properties that they measure (e.g. alcohol content and malic acid).

-The results are then used to train a naive Bayes classifier, and a clear
-difference in prediction accuracies will be observed wherein the dataset
+The transformed data is then used to train a naive Bayes classifier, and a
+clear difference in prediction accuracies is observed wherein the dataset
 which is scaled before PCA vastly outperforms the unscaled version.

 """
 from __future__ import print_function
 from sklearn.model_selection import train_test_split
-from sklearn import preprocessing
+from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
 from sklearn.naive_bayes import GaussianNB
 from sklearn import metrics
 import matplotlib.pyplot as plt
 from sklearn.datasets import load_wine
+from sklearn.pipeline import make_pipeline
 print(__doc__)

 # Code source: Tyler Lanigan
@@ -69,42 +68,35 @@
     test_size=0.30, random_state=RANDOM_STATE)

-# Apply Scaling to X_train and X_test
-std_scale = preprocessing.StandardScaler().fit(X_train)
-X_train_std = std_scale.transform(X_train)
-X_test_std = std_scale.transform(X_test)
-
-# Perform PCA on non-standardized data
-pca = PCA(n_components=2).fit(X_train)
-X_train = pca.transform(X_train)
-X_test = pca.transform(X_test)
-
-# Perform PCA on standardized data
-pca_std = PCA(n_components=2).fit(X_train_std)
-X_train_std = pca_std.transform(X_train_std)
-X_test_std = pca_std.transform(X_test_std)
-
-# Fit GaussianNB on standard and non-standardized data
-clf = GaussianNB()
-fit = clf.fit(X_train, y_train)
-clf_std = GaussianNB()
-fit_std = clf_std.fit(X_train_std, y_train)
-
-# Make predictions for standard and non standardized data.
-pred_train = clf.predict(X_train)
-pred_test = clf.predict(X_test)
-pred_train_std = clf_std.predict(X_train_std)
-pred_test_std = clf_std.predict(X_test_std)
+# Fit to data and predict using pipelined PCA and GaussianNB.
+unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
+unscaled_clf.fit(X_train, y_train)
+pred_test = unscaled_clf.predict(X_test)
+
+# Fit to data and predict using pipelined scaling, PCA and GaussianNB.
+std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
+std_clf.fit(X_train, y_train)
+pred_test_std = std_clf.predict(X_test)

+# Show prediction accuracies in scaled and unscaled data.
 print('\nPrediction accuracy for the normal test dataset with PCA')
 print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

 print('\nPrediction accuracy for the standardized test dataset with PCA')
 print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

+# Extract PCA from pipeline
+pca = unscaled_clf.named_steps['pca']
+pca_std = std_clf.named_steps['pca']
+
+# Show first principal components
 print('\nPC 1 without scaling:\n', pca.components_[0])
 print('\nPC 1 with scaling:\n', pca_std.components_[0])

+# Scale and use PCA on X_train data for visualization.
+scaler = std_clf.named_steps['standardscaler']
+X_train_std = pca_std.transform(scaler.transform(X_train))
+
 # visualize standardized vs. untouched dataset with PCA performed

 fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)

diff --git a/sklearn/datasets/descr/wine_data.rst b/sklearn/datasets/descr/wine_data.rst
index 5a63dffd629ab..3ad6dec6ddc01 100644
--- a/sklearn/datasets/descr/wine_data.rst
+++ b/sklearn/datasets/descr/wine_data.rst
@@ -53,18 +53,23 @@ Data Set Characteristics:

 This is a copy of UCI ML Wine recognition datasets.
 https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

-The data are the results of a chemical analysis of wines grown in the same region in Italy by three different cultivators. There are thirteen different measurements taken for different constituents found in the three types of wine.
+The data are the results of a chemical analysis of wines grown in the same
+region in Italy by three different cultivators. There are thirteen different
+measurements taken for different constituents found in the three types of
+wine.

 Original Owners:

 Forina, M. et al, PARVUS -
 An Extendible Package for Data Exploration, Classification and Correlation.
-Institute of Pharmaceutical and Food Analysis and Technologies, Via Brigata Salerno,
-16147 Genoa, Italy.
+Institute of Pharmaceutical and Food Analysis and Technologies,
+Via Brigata Salerno, 16147 Genoa, Italy.

 Citation:

-Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
+Lichman, M. (2013). UCI Machine Learning Repository
+[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
+School of Information and Computer Science.

 References
 ----------

From ac9f2426691d8287da921c36cbb2fc0c30f1620b Mon Sep 17 00:00:00 2001
From: Tyler Lanigan
Date: Thu, 12 Jan 2017 12:14:39 -0800
Subject: [PATCH 22/26] added load_data function to clean up code for
 load_wine, load_iris, and load_breast_cancer

---
 sklearn/datasets/base.py                | 76 +++++++++++++------------
 sklearn/datasets/data/breast_cancer.csv |  2 +-
 2 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index c091c16996227..0492c759398d1 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -241,6 +241,43 @@ def load_files(container_path, description=None, categories=None,
                  target=target,
                  DESCR=description)

+def load_data(module_path, data_file_name):
+    """
+    Loads data module_path/data/data_file_name.
+
+    Parameters
+    ----------
+    data_file_name : String. Name of csv file to be loaded from
+    module_path/data/data_file_name. For example 'wine_data.csv'.
+
+    Returns
+    -------
+    data : Numpy Array
+        A 2D array with each row representing one sample and each column
+        representing the features of a given sample.
+    target : Numpy Array
+        A 1D array holding target variables for all the samples in `data`.
+        For example target[0] is the target variable for data[0]).
+    target_names : Numpy Array
+        A 1D array containing the names of the classifications. For example
+        target_names[0] is the name of the target[0] class.
+    """
+
+    with open(join(module_path, 'data', data_file_name)) as csv_file:
+        data_file = csv.reader(csv_file)
+        temp = next(data_file)
+        n_samples = int(temp[0])
+        n_features = int(temp[1])
+        target_names = np.array(temp[2:])
+        data = np.empty((n_samples, n_features))
+        target = np.empty((n_samples,), dtype=np.int)
+
+        for i, ir in enumerate(data_file):
+            data[i] = np.asarray(ir[:-1], dtype=np.float64)
+            target[i] = np.asarray(ir[-1], dtype=np.int)
+
+    return data, target, target_names
+
 def load_wine(return_X_y=False):
     """Load and return the wine dataset (classification).
@@ -294,18 +331,7 @@ def load_wine(return_X_y=False): ['class_0', 'class_1', 'class_2'] """ module_path = dirname(__file__) - with open(join(module_path, 'data', 'wine_data.csv')) as csv_file: - data_file = csv.reader(csv_file) - temp = next(data_file) - n_samples = int(temp[0]) - n_features = int(temp[1]) - target_names = np.array(temp[2:]) - data = np.empty((n_samples, n_features)) - target = np.empty((n_samples,), dtype=np.int) - - for i, ir in enumerate(data_file): - data[i] = np.asarray(ir[:-1], dtype=np.float64) - target[i] = np.asarray(ir[-1], dtype=np.int) + data, target, target_names = load_data(module_path, 'wine_data.csv') with open(join(module_path, 'descr', 'wine_data.rst')) as rst_file: fdescr = rst_file.read() @@ -381,18 +407,7 @@ def load_iris(return_X_y=False): ['setosa', 'versicolor', 'virginica'] """ module_path = dirname(__file__) - with open(join(module_path, 'data', 'iris.csv')) as csv_file: - data_file = csv.reader(csv_file) - temp = next(data_file) - n_samples = int(temp[0]) - n_features = int(temp[1]) - target_names = np.array(temp[2:]) - data = np.empty((n_samples, n_features)) - target = np.empty((n_samples,), dtype=np.int) - - for i, ir in enumerate(data_file): - data[i] = np.asarray(ir[:-1], dtype=np.float64) - target[i] = np.asarray(ir[-1], dtype=np.int) + data, target, target_names = load_data(module_path, 'iris.csv') with open(join(module_path, 'descr', 'iris.rst')) as rst_file: fdescr = rst_file.read() @@ -459,18 +474,7 @@ def load_breast_cancer(return_X_y=False): ['malignant', 'benign'] """ module_path = dirname(__file__) - with open(join(module_path, 'data', 'breast_cancer.csv')) as csv_file: - data_file = csv.reader(csv_file) - first_line = next(data_file) - n_samples = int(first_line[0]) - n_features = int(first_line[1]) - target_names = np.array(first_line[2:4]) - data = np.empty((n_samples, n_features)) - target = np.empty((n_samples,), dtype=np.int) - - for count, value in enumerate(data_file): - data[count] = np.asarray(value[:-1], dtype=np.float64) - target[count] = np.asarray(value[-1], dtype=np.int) + data, target, target_names = load_data(module_path, 'breast_cancer.csv') with open(join(module_path, 'descr', 'breast_cancer.rst')) as rst_file: fdescr = rst_file.read() diff --git a/sklearn/datasets/data/breast_cancer.csv b/sklearn/datasets/data/breast_cancer.csv index 8eafb95815978..979a3dcb6786a 100644 --- a/sklearn/datasets/data/breast_cancer.csv +++ b/sklearn/datasets/data/breast_cancer.csv @@ -1,4 +1,4 @@ -569,30,malignant,benign,,,,,,,,,,,,,,,,,,,,,,,,,,, +569,30,malignant,benign 17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0 20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0 19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0 From 684a2d792627c4b496f9bc0f19b854180e0941bc Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 13 Jan 2017 12:02:41 +1100 Subject: [PATCH 23/26] DOC Align summary statistics for wine Also use same number of decimal places across columns of same row. 
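
The load_data helper refactored in above relies on a small CSV convention: the first row carries n_samples, n_features and then one name per class (exactly the header that the breast_cancer.csv hunk repairs), and every following row carries the feature values followed by an integer class label. A minimal sketch under stated assumptions: the toy.csv file, its values and the temporary directory are hypothetical, and load_data is imported from the private sklearn.datasets.base module that this series edits.

import os
import tempfile

# Private helper added in PATCH 22; not part of the public API.
from sklearn.datasets.base import load_data

# Hypothetical miniature CSV in the layout load_data expects.
module_path = tempfile.mkdtemp()
os.mkdir(os.path.join(module_path, 'data'))
with open(os.path.join(module_path, 'data', 'toy.csv'), 'w') as f:
    f.write("4,2,class_0,class_1\n"   # n_samples, n_features, class names
            "1.0,2.0,0\n"             # two feature values + class label
            "1.5,1.8,0\n"
            "9.0,9.5,1\n"
            "8.8,9.1,1\n")

data, target, target_names = load_data(module_path, 'toy.csv')
print(data.shape)      # (4, 2)
print(target)          # [0 0 1 1]
print(target_names)    # ['class_0' 'class_1']
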
--- sklearn/datasets/descr/wine_data.rst | 37 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/sklearn/datasets/descr/wine_data.rst b/sklearn/datasets/descr/wine_data.rst index 3ad6dec6ddc01..3d3341874a584 100644 --- a/sklearn/datasets/descr/wine_data.rst +++ b/sklearn/datasets/descr/wine_data.rst @@ -24,25 +24,26 @@ Data Set Characteristics: - class_0 - class_1 - class_2 + :Summary Statistics: - - ============== ==== ==== ======= ===== ==================== - Min Max Mean SD - ============== ==== ==== ======= ===== ==================== - Alcohol: 11.0 14.8 13.0 0.811 - Malic Acid: 0.74 5.80 2.34 1.12 - Ash: 1.36 3.23 2.36 0.274 - Alcalinity of Ash: 10.6 30.0 19.5 3.34 - Magnesium: 70.0 162.0 99.7 14.28 - Total Phenols: 0.980 3.88 2.29 0.626 - Flavanoids: 0.340 5.08 2.03 .999 - Nonflavanoid Phenols: 0.130 0.66 0.362 0.124 - Proanthocyanins: 0.410 3.580 1.590 0.572 - Colour Intensity: 1.28 13.0 5.06 2.318 - Hue: 0.480 1.71 0.957 0.226 - OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.710 - Proline: 278 1680 746 315 - ============== ==== ==== ======= ===== ==================== + + ============================= ==== ===== ======= ===== + Min Max Mean SD + ============================= ==== ===== ======= ===== + Alcohol: 11.0 14.8 13.0 0.8 + Malic Acid: 0.74 5.80 2.34 1.12 + Ash: 1.36 3.23 2.36 0.27 + Alcalinity of Ash: 10.6 30.0 19.5 3.3 + Magnesium: 70.0 162.0 99.7 14.3 + Total Phenols: 0.98 3.88 2.29 0.63 + Flavanoids: 0.34 5.08 2.03 1.00 + Nonflavanoid Phenols: 0.13 0.66 0.36 0.12 + Proanthocyanins: 0.41 3.58 1.59 0.57 + Colour Intensity: 1.3 13.0 5.1 2.3 + Hue: 0.48 1.71 0.96 0.23 + OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71 + Proline: 278 1680 746 315 + ============================= ==== ===== ======= ===== :Missing Attribute Values: None :Class Distribution: class_0 (59), class_1 (71), class_2 (48) From 14e186f3d7b92a69bd149d766b6df9bb00cc4ec1 Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Thu, 12 Jan 2017 18:53:26 -0800 Subject: [PATCH 24/26] flake8 compliance...again --- sklearn/datasets/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 0492c759398d1..64f6084865b2d 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -241,6 +241,7 @@ def load_files(container_path, description=None, categories=None, target=target, DESCR=description) + def load_data(module_path, data_file_name): """ Loads data module_path/data/data_file_name. @@ -262,7 +263,7 @@ def load_data(module_path, data_file_name): A 1D array containing the names of the classifications. For example target_names[0] is the name of the target[0] class. """ - + with open(join(module_path, 'data', data_file_name)) as csv_file: data_file = csv.reader(csv_file) temp = next(data_file) From c33b382d14d566ad0c6f1275d2e279377133d9ed Mon Sep 17 00:00:00 2001 From: Tyler Lanigan Date: Mon, 16 Jan 2017 11:54:42 -0800 Subject: [PATCH 25/26] updated base.py load_data for PEP257 --- sklearn/datasets/base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 64f6084865b2d..291eff6e7528e 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -244,7 +244,8 @@ def load_files(container_path, description=None, categories=None, def load_data(module_path, data_file_name): """ - Loads data module_path/data/data_file_name. 
+    Loads data from module_path/data/data_file_name and returns
+    the data, targets, and target names as numpy arrays.

     Parameters
     ----------
     data_file_name : String. Name of csv file to be loaded from
     module_path/data/data_file_name. For example 'wine_data.csv'.

@@ -256,14 +257,15 @@ def load_data(module_path, data_file_name):
     data : Numpy Array
         A 2D array with each row representing one sample and each column
         representing the features of a given sample.
+
     target : Numpy Array
         A 1D array holding target variables for all the samples in `data`.
-        For example target[0] is the target variable for data[0]).
+        For example target[0] is the target variable for data[0].
+
     target_names : Numpy Array
         A 1D array containing the names of the classifications. For example
         target_names[0] is the name of the target[0] class.
     """
-
     with open(join(module_path, 'data', data_file_name)) as csv_file:
         data_file = csv.reader(csv_file)
         temp = next(data_file)

From 538cb5a3f31fc649f34f2ded638755e80c0d5d8b Mon Sep 17 00:00:00 2001
From: Tyler Lanigan
Date: Mon, 16 Jan 2017 14:41:07 -0800
Subject: [PATCH 26/26] base.py PEP257 one line description for load_data

---
 sklearn/datasets/base.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index 291eff6e7528e..2325d971428d2 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -243,9 +243,7 @@ def load_files(container_path, description=None, categories=None,

 def load_data(module_path, data_file_name):
-    """
-    Loads data from module_path/data/data_file_name and returns
-    the data, targets, and target names as numpy arrays.
+    """Loads data from module_path/data/data_file_name.

     Parameters
     ----------
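
Taken together, the finished series lets the scaling comparison be reproduced in a few lines. The following is a condensed sketch, not part of any patch above: it simply restates the example's own calls (load_wine, make_pipeline, StandardScaler, PCA, GaussianNB) and assumes a scikit-learn build that already contains the load_wine added by this series; exact accuracies depend on the split.

from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

# Unscaled: PCA directions are dominated by the largest-magnitude feature.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)

# Scaled: standardization puts every feature on a comparable footing first.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)

print('unscaled accuracy: {:.2%}'.format(
    accuracy_score(y_test, unscaled_clf.predict(X_test))))
print('scaled accuracy: {:.2%}'.format(
    accuracy_score(y_test, std_clf.predict(X_test))))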