scikit-learn · ogrisel · Jul 28, 2013 · Jul 28, 2013
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -253,6 +253,8 @@ API changes summary
      to float, and raises a warning. Previously it rounded for dense integer
      input.
 
+   - Better input validation, warning on unexpected shapes for y.
+
 .. _changes_0_13_1:
 
 0.13.1

diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
@@ -54,6 +54,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
                     ExtraTreeClassifier, ExtraTreeRegressor)
 from ..tree._tree import DTYPE, DOUBLE
 from ..utils import array2d, check_random_state, check_arrays, safe_asarray
+from ..utils.validation import DataConversionWarning
 from ..utils.fixes import bincount, unique
 
 
@@ -259,6 +260,12 @@ def fit(self, X, y, sample_weight=None):
         n_samples, self.n_features_ = X.shape
 
         y = np.atleast_1d(y)
+        if y.ndim == 2 and y.shape[1] == 1:
+            warn("A column-vector y was passed when a 1d array was"
+                 " expected. Please change the shape of y to "
+                 "(n_samples, ), for example using ravel().",
+                 DataConversionWarning, stacklevel=2)
+
         if y.ndim == 1:
             # reshape is necessary to preserve the data contiguity, whereas
             # [:, np.newaxis] does not preserve it.

diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
@@ -34,7 +34,7 @@
 from ..base import BaseEstimator
 from ..base import ClassifierMixin
 from ..base import RegressorMixin
-from ..utils import check_random_state, array2d, check_arrays
+from ..utils import check_random_state, array2d, check_arrays, column_or_1d
 from ..utils.extmath import logsumexp
 from ..utils.fixes import unique
 from ..externals import six
@@ -552,7 +552,7 @@ def fit(self, X, y):
         # Check input
         X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                           check_ccontiguous=True)
-        y = np.ravel(y, order="C")
+        y = column_or_1d(y, warn=True)
         n_samples, n_features = X.shape
         self.n_features = n_features
         random_state = check_random_state(self.random_state)
@@ -883,6 +883,7 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
+        y = column_or_1d(y, warn=True)
         self.classes_, y = unique(y, return_inverse=True)
         self.n_classes_ = len(self.classes_)
 

diff --git a/sklearn/lda.py b/sklearn/lda.py
@@ -13,7 +13,7 @@
 from .base import BaseEstimator, ClassifierMixin, TransformerMixin
 from .utils.extmath import logsumexp
 from .utils.fixes import unique
-from .utils import check_arrays, array2d
+from .utils import check_arrays, array2d, column_or_1d
 
 __all__ = ['LDA']
 
@@ -112,6 +112,7 @@ def fit(self, X, y, store_covariance=False, tol=1.0e-4):
             and stored in `self.covariance_` attribute.
         """
         X, y = check_arrays(X, y, sparse_format='dense')
+        y = column_or_1d(y, warn=True)
         self.classes_, y = unique(y, return_inverse=True)
         n_samples, n_features = X.shape
         n_classes = len(self.classes_)

diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
@@ -534,7 +534,7 @@ def fit(self, X, y):
         self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
         Y = self._label_binarizer.fit_transform(y)
         if not self._label_binarizer.multilabel_:
-            y = column_or_1d(y)
+            y = column_or_1d(y, warn=True)
 
         if self.class_weight:
             cw = compute_class_weight(self.class_weight,
@@ -749,8 +749,8 @@ def fit(self, X, y, sample_weight=1.0):
             # identity_estimator will just return them
             def identity_estimator():
                 pass
-            identity_estimator.decision_function = lambda y_predict : y_predict
-            identity_estimator.predict = lambda y_predict : y_predict
+            identity_estimator.decision_function = lambda y_predict: y_predict
+            identity_estimator.predict = lambda y_predict: y_predict
 
             out = [scorer(identity_estimator, y.ravel(), cv_values[:, i])
                    for i in range(len(self.alphas))]
@@ -1025,7 +1025,7 @@ def fit(self, X, y, sample_weight=1.0, class_weight=None):
         self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
         Y = self._label_binarizer.fit_transform(y)
         if not self._label_binarizer.multilabel_:
-            y = column_or_1d(y)
+            y = column_or_1d(y, warn=True)
         cw = compute_class_weight(class_weight,
                                   self.classes_, Y)
         # modify the sample weights with the corresponding class weight

diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
@@ -15,7 +15,8 @@
 from .base import LinearClassifierMixin, SparseCoefMixin
 from ..base import BaseEstimator, RegressorMixin
 from ..feature_selection.from_model import _LearntSelectorMixin
-from ..utils import array2d, atleast2d_or_csr, check_arrays, deprecated
+from ..utils import (array2d, atleast2d_or_csr, check_arrays, deprecated,
+                     column_or_1d)
 from ..utils.extmath import safe_sparse_dot
 from ..utils.multiclass import _check_partial_fit_first_call
 from ..externals import six
@@ -336,7 +337,7 @@ def _partial_fit(self, X, y, alpha, C,
                      classes, sample_weight,
                      coef_init, intercept_init):
         X = atleast2d_or_csr(X, dtype=np.float64, order="C")
-        y = np.asarray(y).ravel()
+        y = column_or_1d(y, warn=True)
 
         n_samples, n_features = X.shape
         _check_fit_data(X, y)
@@ -773,7 +774,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
                      coef_init, intercept_init):
         X, y = check_arrays(X, y, sparse_format="csr", copy=False,
                             check_ccontiguous=True, dtype=np.float64)
-        y = y.ravel()
+        y = column_or_1d(y, warn=True)
 
         n_samples, n_features = X.shape
         _check_fit_data(X, y)

diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py
@@ -18,6 +18,7 @@
 from ..metrics import pairwise_distances
 from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
 from ..utils import safe_asarray, atleast2d_or_csr, check_arrays
+from ..utils.validation import DataConversionWarning
 from ..utils.fixes import unique
 from ..externals import six
 
@@ -605,6 +606,12 @@ def fit(self, X, y):
             X, y = check_arrays(X, y, sparse_format="csr")
 
         if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
+            if y.ndim != 1:
+                warnings.warn("A column-vector y was passed when a 1d array"
+                              " was expected. Please change the shape of y to"
+                              " (n_samples, ), for example using ravel().",
+                              DataConversionWarning, stacklevel=2)
+
             self.outputs_2d_ = False
             y = y.reshape((-1, 1))
         else:

diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py
@@ -89,7 +89,7 @@ def fit(self, X, y):
         if sp.issparse(X) and self.shrink_threshold:
             raise ValueError("threshold shrinking not supported"
                              " for sparse input")
-        y = column_or_1d(y)
+        y = column_or_1d(y, warn=True)
 
         n_samples, n_features = X.shape
         classes = np.unique(y)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
@@ -31,7 +31,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    `classes_`: array of shape [n_class]
+    `classes_` : array of shape (n_class,)
         Holds the label for each class.
 
     Examples
@@ -73,13 +73,14 @@ def fit(self, y):
 
         Parameters
         ----------
-        y : array-like of shape [n_samples]
+        y : array-like of shape (n_samples,)
             Target values.
 
         Returns
         -------
         self : returns an instance of self.
         """
+        y = column_or_1d(y, warn=True)
         self.classes_ = np.unique(y)
         return self
 
@@ -95,6 +96,7 @@ def fit_transform(self, y):
         -------
         y : array-like of shape [n_samples]
         """
+        y = column_or_1d(y, warn=True)
         self.classes_, y = unique(y, return_inverse=True)
         return y
 

diff --git a/sklearn/qda.py b/sklearn/qda.py
@@ -13,7 +13,7 @@
 from .base import BaseEstimator, ClassifierMixin
 from .externals.six.moves import xrange
 from .utils.fixes import unique
-from .utils import check_arrays, array2d
+from .utils import check_arrays, array2d, column_or_1d
 
 __all__ = ['QDA']
 
@@ -96,6 +96,7 @@ def fit(self, X, y, store_covariances=False, tol=1.0e-4):
             `self.covariances_` attribute.
         """
         X, y = check_arrays(X, y)
+        y = column_or_1d(y, warn=True)
         self.classes_, y = unique(y, return_inverse=True)
         n_samples, n_features = X.shape
         n_classes = len(self.classes_)

diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
@@ -9,7 +9,7 @@
 from . import libsvm_sparse
 from ..base import BaseEstimator, ClassifierMixin
 from ..preprocessing import LabelEncoder
-from ..utils import atleast2d_or_csr, array2d, check_random_state
+from ..utils import atleast2d_or_csr, array2d, check_random_state, column_or_1d
 from ..utils import ConvergenceWarning, compute_class_weight, deprecated
 from ..utils.fixes import unique
 from ..utils.extmath import safe_sparse_dot
@@ -437,6 +437,7 @@ class BaseSVC(BaseLibSVM, ClassifierMixin):
     """ABC for LibSVM-based classifiers."""
 
     def _validate_targets(self, y):
+        y = column_or_1d(y, warn=True)
         cls, y = unique(y, return_inverse=True)
         self.class_weight_ = compute_class_weight(self.class_weight, cls, y)
         if len(cls) < 2:

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -651,6 +651,10 @@ def test_classifiers_input_shapes():
         if name in ["MultinomialNB", "LabelPropagation", "LabelSpreading"]:
             # TODO some complication with -1 label
             continue
+        if name in ["DecisionTreeClassifier", "ExtraTreeClassifier"]:
+            # We don't raise a warning in these classifiers, as
+            # the column y interface is used by the forests.
+            continue
 
         # catch deprecation warnings
         with warnings.catch_warnings(record=True):
@@ -661,7 +665,6 @@ def test_classifiers_input_shapes():
         y_pred = classifier.predict(X)
 
         set_random_state(classifier)
-        classifier.fit(X, y[:, np.newaxis])
         # Check that when a 2D y is given, a DataConversionWarning is
         # raised
         with warnings.catch_warnings(record=True) as w:

diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
@@ -137,6 +137,7 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True,
         is_classification = isinstance(self, ClassifierMixin)
 
         y = np.atleast_1d(y)
+
         if y.ndim == 1:
             # reshape is necessary to preserve the data contiguity, whereas
             # [:, np.newaxis] does not preserve it.
-Original file line number
+Diff line change
@@ Expand Up / @@ -253,6 +253,8 @@ API changes summary @@
          to float, and raises a warning. Previously it rounded for dense integer
          input.
+       - Better input validation, warning on unexpected shapes for y.
     .. _changes_0_13_1:
 0.13.1
@@ Expand Down @@