[MRG + 2] make check_array convert object to float. #4057

Merged: 2 commits, Feb 24, 2015
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -377,6 +377,10 @@ API changes summary
- `thresh` parameter is deprecated in favor of new `tol` parameter in
:class:`GMM`. See `Enhancements` section for details. By `Hervé Bredin`_.

- Estimators will treat input with dtype object as numeric when possible.
By `Andreas Müller`_



.. _changes_0_15_2:

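A minimal sketch of the behavior this entry describes, assuming a scikit-learn build that includes this change (check_array defaulting to dtype="numeric"):

import numpy as np
from sklearn.utils import check_array

# Numbers stored in an object array are silently converted to float64.
X_obj = np.array([[1, 2], [3, 4]], dtype=object)
print(check_array(X_obj).dtype)  # float64

# Genuinely non-numeric objects still fail: float() cannot convert a dict.
# check_array(np.array([[{"a": 1}]], dtype=object))  # raises TypeError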
5 changes: 3 additions & 2 deletions sklearn/ensemble/gradient_boosting.py
@@ -126,7 +126,7 @@ class in the training data.
"""
def fit(self, X, y, sample_weight=None):
if sample_weight is None:
- sample_weight = np.ones_like(y, dtype=np.float)
+ sample_weight = np.ones_like(y, dtype=np.float64)
class_counts = bincount(y, weights=sample_weight)
self.priors = class_counts / class_counts.sum()

@@ -1146,7 +1146,8 @@ def feature_importances_(self):

def _validate_y(self, y):
self.n_classes_ = 1

+ if y.dtype.kind == 'O':
+     y = y.astype(np.float64)
# Default implementation
return y

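Why the explicit dtype matters in fit above: np.ones_like inherits the dtype of y, so object-dtype targets would otherwise produce object-dtype sample weights. A plain-numpy illustration:

import numpy as np

y = np.array([1.0, 2.0, 3.5], dtype=object)
print(np.ones_like(y).dtype)                    # object, inherited from y
print(np.ones_like(y, dtype=np.float64).dtype)  # float64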
10 changes: 4 additions & 6 deletions sklearn/gaussian_process/gaussian_process.py
@@ -11,8 +11,8 @@

from ..base import BaseEstimator, RegressorMixin
from ..metrics.pairwise import manhattan_distances
- from ..utils import check_random_state, check_array, check_consistent_length
- from ..utils.validation import check_is_fitted
+ from ..utils import check_random_state, check_array, check_X_y
+ from ..utils.validation import check_is_fitted
from . import regression_models as regression
from . import correlation_models as correlation

@@ -264,12 +264,10 @@ def fit(self, X, y):
self.random_state = check_random_state(self.random_state)

# Force data to 2D numpy.array
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
self.y_ndim_ = y.ndim
if y.ndim == 1:
y = y[:, np.newaxis]
- check_consistent_length(X, y)

# Check shapes of DOE & observations
n_samples, n_features = X.shape
@@ -883,7 +881,7 @@ def _check_params(self, n_samples=None):
"or array of length n_samples.")

# Check optimizer
- if not self.optimizer in self._optimizer_types:
+ if self.optimizer not in self._optimizer_types:
raise ValueError("optimizer should be one of %s"
% self._optimizer_types)

6 changes: 3 additions & 3 deletions sklearn/linear_model/base.py
@@ -25,7 +25,7 @@
from ..externals import six
from ..externals.joblib import Parallel, delayed
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
- from ..utils import as_float_array, check_array
+ from ..utils import as_float_array, check_array, check_X_y
from ..utils.extmath import safe_sparse_dot
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
from ..utils.fixes import sparse_lsqr
@@ -372,8 +372,8 @@ def fit(self, X, y, n_jobs=1):
n_jobs_ = n_jobs
else:
n_jobs_ = self.n_jobs
- X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
- y = np.asarray(y)
+ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                  y_numeric=True, multi_output=True)

X, y, X_mean, y_mean, X_std = self._center_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
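The effect of switching from check_array plus np.asarray to check_X_y with y_numeric=True, sketched with LinearRegression (assuming a build that includes this PR):

import numpy as np
from sklearn.linear_model import LinearRegression

# Both X and y arrive as object arrays; validation now casts them to float64.
X = np.array([[0.0], [1.0], [2.0]], dtype=object)
y = np.array([0.0, 1.0, 2.0], dtype=object)
reg = LinearRegression().fit(X, y)
print(reg.coef_)  # approximately [1.]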
4 changes: 2 additions & 2 deletions sklearn/linear_model/bayes.py
@@ -132,7 +132,7 @@ def fit(self, X, y):
-------
self : returns an instance of self.
"""
- X, y = check_X_y(X, y, dtype=np.float)
+ X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
X, y, X_mean, y_mean, X_std = self._center_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
n_samples, n_features = X.shape
@@ -342,7 +342,7 @@ def fit(self, X, y):
-------
self : returns an instance of self.
"""
- X, y = check_X_y(X, y, dtype=np.float)
+ X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)

n_samples, n_features = X.shape
coef_ = np.zeros(n_features)
2 changes: 1 addition & 1 deletion sklearn/linear_model/coordinate_descent.py
@@ -627,7 +627,7 @@ def fit(self, X, y):

X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64,
order='F', copy=self.copy_X and self.fit_intercept,
- multi_output=True)
+ multi_output=True, y_numeric=True)

X, y, X_mean, y_mean, X_std, precompute, Xy = \
_pre_fit(X, y, None, self.precompute, self.normalize,
10 changes: 4 additions & 6 deletions sklearn/linear_model/least_angle.py
@@ -21,7 +21,7 @@

from .base import LinearModel
from ..base import RegressorMixin
- from ..utils import arrayfuncs, as_float_array, check_array, check_X_y
+ from ..utils import arrayfuncs, as_float_array, check_X_y
from ..cross_validation import _check_cv as check_cv
from ..utils import ConvergenceWarning
from ..externals.joblib import Parallel, delayed
@@ -422,7 +422,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
for ii in idx:
for i in range(ii, n_active):
indices[i], indices[i + 1] = indices[i + 1], indices[i]
- Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i+1])
+ Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i],
Gram[:, i + 1])

@@ -589,8 +589,7 @@ def fit(self, X, y, Xy=None):
self : object
returns an instance of self.
"""
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, y_numeric=True, multi_output=True)
n_features = X.shape[1]

X, y, X_mean, y_mean, X_std = self._center_data(X, y,
@@ -1268,8 +1267,7 @@ def fit(self, X, y, copy_X=True):
returns an instance of self.
"""
self.fit_path = True
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, multi_output=True, y_numeric=True)

X, y, Xmean, ymean, Xstd = LinearModel._center_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
4 changes: 2 additions & 2 deletions sklearn/linear_model/logistic.py
@@ -529,7 +529,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
"dual=False, got dual=%s" % dual)
# Preprocessing.
X = check_array(X, accept_sparse='csr', dtype=np.float64)
- y = check_array(y, ensure_2d=False, copy=copy)
+ y = check_array(y, ensure_2d=False, copy=copy, dtype=None)
_, n_features = X.shape
check_consistent_length(X, y)
classes = np.unique(y)
@@ -1318,7 +1318,7 @@ def fit(self, X, y):
"the primal form.")

X = check_array(X, accept_sparse='csr', dtype=np.float64)
- y = check_array(y, ensure_2d=False)
+ y = check_array(y, ensure_2d=False, dtype=None)

if self.multi_class not in ['ovr', 'multinomial']:
raise ValueError("multi_class backend should be either "
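The dtype=None passed for y is what keeps classification labels untouched while X is still forced to float64; a short sketch:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array(["neg", "neg", "pos", "pos"])  # string labels stay strings
clf = LogisticRegression().fit(X, y)
print(clf.predict([[2.9]]))  # ['pos']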
5 changes: 2 additions & 3 deletions sklearn/linear_model/omp.py
@@ -609,8 +609,7 @@ def fit(self, X, y):
self : object
returns an instance of self.
"""
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
n_features = X.shape[1]

X, y, X_mean, y_mean, X_std, Gram, Xy = \
@@ -805,7 +804,7 @@ def fit(self, X, y):
self : object
returns an instance of self.
"""
- X, y = check_X_y(X, y)
+ X, y = check_X_y(X, y, y_numeric=True)
X = as_float_array(X, copy=False, force_all_finite=False)
cv = check_cv(self.cv, X, y, classifier=False)
max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1])
2 changes: 1 addition & 1 deletion sklearn/linear_model/randomized_l1.py
@@ -88,7 +88,7 @@ def fit(self, X, y):
self : object
Returns an instance of self.
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], y_numeric=True)
X = as_float_array(X, copy=False)
n_samples, n_features = X.shape

6 changes: 4 additions & 2 deletions sklearn/linear_model/ridge.py
@@ -378,7 +378,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
self.solver = solver

def fit(self, X, y, sample_weight=None):
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True)
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float,
+                  multi_output=True, y_numeric=True)

if ((sample_weight is not None) and
np.atleast_1d(sample_weight).ndim > 1):
@@ -743,7 +744,8 @@ def fit(self, X, y, sample_weight=None):
-------
self : Returns self.
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True)
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float,
+                  multi_output=True, y_numeric=True)

n_samples, n_features = X.shape

2 changes: 1 addition & 1 deletion sklearn/preprocessing/label.py
@@ -464,7 +464,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
if not isinstance(y, list):
# XXX Workaround that will be removed when list of list format is
# dropped
- y = check_array(y, accept_sparse='csr', ensure_2d=False)
+ y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None)
if neg_label >= pos_label:
raise ValueError("neg_label={0} must be strictly less than "
"pos_label={1}.".format(neg_label, pos_label))
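Same rationale as in logistic.py: labels may legitimately be strings, so label_binarize must not coerce y to float. For instance:

from sklearn.preprocessing import label_binarize

# A binary problem yields a single indicator column per sample.
print(label_binarize(["yes", "no", "no", "yes"], classes=["no", "yes"]))
# [[1], [0], [0], [1]]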
2 changes: 2 additions & 0 deletions sklearn/tests/test_common.py
@@ -29,6 +29,7 @@
from sklearn.cross_validation import train_test_split
from sklearn.linear_model.base import LinearClassifierMixin
from sklearn.utils.estimator_checks import (
check_dtype_object,
check_parameters_default_constructible,
check_estimator_sparse_data,
check_estimators_dtypes,
@@ -96,6 +97,7 @@ def test_non_meta_estimators():
if name not in CROSS_DECOMPOSITION:
yield check_estimators_dtypes, name, Estimator
yield check_fit_score_takes_y, name, Estimator
yield check_dtype_object, name, Estimator

if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']:
# SpectralEmbedding is non-deterministic,
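The new common check can also be invoked directly against a single estimator class, mirroring what the generator above yields; a hypothetical invocation, assuming the check_dtype_object(name, Estimator) signature introduced in this PR:

from sklearn.linear_model import Ridge
from sklearn.utils.estimator_checks import check_dtype_object

# Fits Ridge on object-dtype numeric data and asserts the failure modes
# exercised by the check.
check_dtype_object("Ridge", Ridge)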
28 changes: 28 additions & 0 deletions sklearn/utils/estimator_checks.py
@@ -23,6 +23,7 @@
from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import check_skip_travis
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_raise_message

from sklearn.base import clone, ClassifierMixin
from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score
@@ -149,6 +150,33 @@ def check_estimator_sparse_data(name, Estimator):
raise


def check_dtype_object(name, Estimator):
Review comment (Member):
I'd appreciate a comment here just to give a sense of what this requires of an estimator: Numeric features must be accepted when dtype=object for example.

Reply (Member Author):
I'm not sure what you mean by "numeric features must be accepted"; that is something that all tests require, right?
I would formulate it as "object dtype should be handled as numeric".

    # check that estimators treat dtype object as numeric if possible
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10).astype(object)
    y = (X[:, 0] * 4).astype(np.int)
    y = multioutput_estimator_convert_y_2d(name, y)
    with warnings.catch_warnings():
        estimator = Estimator()
        set_fast_parameters(estimator)

    estimator.fit(X, y)
    if hasattr(estimator, "predict"):
        estimator.predict(X)

    if hasattr(estimator, "transform"):
        estimator.transform(X)

    try:
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):
            raise

    X[0, 0] = {'foo': 'bar'}
    assert_raise_message(TypeError, "string or a number", estimator.fit, X, y)
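The final assertion leans on numpy's own error: casting an object array that holds a dict cannot succeed, and the TypeError message has historically contained "string or a number". A standalone illustration:

import numpy as np

X = np.array([[{'foo': 'bar'}]], dtype=object)
try:
    X.astype(np.float64)
except TypeError as e:
    print(e)  # e.g. float() argument must be a string or a number, not 'dict'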


def check_transformer(name, Transformer):
X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
random_state=0, n_features=2, cluster_std=0.1)
35 changes: 27 additions & 8 deletions sklearn/utils/validation.py
@@ -227,12 +227,14 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, order, copy,
return spmatrix


- def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
+ def check_array(array, accept_sparse=None, dtype="numeric", order=None, copy=False,
force_all_finite=True, ensure_2d=True, allow_nd=False,
ensure_min_samples=1, ensure_min_features=1):
"""Input validation on an array, list, sparse matrix or similar.

By default, the input is converted to an at least 2d numpy array.
+ If the dtype of the array is object, attempt converting to float,
+ raising on failure.

Parameters
----------
@@ -245,8 +247,9 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.

- dtype : string, type or None (default=none)
+ dtype : string, type or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
+ If "numeric", dtype is preserved unless array.dtype is object.

order : 'F', 'C' or None (default=None)
Whether an array will be forced to be fortran or c-style.
@@ -283,11 +286,19 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
accept_sparse = [accept_sparse]

if sp.issparse(array):
if dtype == "numeric":
dtype = None
array = _ensure_sparse_format(array, accept_sparse, dtype, order,
copy, force_all_finite)
else:
if ensure_2d:
array = np.atleast_2d(array)
if dtype == "numeric":
if hasattr(array, "dtype") and array.dtype.kind == "O":
# if input is object, convert to float.
dtype = np.float64
else:
dtype = None
array = np.array(array, dtype=dtype, order=order, copy=copy)
if not allow_nd and array.ndim >= 3:
raise ValueError("Found array with dim %d. Expected <= 2" %
@@ -311,15 +322,17 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
return array


- def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
+ def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None, copy=False,
force_all_finite=True, ensure_2d=True, allow_nd=False,
multi_output=False, ensure_min_samples=1,
- ensure_min_features=1):
+ ensure_min_features=1, y_numeric=False):
"""Input validation for standard estimators.

Checks X and y for consistent length, enforces X 2d and y 1d.
Standard input checks are only applied to y. For multi-label y,
set multi_output=True to allow 2d and sparse y.
+ If the dtype of X is object, attempt converting to float,
+ raising on failure.

Parameters
----------
@@ -335,8 +348,9 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.

- dtype : string, type or None (default=none)
+ dtype : string, type or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
Review comment (Member):
I would be more specific and say "Data type of the checked input data X" instead of "Data type of result".

If "numeric", dtype is preserved unless array.dtype is object.

order : 'F', 'C' or None (default=None)
Whether an array will be forced to be fortran or c-style.
@@ -367,6 +381,9 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when ``ensure_2d`` is True and
``allow_nd`` is False.
+ y_numeric : boolean (default=False)
+     Whether to ensure that y has a numeric type. If dtype of y is object,
+     it is converted to float64. Should only be used for regression algorithms.

Returns
-------
@@ -377,10 +394,12 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
ensure_2d, allow_nd, ensure_min_samples,
ensure_min_features)
if multi_output:
- y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False)
+ y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, dtype=None)
else:
y = column_or_1d(y, warn=True)
_assert_all_finite(y)
+ if y_numeric and y.dtype.kind == 'O':
+     y = y.astype(np.float64)

check_consistent_length(X, y)
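Putting the two pieces together, a sketch of check_X_y with y_numeric=True (assuming this version of validation.py):

import numpy as np
from sklearn.utils import check_X_y

X = np.arange(6, dtype=object).reshape(3, 2)  # object-dtype features
y = np.array([0.1, 0.2, 0.3], dtype=object)   # object-dtype targets
X, y = check_X_y(X, y, y_numeric=True)
print(X.dtype, y.dtype)  # float64 float64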

@@ -520,7 +539,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True,
def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
"""Perform is_fitted validation for estimator.

Checks if the estimator is fitted by verifying the presence of
"all_or_any" of the passed attributes and raises a NotFittedError with the
given message.
