[MRG + 2] make check_array convert object to float. #4057

Merged: 2 commits, Feb 24, 2015
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -377,6 +377,10 @@ API changes summary
- `thresh` parameter is deprecated in favor of new `tol` parameter in
:class:`GMM`. See `Enhancements` section for details. By `Hervé Bredin`_.

- Estimators will treat input with dtype object as numeric when possible.
By `Andreas Müller`_



.. _changes_0_15_2:

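A minimal sketch of the behavior this entry describes, assuming a scikit-learn build that includes this change (check_array defaulting to dtype="numeric"):

import numpy as np
from sklearn.utils import check_array

# Numbers stored in an object array are silently converted to float64.
X_obj = np.array([[1, 2], [3, 4]], dtype=object)
print(check_array(X_obj).dtype)  # float64

# Genuinely non-numeric objects still fail: float() cannot convert a dict.
# check_array(np.array([[{"a": 1}]], dtype=object))  # raises TypeError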
5 changes: 3 additions & 2 deletions sklearn/ensemble/gradient_boosting.py
@@ -126,7 +126,7 @@ class in the training data.
"""
def fit(self, X, y, sample_weight=None):
if sample_weight is None:
- sample_weight = np.ones_like(y, dtype=np.float)
+ sample_weight = np.ones_like(y, dtype=np.float64)
class_counts = bincount(y, weights=sample_weight)
self.priors = class_counts / class_counts.sum()

@@ -1146,7 +1146,8 @@ def feature_importances_(self):

def _validate_y(self, y):
self.n_classes_ = 1

+ if y.dtype.kind == 'O':
+     y = y.astype(np.float64)
# Default implementation
return y

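Why the explicit dtype matters in fit above: np.ones_like inherits the dtype of y, so object-dtype targets would otherwise produce object-dtype sample weights. A plain-numpy illustration:

import numpy as np

y = np.array([1.0, 2.0, 3.5], dtype=object)
print(np.ones_like(y).dtype)                    # object, inherited from y
print(np.ones_like(y, dtype=np.float64).dtype)  # float64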
10 changes: 4 additions & 6 deletions sklearn/gaussian_process/gaussian_process.py
@@ -11,8 +11,8 @@

from ..base import BaseEstimator, RegressorMixin
from ..metrics.pairwise import manhattan_distances
- from ..utils import check_random_state, check_array, check_consistent_length
- from ..utils.validation import check_is_fitted
+ from ..utils import check_random_state, check_array, check_X_y
+ from ..utils.validation import check_is_fitted
from . import regression_models as regression
from . import correlation_models as correlation

@@ -264,12 +264,10 @@ def fit(self, X, y):
self.random_state = check_random_state(self.random_state)

# Force data to 2D numpy.array
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
self.y_ndim_ = y.ndim
if y.ndim == 1:
y = y[:, np.newaxis]
- check_consistent_length(X, y)

# Check shapes of DOE & observations
n_samples, n_features = X.shape
@@ -883,7 +881,7 @@ def _check_params(self, n_samples=None):
"or array of length n_samples.")

# Check optimizer
- if not self.optimizer in self._optimizer_types:
+ if self.optimizer not in self._optimizer_types:
raise ValueError("optimizer should be one of %s"
% self._optimizer_types)

6 changes: 3 additions & 3 deletions sklearn/linear_model/base.py
@@ -25,7 +25,7 @@
from ..externals import six
from ..externals.joblib import Parallel, delayed
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
- from ..utils import as_float_array, check_array
+ from ..utils import as_float_array, check_array, check_X_y
from ..utils.extmath import safe_sparse_dot
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
from ..utils.fixes import sparse_lsqr
@@ -372,8 +372,8 @@ def fit(self, X, y, n_jobs=1):
n_jobs_ = n_jobs
else:
n_jobs_ = self.n_jobs
- X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
- y = np.asarray(y)
+ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                  y_numeric=True, multi_output=True)

X, y, X_mean, y_mean, X_std = self._center_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
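The effect of switching from check_array plus np.asarray to check_X_y with y_numeric=True, sketched with LinearRegression (assuming a build that includes this PR):

import numpy as np
from sklearn.linear_model import LinearRegression

# Both X and y arrive as object arrays; validation now casts them to float64.
X = np.array([[0.0], [1.0], [2.0]], dtype=object)
y = np.array([0.0, 1.0, 2.0], dtype=object)
reg = LinearRegression().fit(X, y)
print(reg.coef_)  # approximately [1.]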
4 changes: 2 additions & 2 deletions sklearn/linear_model/bayes.py
@@ -132,7 +132,7 @@ def fit(self, X, y):
-------
self : returns an instance of self.
"""
- X, y = check_X_y(X, y, dtype=np.float)
+ X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
X, y, X_mean, y_mean, X_std = self._center_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
n_samples, n_features = X.shape
@@ -342,7 +342,7 @@ def fit(self, X, y):
-------
self : returns an instance of self.
"""
- X, y = check_X_y(X, y, dtype=np.float)
+ X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)

n_samples, n_features = X.shape
coef_ = np.zeros(n_features)
2 changes: 1 addition & 1 deletion sklearn/linear_model/coordinate_descent.py
@@ -627,7 +627,7 @@ def fit(self, X, y):

X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64,
order='F', copy=self.copy_X and self.fit_intercept,
- multi_output=True)
+ multi_output=True, y_numeric=True)

X, y, X_mean, y_mean, X_std, precompute, Xy = \
_pre_fit(X, y, None, self.precompute, self.normalize,
10 changes: 4 additions & 6 deletions sklearn/linear_model/least_angle.py
@@ -21,7 +21,7 @@

from .base import LinearModel
from ..base import RegressorMixin
- from ..utils import arrayfuncs, as_float_array, check_array, check_X_y
+ from ..utils import arrayfuncs, as_float_array, check_X_y
from ..cross_validation import _check_cv as check_cv
from ..utils import ConvergenceWarning
from ..externals.joblib import Parallel, delayed
@@ -422,7 +422,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
for ii in idx:
for i in range(ii, n_active):
indices[i], indices[i + 1] = indices[i + 1], indices[i]
- Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i+1])
+ Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i],
Gram[:, i + 1])

@@ -589,8 +589,7 @@ def fit(self, X, y, Xy=None):
self : object
returns an instance of self.
"""
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, y_numeric=True, multi_output=True)
n_features = X.shape[1]

X, y, X_mean, y_mean, X_std = self._center_data(X, y,
@@ -1268,8 +1267,7 @@ def fit(self, X, y, copy_X=True):
returns an instance of self.
"""
self.fit_path = True
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, multi_output=True, y_numeric=True)

X, y, Xmean, ymean, Xstd = LinearModel._center_data(
X, y, self.fit_intercept, self.normalize, self.copy_X)
4 changes: 2 additions & 2 deletions sklearn/linear_model/logistic.py
@@ -529,7 +529,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
"dual=False, got dual=%s" % dual)
# Preprocessing.
X = check_array(X, accept_sparse='csr', dtype=np.float64)
- y = check_array(y, ensure_2d=False, copy=copy)
+ y = check_array(y, ensure_2d=False, copy=copy, dtype=None)
_, n_features = X.shape
check_consistent_length(X, y)
classes = np.unique(y)
@@ -1318,7 +1318,7 @@ def fit(self, X, y):
"the primal form.")

X = check_array(X, accept_sparse='csr', dtype=np.float64)
- y = check_array(y, ensure_2d=False)
+ y = check_array(y, ensure_2d=False, dtype=None)

if self.multi_class not in ['ovr', 'multinomial']:
raise ValueError("multi_class backend should be either "
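The dtype=None passed for y is what keeps classification labels untouched while X is still forced to float64; a short sketch:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array(["neg", "neg", "pos", "pos"])  # string labels stay strings
clf = LogisticRegression().fit(X, y)
print(clf.predict([[2.9]]))  # ['pos']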
5 changes: 2 additions & 3 deletions sklearn/linear_model/omp.py
@@ -609,8 +609,7 @@ def fit(self, X, y):
self : object
returns an instance of self.
"""
- X = check_array(X)
- y = np.asarray(y)
+ X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
n_features = X.shape[1]

X, y, X_mean, y_mean, X_std, Gram, Xy = \
@@ -805,7 +804,7 @@ def fit(self, X, y):
self : object
returns an instance of self.
"""
- X, y = check_X_y(X, y)
+ X, y = check_X_y(X, y, y_numeric=True)
X = as_float_array(X, copy=False, force_all_finite=False)
cv = check_cv(self.cv, X, y, classifier=False)
max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1])
2 changes: 1 addition & 1 deletion sklearn/linear_model/randomized_l1.py
@@ -88,7 +88,7 @@ def fit(self, X, y):
self : object
Returns an instance of self.
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], y_numeric=True)
X = as_float_array(X, copy=False)
n_samples, n_features = X.shape

6 changes: 4 additions & 2 deletions sklearn/linear_model/ridge.py
@@ -378,7 +378,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
self.solver = solver

def fit(self, X, y, sample_weight=None):
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True)
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float,
+                  multi_output=True, y_numeric=True)

if ((sample_weight is not None) and
np.atleast_1d(sample_weight).ndim > 1):
@@ -743,7 +744,8 @@ def fit(self, X, y, sample_weight=None):
-------
self : Returns self.
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True)
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float,
+                  multi_output=True, y_numeric=True)

n_samples, n_features = X.shape

2 changes: 1 addition & 1 deletion sklearn/preprocessing/label.py
@@ -464,7 +464,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
if not isinstance(y, list):
# XXX Workaround that will be removed when list of list format is
# dropped
- y = check_array(y, accept_sparse='csr', ensure_2d=False)
+ y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None)
if neg_label >= pos_label:
raise ValueError("neg_label={0} must be strictly less than "
"pos_label={1}.".format(neg_label, pos_label))
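Same rationale as in logistic.py: labels may legitimately be strings, so label_binarize must not coerce y to float. For instance:

from sklearn.preprocessing import label_binarize

# A binary problem yields a single indicator column per sample.
print(label_binarize(["yes", "no", "no", "yes"], classes=["no", "yes"]))
# [[1], [0], [0], [1]]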
2 changes: 2 additions & 0 deletions sklearn/tests/test_common.py
@@ -29,6 +29,7 @@
from sklearn.cross_validation import train_test_split
from sklearn.linear_model.base import LinearClassifierMixin
from sklearn.utils.estimator_checks import (
check_dtype_object,
check_parameters_default_constructible,
check_estimator_sparse_data,
check_estimators_dtypes,
@@ -96,6 +97,7 @@ def test_non_meta_estimators():
if name not in CROSS_DECOMPOSITION:
yield check_estimators_dtypes, name, Estimator
yield check_fit_score_takes_y, name, Estimator
yield check_dtype_object, name, Estimator

if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']:
# SpectralEmbedding is non-deterministic,
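The new common check can also be invoked directly against a single estimator class, mirroring what the generator above yields; a hypothetical invocation, assuming the check_dtype_object(name, Estimator) signature introduced in this PR:

from sklearn.linear_model import Ridge
from sklearn.utils.estimator_checks import check_dtype_object

# Fits Ridge on object-dtype numeric data and asserts the failure modes
# exercised by the check.
check_dtype_object("Ridge", Ridge)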
28 changes: 28 additions & 0 deletions sklearn/utils/estimator_checks.py
@@ -23,6 +23,7 @@
from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import check_skip_travis
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_raise_message

from sklearn.base import clone, ClassifierMixin
from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score
@@ -149,6 +150,33 @@ def check_estimator_sparse_data(name, Estimator):
raise


def check_dtype_object(name, Estimator):
Review comment (Member):
I'd appreciate a comment here just to give a sense of what this requires of an estimator: Numeric features must be accepted when dtype=object for example.

Reply (Member Author):
I'm not sure what you mean by "numeric features must be accepted"; that is something that all tests require, right?
I would formulate it as "object dtype should be handled as numeric".

    # check that estimators treat dtype object as numeric if possible
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10).astype(object)
    y = (X[:, 0] * 4).astype(np.int)
    y = multioutput_estimator_convert_y_2d(name, y)
    with warnings.catch_warnings():
        estimator = Estimator()
        set_fast_parameters(estimator)

    estimator.fit(X, y)
    if hasattr(estimator, "predict"):
        estimator.predict(X)

    if hasattr(estimator, "transform"):
        estimator.transform(X)

    try:
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):
            raise

    X[0, 0] = {'foo': 'bar'}
    assert_raise_message(TypeError, "string or a number", estimator.fit, X, y)
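The final assertion leans on numpy's own error: casting an object array that holds a dict cannot succeed, and the TypeError message has historically contained "string or a number". A standalone illustration:

import numpy as np

X = np.array([[{'foo': 'bar'}]], dtype=object)
try:
    X.astype(np.float64)
except TypeError as e:
    print(e)  # e.g. float() argument must be a string or a number, not 'dict'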


def check_transformer(name, Transformer):
X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
random_state=0, n_features=2, cluster_std=0.1)
35 changes: 27 additions & 8 deletions sklearn/utils/validation.py
@@ -227,12 +227,14 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, order, copy,
return spmatrix


- def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
+ def check_array(array, accept_sparse=None, dtype="numeric", order=None, copy=False,
force_all_finite=True, ensure_2d=True, allow_nd=False,
ensure_min_samples=1, ensure_min_features=1):
"""Input validation on an array, list, sparse matrix or similar.

By default, the input is converted to an at least 2d numpy array.
+ If the dtype of the array is object, attempt converting to float,
+ raising on failure.

Parameters
----------
@@ -245,8 +247,9 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.

- dtype : string, type or None (default=none)
+ dtype : string, type or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
+ If "numeric", dtype is preserved unless array.dtype is object.

order : 'F', 'C' or None (default=None)
Whether an array will be forced to be fortran or c-style.
@@ -283,11 +286,19 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
accept_sparse = [accept_sparse]

if sp.issparse(array):
if dtype == "numeric":
dtype = None
array = _ensure_sparse_format(array, accept_sparse, dtype, order,
copy, force_all_finite)
else:
if ensure_2d:
array = np.atleast_2d(array)
if dtype == "numeric":
if hasattr(array, "dtype") and array.dtype.kind == "O":
# if input is object, convert to float.
dtype = np.float64
else:
dtype = None
array = np.array(array, dtype=dtype, order=order, copy=copy)
if not allow_nd and array.ndim >= 3:
raise ValueError("Found array with dim %d. Expected <= 2" %
@@ -311,15 +322,17 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
return array


- def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
+ def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None, copy=False,
force_all_finite=True, ensure_2d=True, allow_nd=False,
multi_output=False, ensure_min_samples=1,
- ensure_min_features=1):
+ ensure_min_features=1, y_numeric=False):
"""Input validation for standard estimators.

Checks X and y for consistent length, enforces X 2d and y 1d.
Standard input checks are only applied to y. For multi-label y,
set multi_output=True to allow 2d and sparse y.
+ If the dtype of X is object, attempt converting to float,
+ raising on failure.

Parameters
----------
@@ -335,8 +348,9 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.

- dtype : string, type or None (default=none)
+ dtype : string, type or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
Review comment (Member):
I would be more specific and say "Data type of the checked input data X" instead of "Data type of result".

If "numeric", dtype is preserved unless array.dtype is object.

order : 'F', 'C' or None (default=None)
Whether an array will be forced to be fortran or c-style.
@@ -367,6 +381,9 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when ``ensure_2d`` is True and
``allow_nd`` is False.
+ y_numeric : boolean (default=False)
+     Whether to ensure that y has a numeric type. If dtype of y is object,
+     it is converted to float64. Should only be used for regression algorithms.

Returns
-------
@@ -377,10 +394,12 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
ensure_2d, allow_nd, ensure_min_samples,
ensure_min_features)
if multi_output:
- y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False)
+ y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, dtype=None)
else:
y = column_or_1d(y, warn=True)
_assert_all_finite(y)
+ if y_numeric and y.dtype.kind == 'O':
+     y = y.astype(np.float64)

check_consistent_length(X, y)
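Putting the two pieces together, a sketch of check_X_y with y_numeric=True (assuming this version of validation.py):

import numpy as np
from sklearn.utils import check_X_y

X = np.arange(6, dtype=object).reshape(3, 2)  # object-dtype features
y = np.array([0.1, 0.2, 0.3], dtype=object)   # object-dtype targets
X, y = check_X_y(X, y, y_numeric=True)
print(X.dtype, y.dtype)  # float64 float64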

@@ -520,7 +539,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True,
def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
"""Perform is_fitted validation for estimator.

Checks if the estimator is fitted by verifying the presence of
"all_or_any" of the passed attributes and raises a NotFittedError with the
given message.
