diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index acf2087d4f611..3483b173dcb16 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -36,6 +36,15 @@ Changelog threaded when `n_jobs > 1` or `n_jobs = -1`. :issue:`13005` by :user:`Prabakaran Kumaresshan <nixphix>`. +:mod:`sklearn.impute` +..................... + +- |Fix| Add support for non-numeric data in + :class:`sklearn.impute.MissingIndicator`, which previously rejected such + data even though :class:`sklearn.impute.SimpleImputer` already supported + it for some imputation strategies. + :issue:`13046` by :user:`Guillaume Lemaitre <glemaitre>`. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/impute.py b/sklearn/impute.py index 349af27eeb91e..e6a9278b47800 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -533,6 +533,23 @@ def _get_missing_features_info(self, X): return imputer_mask, features_with_missing + def _validate_input(self, X): + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("MissingIndicator does not support data with " + "dtype {0}. Please provide either a numeric array" + " (with a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + return X + def fit(self, X, y=None): """Fit the transformer on X. @@ -547,14 +564,7 @@ def fit(self, X, y=None): self : object Returns self. 
""" - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - + X = self._validate_input(X) self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): @@ -588,14 +598,7 @@ def transform(self, X): """ check_is_fitted(self, "features_") - - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) + X = self._validate_input(X) if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 7131ac3ed0f5f..2f6a4aa4ec6fe 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -13,6 +13,7 @@ from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline +from sklearn.pipeline import make_union from sklearn.model_selection import GridSearchCV from sklearn import tree from sklearn.random_projection import sparse_random_matrix @@ -509,7 +510,10 @@ def test_imputation_copy(): "'features' has to be either 'missing-only' or 'all'"), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'all', 'sparse': 'random'}, - "'sparse' has to be a boolean or 'auto'")] + "'sparse' has to be a boolean or 'auto'"), + (np.array([['a', 'b'], ['c', 'a']], dtype=str), + np.array([['a', 'b'], ['c', 'a']], dtype=str), + {}, "MissingIndicator does not support data with dtype")] ) def test_missing_indicator_error(X_fit, X_trans, params, msg_err): indicator = MissingIndicator(missing_values=-1) @@ -614,6 +618,37 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, assert 
isinstance(X_trans_mask, np.ndarray) +def test_missing_indicator_string(): + X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object) + indicator = MissingIndicator(missing_values='a', features='all') + X_trans = indicator.fit_transform(X) + assert_array_equal(X_trans, np.array([[True, False, False], + [False, False, True]])) + + +@pytest.mark.parametrize( + "X, missing_values, X_trans_exp", + [(np.array([['a', 'b'], ['b', 'a']], dtype=object), 'a', + np.array([['b', 'b', True, False], ['b', 'b', False, True]], + dtype=object)), + (np.array([[np.nan, 1.], [1., np.nan]]), np.nan, + np.array([[1., 1., True, False], [1., 1., False, True]])), + (np.array([[np.nan, 'b'], ['b', np.nan]], dtype=object), np.nan, + np.array([['b', 'b', True, False], ['b', 'b', False, True]], + dtype=object)), + (np.array([[None, 'b'], ['b', None]], dtype=object), None, + np.array([['b', 'b', True, False], ['b', 'b', False, True]], + dtype=object))] +) +def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp): + trans = make_union( + SimpleImputer(missing_values=missing_values, strategy='most_frequent'), + MissingIndicator(missing_values=missing_values) + ) + X_trans = trans.fit_transform(X) + assert_array_equal(X_trans, X_trans_exp) + + @pytest.mark.parametrize("imputer_constructor", [SimpleImputer]) @pytest.mark.parametrize( diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 69850ecc5f796..77c557685aa13 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -74,10 +74,10 @@ 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression', 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] - ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer'] +SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator'] def _yield_non_meta_checks(name, estimator): @@ 
-625,9 +625,16 @@ def check_dtype_object(name, estimator_orig): if "Unknown label type" not in str(e): raise - X[0, 0] = {'foo': 'bar'} - msg = "argument must be a string or a number" - assert_raises_regex(TypeError, msg, estimator.fit, X, y) + if name not in SUPPORT_STRING: + X[0, 0] = {'foo': 'bar'} + msg = "argument must be a string or a number" + assert_raises_regex(TypeError, msg, estimator.fit, X, y) + else: + # Estimators supporting string will not call np.asarray to convert the + # data to numeric and therefore, the error will not be raised. + # Checking for each element dtype in the input array will be costly. + # Refer to #11401 for full discussion. + estimator.fit(X, y) def check_complex_data(name, estimator_orig):