From fce7ee1b34cbfd4eba975a25a2d7843790dca137 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Mar 2019 16:20:23 +0100 Subject: [PATCH 01/10] option to drop full missing columns (dense) --- sklearn/impute.py | 21 +++++++++++++++------ sklearn/tests/test_impute.py | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index ea4e8663d0313..8a14a4564c35e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1063,6 +1063,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. + - If "not-constant", the imputer mask will only represent features + containing missing values, but not containing only missing values, + during fit time. - If "all", the imputer mask will represent all features. sparse : boolean or "auto", optional @@ -1158,7 +1161,12 @@ def _get_missing_features_info(self, X): imputer_mask = imputer_mask.tocsc() else: imputer_mask = _get_mask(X, self.missing_values) - features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0)) + n_missing = imputer_mask.sum(axis=0) + features_with_missing = np.flatnonzero(n_missing) + + if self.features == 'not-constant': + features_with_missing = np.intersect1d( + features_with_missing, np.flatnonzero(n_missing - X.shape[0])) if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) @@ -1207,9 +1215,10 @@ def fit(self, X, y=None): X = self._validate_input(X) self._n_features = X.shape[1] - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) + if self.features not in ('missing-only', 'all', 'not-constant'): + raise ValueError("'features' has to be either 'missing-only', " + "'not-constant' or 'all'. Got {} instead." 
+ .format(self.features)) if not ((isinstance(self.sparse, str) and self.sparse == "auto") or isinstance(self.sparse, bool)): @@ -1217,7 +1226,7 @@ def fit(self, X, y=None): "Got {!r} instead.".format(self.sparse)) self.features_ = (self._get_missing_features_info(X)[1] - if self.features == 'missing-only' + if self.features in ('missing-only', 'not-constant') else np.arange(self._n_features)) return self @@ -1246,7 +1255,7 @@ def transform(self, X): imputer_mask, features = self._get_missing_features_info(X) - if self.features == "missing-only": + if self.features in ("missing-only", "not-constant"): features_diff_fit_trans = np.setdiff1d(features, self.features_) if (self.error_on_new and features_diff_fit_trans.size > 0): raise ValueError("The features {} have missing values " diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 59bbb091afbea..ba744356a3178 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1119,3 +1119,20 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) + + +@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix], + ids=["dense", "sparse"]) +def test_missing_indicator_drop_full_missing(array_constr): + # Check that missing indicator with features="not-constant" drops columns + # with no missing values as well as columns full of missing values. 
+ X = array_constr([[0, np.nan, 0], + [0, np.nan, np.nan]]) + + expected_Xt = array_constr([[False], + [True]]) + + mi = MissingIndicator(features="not-constant") + Xt = mi.fit_transform(X) + + assert_allclose(Xt, expected_Xt) From cb35f6ca2dfe996104b520eeb690f6a6b1a8449c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Mar 2019 22:26:20 +0100 Subject: [PATCH 02/10] deal with sparse --- sklearn/impute.py | 47 ++++++++++++++++++------------------ sklearn/tests/test_impute.py | 9 ++++--- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 8a14a4564c35e..fdc25147094cb 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1057,16 +1057,15 @@ class MissingIndicator(BaseEstimator, TransformerMixin): `missing_values` will be indicated (True in the output array), the other values will be marked as False. - features : str, optional + features : {"missing-only", "all", "not-constant"}, optional Whether the imputer mask should represent all or a subset of features. - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. - - If "not-constant", the imputer mask will only represent features - containing missing values, but not containing only missing values, - during fit time. - If "all", the imputer mask will represent all features. + - If "not-constant", the imputer mask will represent features + containing missing values but not containing only missing values. sparse : boolean or "auto", optional Whether the imputer mask format should be sparse or dense. 
@@ -1147,13 +1146,14 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) + imputer_mask.eliminate_zeros() - missing_values_mask = imputer_mask.copy() - missing_values_mask.eliminate_zeros() - features_with_missing = ( - np.flatnonzero(np.diff(missing_values_mask.indptr)) - if missing_values_mask.format == 'csc' - else np.unique(missing_values_mask.indices)) + if self.features in ('missing-only', 'not-constant'): + if imputer_mask.format == 'csc': + n_missing = np.diff(imputer_mask.indptr) + else: + n_missing = np.bincount(imputer_mask.indices, + minlength=X.shape[1]) if self.sparse is False: imputer_mask = imputer_mask.toarray() @@ -1161,17 +1161,22 @@ def _get_missing_features_info(self, X): imputer_mask = imputer_mask.tocsc() else: imputer_mask = _get_mask(X, self.missing_values) - n_missing = imputer_mask.sum(axis=0) - features_with_missing = np.flatnonzero(n_missing) - if self.features == 'not-constant': - features_with_missing = np.intersect1d( - features_with_missing, np.flatnonzero(n_missing - X.shape[0])) + if self.features in ('missing-only', 'not-constant'): + n_missing = imputer_mask.sum(axis=0) if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) - return imputer_mask, features_with_missing + if self.features == 'all': + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + if self.features == 'not-constant': + features_indices = np.intersect1d( + features_indices, np.flatnonzero(n_missing - X.shape[0])) + + return imputer_mask, features_indices def _validate_input(self, X): if not is_scalar_nan(self.missing_values): @@ -1217,7 +1222,7 @@ def fit(self, X, y=None): if self.features not in ('missing-only', 'all', 'not-constant'): raise ValueError("'features' has to be either 'missing-only', " - "'not-constant' or 'all'. Got {} instead." + "'all' or 'not-constant'. Got {} instead." 
.format(self.features)) if not ((isinstance(self.sparse, str) and @@ -1225,9 +1230,7 @@ def fit(self, X, y=None): raise ValueError("'sparse' has to be a boolean or 'auto'. " "Got {!r} instead.".format(self.sparse)) - self.features_ = (self._get_missing_features_info(X)[1] - if self.features in ('missing-only', 'not-constant') - else np.arange(self._n_features)) + self.features_ = self._get_missing_features_info(X)[1] return self @@ -1262,9 +1265,7 @@ def transform(self, X): "in transform but have no missing values " "in fit.".format(features_diff_fit_trans)) - if (self.features_.size > 0 and - self.features_.size < self._n_features): - imputer_mask = imputer_mask[:, self.features_] + imputer_mask = imputer_mask[:, self.features_] return imputer_mask diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index ba744356a3178..4db6fac28d704 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -919,7 +919,7 @@ def test_iterative_imputer_early_stopping(): 'have missing values in transform but have no missing values in fit'), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'random', 'sparse': 'auto'}, - "'features' has to be either 'missing-only' or 'all'"), + "'features' has to be either 'missing-only', 'all' or 'not-constant'"), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'all', 'sparse': 'random'}, "'sparse' has to be a boolean or 'auto'"), @@ -1121,8 +1121,9 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, imputer.fit_transform(X) -@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix], - ids=["dense", "sparse"]) +@pytest.mark.parametrize("array_constr", + [np.array, sparse.csr_matrix, sparse.csc_matrix], + ids=["dense", "sparse_csr", "sparse_csc"]) def test_missing_indicator_drop_full_missing(array_constr): # Check that missing indicator with features="not-constant" drops columns # with no missing values as well as columns full of 
missing values. @@ -1135,4 +1136,4 @@ def test_missing_indicator_drop_full_missing(array_constr): mi = MissingIndicator(features="not-constant") Xt = mi.fit_transform(X) - assert_allclose(Xt, expected_Xt) + assert_allclose_dense_sparse(Xt, expected_Xt) From 90490f048c67e92a728b0109ef2fc3fc5e4090bb Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Mar 2019 22:40:38 +0100 Subject: [PATCH 03/10] what' new --- doc/whats_new/v0.21.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index db4aa1b3250a3..f6d037faf4695 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -210,6 +210,11 @@ Support for Python 3.4 and below has been officially dropped. raising an exception if input is sparse add `missing_values` property is set to 0. :issue:`13240` by :user:`Bartosz Telenczuk `. +- |ENH| Add another possible value for the `features` parameter of + :class:`MissingIndicator` to drop features with no missing values as well as + features with only missing values. :issue:`13491` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.isotonic` ....................... 
From 5090cce6499125d40eeec43c638f51c90ba850e3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 11:53:29 +0100 Subject: [PATCH 04/10] tst --- sklearn/impute.py | 6 +++--- sklearn/tests/test_impute.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index fdc25147094cb..cc93b0bcb8f20 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1146,7 +1146,7 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) - imputer_mask.eliminate_zeros() + # imputer_mask.eliminate_zeros() if self.features in ('missing-only', 'not-constant'): if imputer_mask.format == 'csc': @@ -1155,7 +1155,7 @@ def _get_missing_features_info(self, X): n_missing = np.bincount(imputer_mask.indices, minlength=X.shape[1]) - if self.sparse is False: + if not self.sparse: imputer_mask = imputer_mask.toarray() elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() @@ -1165,7 +1165,7 @@ def _get_missing_features_info(self, X): if self.features in ('missing-only', 'not-constant'): n_missing = imputer_mask.sum(axis=0) - if self.sparse is True: + if self.sparse: imputer_mask = sparse.csc_matrix(imputer_mask) if self.features == 'all': diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 4db6fac28d704..b39c6de3a72c8 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1137,3 +1137,19 @@ def test_missing_indicator_drop_full_missing(array_constr): Xt = mi.fit_transform(X) assert_allclose_dense_sparse(Xt, expected_Xt) + + +def test_missing_indicator_sparse_no_explicit_zeros(): + # Check that non missing values don't become explicit zeeros in the mask + # generated by missing indicator when X is sparse. 
+ X = sparse.csr_matrix([[0, 1, 2], + [1, 2, 0], + [2, 0, 1]]) + expected_nnz = 3 # 3 missing values + + mi = MissingIndicator(features='all', missing_values=1) + Xt = mi.fit_transform(X) + + nnz = Xt.getnnz() + + assert nnz == expected_nnz From a8d95e5b2c66b873d96cf8b694a4ee186ee52ec7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 13:02:28 +0100 Subject: [PATCH 05/10] add test for explicit zeros --- sklearn/impute.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index cc93b0bcb8f20..13afd0f3d0930 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1146,7 +1146,7 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) - # imputer_mask.eliminate_zeros() + imputer_mask.eliminate_zeros() if self.features in ('missing-only', 'not-constant'): if imputer_mask.format == 'csc': @@ -1155,7 +1155,7 @@ def _get_missing_features_info(self, X): n_missing = np.bincount(imputer_mask.indices, minlength=X.shape[1]) - if not self.sparse: + if self.sparse is False: imputer_mask = imputer_mask.toarray() elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() @@ -1165,7 +1165,7 @@ def _get_missing_features_info(self, X): if self.features in ('missing-only', 'not-constant'): n_missing = imputer_mask.sum(axis=0) - if self.sparse: + if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) if self.features == 'all': @@ -1265,7 +1265,8 @@ def transform(self, X): "in transform but have no missing values " "in fit.".format(features_diff_fit_trans)) - imputer_mask = imputer_mask[:, self.features_] + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] return imputer_mask From b0f4fdd742cc8866e6b89bcbb0ab1f489481a288 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 15:16:34 +0100 Subject: [PATCH 06/10] 
what's new --- doc/whats_new/v0.21.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index f6d037faf4695..66e6fe495f97b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -215,6 +215,10 @@ Support for Python 3.4 and below has been officially dropped. features with only missing values. :issue:`13491` by :user:`Jérémie du Boisberranger `. +- |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the + non-zero missing values used to become explicit False is the transformed data. + :issue:`13491` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.isotonic` ....................... From fba8ff452510cd26e6efca8638a39259a0f7ed36 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 27 Mar 2019 10:35:12 +0100 Subject: [PATCH 07/10] simpler code. fix typos. wording. --- doc/whats_new/v0.21.rst | 4 ++-- sklearn/impute.py | 30 +++++++++++++----------------- sklearn/tests/test_impute.py | 11 ++++------- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 66e6fe495f97b..b5df94f10594b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -216,8 +216,8 @@ Support for Python 3.4 and below has been officially dropped. :user:`Jérémie du Boisberranger `. - |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the - non-zero missing values used to become explicit False is the transformed data. - :issue:`13491` by :user:`Jérémie du Boisberranger `. + non-zero non missing values used to become explicit False in the transformed + data. :issue:`13491` by :user:`Jérémie du Boisberranger `. :mod:`sklearn.isotonic` ....................... 
diff --git a/sklearn/impute.py b/sklearn/impute.py index 13afd0f3d0930..13f752a14628e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1057,14 +1057,14 @@ class MissingIndicator(BaseEstimator, TransformerMixin): `missing_values` will be indicated (True in the output array), the other values will be marked as False. - features : {"missing-only", "all", "not-constant"}, optional + features : {"missing-only", "all", "some-missing"}, optional Whether the imputer mask should represent all or a subset of features. - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. - If "all", the imputer mask will represent all features. - - If "not-constant", the imputer mask will represent features + - If "some-missing", the imputer mask will represent features containing missing values but not containing only missing values. sparse : boolean or "auto", optional @@ -1148,12 +1148,8 @@ def _get_missing_features_info(self, X): shape=X.shape, dtype=bool) imputer_mask.eliminate_zeros() - if self.features in ('missing-only', 'not-constant'): - if imputer_mask.format == 'csc': - n_missing = np.diff(imputer_mask.indptr) - else: - n_missing = np.bincount(imputer_mask.indices, - minlength=X.shape[1]) + if self.features in ('missing-only', 'some-missing'): + n_missing = imputer_mask.getnnz(axis=0) if self.sparse is False: imputer_mask = imputer_mask.toarray() @@ -1162,7 +1158,7 @@ def _get_missing_features_info(self, X): else: imputer_mask = _get_mask(X, self.missing_values) - if self.features in ('missing-only', 'not-constant'): + if self.features in ('missing-only', 'some-missing'): n_missing = imputer_mask.sum(axis=0) if self.sparse is True: @@ -1170,11 +1166,11 @@ def _get_missing_features_info(self, X): if self.features == 'all': features_indices = np.arange(X.shape[1]) - else: + elif self.features == 'missing-only': features_indices = np.flatnonzero(n_missing) - if self.features == 'not-constant': - 
features_indices = np.intersect1d( - features_indices, np.flatnonzero(n_missing - X.shape[0])) + else: + features_indices = np.flatnonzero( + np.logical_and(n_missing < X.shape[0], n_missing > 0)) return imputer_mask, features_indices @@ -1220,9 +1216,9 @@ def fit(self, X, y=None): X = self._validate_input(X) self._n_features = X.shape[1] - if self.features not in ('missing-only', 'all', 'not-constant'): - raise ValueError("'features' has to be either 'missing-only', " - "'all' or 'not-constant'. Got {} instead." + if self.features not in ('missing-only', 'all', 'some-missing'): + raise ValueError("'features' has to be one of 'missing-only', " + "'all' or 'some-missing'. Got {} instead." .format(self.features)) if not ((isinstance(self.sparse, str) and @@ -1258,7 +1254,7 @@ def transform(self, X): imputer_mask, features = self._get_missing_features_info(X) - if self.features in ("missing-only", "not-constant"): + if self.features in ("missing-only", "some-missing"): features_diff_fit_trans = np.setdiff1d(features, self.features_) if (self.error_on_new and features_diff_fit_trans.size > 0): raise ValueError("The features {} have missing values " diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index b39c6de3a72c8..4f86b8a0079f0 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -919,7 +919,7 @@ def test_iterative_imputer_early_stopping(): 'have missing values in transform but have no missing values in fit'), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'random', 'sparse': 'auto'}, - "'features' has to be either 'missing-only', 'all' or 'not-constant'"), + "'features' has to be one of 'missing-only', 'all' or 'some-missing'"), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'all', 'sparse': 'random'}, "'sparse' has to be a boolean or 'auto'"), @@ -1125,7 +1125,7 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, [np.array, sparse.csr_matrix, 
sparse.csc_matrix], ids=["dense", "sparse_csr", "sparse_csc"]) def test_missing_indicator_drop_full_missing(array_constr): - # Check that missing indicator with features="not-constant" drops columns + # Check that missing indicator with features="some-missing" drops columns # with no missing values as well as columns full of missing values. X = array_constr([[0, np.nan, 0], [0, np.nan, np.nan]]) @@ -1133,7 +1133,7 @@ def test_missing_indicator_drop_full_missing(array_constr): expected_Xt = array_constr([[False], [True]]) - mi = MissingIndicator(features="not-constant") + mi = MissingIndicator(features="some-missing") Xt = mi.fit_transform(X) assert_allclose_dense_sparse(Xt, expected_Xt) @@ -1145,11 +1145,8 @@ def test_missing_indicator_sparse_no_explicit_zeros(): X = sparse.csr_matrix([[0, 1, 2], [1, 2, 0], [2, 0, 1]]) - expected_nnz = 3 # 3 missing values mi = MissingIndicator(features='all', missing_values=1) Xt = mi.fit_transform(X) - nnz = Xt.getnnz() - - assert nnz == expected_nnz + assert Xt.getnnz() == Xt.sum() From d57db9dfb42d5fc594f8a142ededa5d849173af5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 1 Apr 2019 10:51:59 +0200 Subject: [PATCH 08/10] update docstring + typo --- sklearn/impute.py | 2 +- sklearn/tests/test_impute.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 13f752a14628e..52005ac66aa5d 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1078,7 +1078,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): error_on_new : boolean, optional If True (default), transform will raise an error when there are features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. + in fit. This is applicable only when ``features!="all"``. 
Attributes ---------- diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 4f86b8a0079f0..ff59d407139db 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1140,7 +1140,7 @@ def test_missing_indicator_drop_full_missing(array_constr): def test_missing_indicator_sparse_no_explicit_zeros(): - # Check that non missing values don't become explicit zeeros in the mask + # Check that non missing values don't become explicit zeros in the mask # generated by missing indicator when X is sparse. X = sparse.csr_matrix([[0, 1, 2], [1, 2, 0], From f65055ea8c0d75f53ca08665fadf9a7e91ef5f52 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 1 Apr 2019 15:25:31 +0200 Subject: [PATCH 09/10] update error_on_new --- sklearn/impute.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 52005ac66aa5d..44fd9abb89610 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1076,9 +1076,12 @@ class MissingIndicator(BaseEstimator, TransformerMixin): - If False, the imputer mask will be a numpy array. error_on_new : boolean, optional - If True (default), transform will raise an error when there are + If True (default), transform will raise an error when there are either features with missing values in transform that have no missing values - in fit. This is applicable only when ``features!="all"``. + in fit (only applicable if + ``features in ("missing-only", "some-missing")``), or features with non + missing values in transform that have only missing values in fit + (only applicable if ``features="some-missing"``). 
Attributes ---------- @@ -1257,9 +1260,11 @@ def transform(self, X): if self.features in ("missing-only", "some-missing"): features_diff_fit_trans = np.setdiff1d(features, self.features_) if (self.error_on_new and features_diff_fit_trans.size > 0): - raise ValueError("The features {} have missing values " + raise ValueError("The features {} either have missing values " "in transform but have no missing values " - "in fit.".format(features_diff_fit_trans)) + "in fit, or have non missing values in " + "transform but have only missing values in " + "fit".format(features_diff_fit_trans)) if self.features_.size < self._n_features: imputer_mask = imputer_mask[:, self.features_] From db10a82d973e49c89db672f7f5ccbd0abe07ca42 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 1 Apr 2019 15:28:12 +0200 Subject: [PATCH 10/10] add test --- sklearn/tests/test_impute.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index ff59d407139db..d0e8ffb2e7c33 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -917,6 +917,9 @@ def test_iterative_imputer_early_stopping(): [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), {'features': 'missing-only', 'sparse': 'auto'}, 'have missing values in transform but have no missing values in fit'), + (np.array([[-1, 1], [-1, -1]]), np.array([[-1, 1], [1, 1]]), + {'features': 'some-missing', 'sparse': 'auto'}, + 'have missing values in transform but have no missing values in fit'), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'random', 'sparse': 'auto'}, "'features' has to be one of 'missing-only', 'all' or 'some-missing'"),