From fce7ee1b34cbfd4eba975a25a2d7843790dca137 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Mar 2019 16:20:23 +0100 Subject: [PATCH 1/7] option to drop full missing columns (dense) --- sklearn/impute.py | 21 +++++++++++++++------ sklearn/tests/test_impute.py | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index ea4e8663d0313..8a14a4564c35e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1063,6 +1063,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. + - If "not-constant", the imputer mask will only represent features + containing missing values, but not containing only missing values, + during fit time. - If "all", the imputer mask will represent all features. sparse : boolean or "auto", optional @@ -1158,7 +1161,12 @@ def _get_missing_features_info(self, X): imputer_mask = imputer_mask.tocsc() else: imputer_mask = _get_mask(X, self.missing_values) - features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0)) + n_missing = imputer_mask.sum(axis=0) + features_with_missing = np.flatnonzero(n_missing) + + if self.features == 'not-constant': + features_with_missing = np.intersect1d( + features_with_missing, np.flatnonzero(n_missing - X.shape[0])) if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) @@ -1207,9 +1215,10 @@ def fit(self, X, y=None): X = self._validate_input(X) self._n_features = X.shape[1] - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) + if self.features not in ('missing-only', 'all', 'not-constant'): + raise ValueError("'features' has to be either 'missing-only', " + "'not-constant' or 'all'. Got {} instead." 
+ .format(self.features)) if not ((isinstance(self.sparse, str) and self.sparse == "auto") or isinstance(self.sparse, bool)): @@ -1217,7 +1226,7 @@ def fit(self, X, y=None): "Got {!r} instead.".format(self.sparse)) self.features_ = (self._get_missing_features_info(X)[1] - if self.features == 'missing-only' + if self.features in ('missing-only', 'not-constant') else np.arange(self._n_features)) return self @@ -1246,7 +1255,7 @@ def transform(self, X): imputer_mask, features = self._get_missing_features_info(X) - if self.features == "missing-only": + if self.features in ("missing-only", "not-constant"): features_diff_fit_trans = np.setdiff1d(features, self.features_) if (self.error_on_new and features_diff_fit_trans.size > 0): raise ValueError("The features {} have missing values " diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 59bbb091afbea..ba744356a3178 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1119,3 +1119,20 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) + + +@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix], + ids=["dense", "sparse"]) +def test_missing_indicator_drop_full_missing(array_constr): + # Check that missing indicator with features="not-constant" drops columns + # with no missing values as well as columns full of missing values. 
+ X = array_constr([[0, np.nan, 0], + [0, np.nan, np.nan]]) + + expected_Xt = array_constr([[False], + [True]]) + + mi = MissingIndicator(features="not-constant") + Xt = mi.fit_transform(X) + + assert_allclose(Xt, expected_Xt) From cb35f6ca2dfe996104b520eeb690f6a6b1a8449c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Mar 2019 22:26:20 +0100 Subject: [PATCH 2/7] deal with sparse --- sklearn/impute.py | 47 ++++++++++++++++++------------------ sklearn/tests/test_impute.py | 9 ++++--- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 8a14a4564c35e..fdc25147094cb 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1057,16 +1057,15 @@ class MissingIndicator(BaseEstimator, TransformerMixin): `missing_values` will be indicated (True in the output array), the other values will be marked as False. - features : str, optional + features : {"missing-only", "all", "not-constant"}, optional Whether the imputer mask should represent all or a subset of features. - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. - - If "not-constant", the imputer mask will only represent features - containing missing values, but not containing only missing values, - during fit time. - If "all", the imputer mask will represent all features. + - If "not-constant", the imputer mask will represent features + containing missing values but not containing only missing values. sparse : boolean or "auto", optional Whether the imputer mask format should be sparse or dense. 
@@ -1147,13 +1146,14 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) + imputer_mask.eliminate_zeros() - missing_values_mask = imputer_mask.copy() - missing_values_mask.eliminate_zeros() - features_with_missing = ( - np.flatnonzero(np.diff(missing_values_mask.indptr)) - if missing_values_mask.format == 'csc' - else np.unique(missing_values_mask.indices)) + if self.features in ('missing-only', 'not-constant'): + if imputer_mask.format == 'csc': + n_missing = np.diff(imputer_mask.indptr) + else: + n_missing = np.bincount(imputer_mask.indices, + minlength=X.shape[1]) if self.sparse is False: imputer_mask = imputer_mask.toarray() @@ -1161,17 +1161,22 @@ def _get_missing_features_info(self, X): imputer_mask = imputer_mask.tocsc() else: imputer_mask = _get_mask(X, self.missing_values) - n_missing = imputer_mask.sum(axis=0) - features_with_missing = np.flatnonzero(n_missing) - if self.features == 'not-constant': - features_with_missing = np.intersect1d( - features_with_missing, np.flatnonzero(n_missing - X.shape[0])) + if self.features in ('missing-only', 'not-constant'): + n_missing = imputer_mask.sum(axis=0) if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) - return imputer_mask, features_with_missing + if self.features == 'all': + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + if self.features == 'not-constant': + features_indices = np.intersect1d( + features_indices, np.flatnonzero(n_missing - X.shape[0])) + + return imputer_mask, features_indices def _validate_input(self, X): if not is_scalar_nan(self.missing_values): @@ -1217,7 +1222,7 @@ def fit(self, X, y=None): if self.features not in ('missing-only', 'all', 'not-constant'): raise ValueError("'features' has to be either 'missing-only', " - "'not-constant' or 'all'. Got {} instead." + "'all' or 'not-constant'. Got {} instead." 
.format(self.features)) if not ((isinstance(self.sparse, str) and @@ -1225,9 +1230,7 @@ def fit(self, X, y=None): raise ValueError("'sparse' has to be a boolean or 'auto'. " "Got {!r} instead.".format(self.sparse)) - self.features_ = (self._get_missing_features_info(X)[1] - if self.features in ('missing-only', 'not-constant') - else np.arange(self._n_features)) + self.features_ = self._get_missing_features_info(X)[1] return self @@ -1262,9 +1265,7 @@ def transform(self, X): "in transform but have no missing values " "in fit.".format(features_diff_fit_trans)) - if (self.features_.size > 0 and - self.features_.size < self._n_features): - imputer_mask = imputer_mask[:, self.features_] + imputer_mask = imputer_mask[:, self.features_] return imputer_mask diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index ba744356a3178..4db6fac28d704 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -919,7 +919,7 @@ def test_iterative_imputer_early_stopping(): 'have missing values in transform but have no missing values in fit'), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'random', 'sparse': 'auto'}, - "'features' has to be either 'missing-only' or 'all'"), + "'features' has to be either 'missing-only', 'all' or 'not-constant'"), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'all', 'sparse': 'random'}, "'sparse' has to be a boolean or 'auto'"), @@ -1121,8 +1121,9 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, imputer.fit_transform(X) -@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix], - ids=["dense", "sparse"]) +@pytest.mark.parametrize("array_constr", + [np.array, sparse.csr_matrix, sparse.csc_matrix], + ids=["dense", "sparse_csr", "sparse_csc"]) def test_missing_indicator_drop_full_missing(array_constr): # Check that missing indicator with features="not-constant" drops columns # with no missing values as well as columns full of 
missing values. @@ -1135,4 +1136,4 @@ def test_missing_indicator_drop_full_missing(array_constr): mi = MissingIndicator(features="not-constant") Xt = mi.fit_transform(X) - assert_allclose(Xt, expected_Xt) + assert_allclose_dense_sparse(Xt, expected_Xt) From 90490f048c67e92a728b0109ef2fc3fc5e4090bb Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Mar 2019 22:40:38 +0100 Subject: [PATCH 3/7] what' new --- doc/whats_new/v0.21.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index db4aa1b3250a3..f6d037faf4695 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -210,6 +210,11 @@ Support for Python 3.4 and below has been officially dropped. raising an exception if input is sparse add `missing_values` property is set to 0. :issue:`13240` by :user:`Bartosz Telenczuk `. +- |ENH| Add another possible value for the `features` parameter of + :class:`MissingIndicator` to drop features with no missing values as well as + features with only missing values. :issue:`13491` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.isotonic` ....................... 
From 5090cce6499125d40eeec43c638f51c90ba850e3 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 11:53:29 +0100 Subject: [PATCH 4/7] tst --- sklearn/impute.py | 6 +++--- sklearn/tests/test_impute.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index fdc25147094cb..cc93b0bcb8f20 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1146,7 +1146,7 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) - imputer_mask.eliminate_zeros() + # imputer_mask.eliminate_zeros() if self.features in ('missing-only', 'not-constant'): if imputer_mask.format == 'csc': @@ -1155,7 +1155,7 @@ def _get_missing_features_info(self, X): n_missing = np.bincount(imputer_mask.indices, minlength=X.shape[1]) - if self.sparse is False: + if not self.sparse: imputer_mask = imputer_mask.toarray() elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() @@ -1165,7 +1165,7 @@ def _get_missing_features_info(self, X): if self.features in ('missing-only', 'not-constant'): n_missing = imputer_mask.sum(axis=0) - if self.sparse is True: + if self.sparse: imputer_mask = sparse.csc_matrix(imputer_mask) if self.features == 'all': diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 4db6fac28d704..b39c6de3a72c8 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1137,3 +1137,19 @@ def test_missing_indicator_drop_full_missing(array_constr): Xt = mi.fit_transform(X) assert_allclose_dense_sparse(Xt, expected_Xt) + + +def test_missing_indicator_sparse_no_explicit_zeros(): + # Check that non missing values don't become explicit zeros in the mask + # generated by missing indicator when X is sparse. 
+ X = sparse.csr_matrix([[0, 1, 2], + [1, 2, 0], + [2, 0, 1]]) + expected_nnz = 3 # 3 missing values + + mi = MissingIndicator(features='all', missing_values=1) + Xt = mi.fit_transform(X) + + nnz = Xt.getnnz() + + assert nnz == expected_nnz From a8d95e5b2c66b873d96cf8b694a4ee186ee52ec7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 13:02:28 +0100 Subject: [PATCH 5/7] add test for explicit zeros --- sklearn/impute.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index cc93b0bcb8f20..13afd0f3d0930 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1146,7 +1146,7 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) - # imputer_mask.eliminate_zeros() + imputer_mask.eliminate_zeros() if self.features in ('missing-only', 'not-constant'): if imputer_mask.format == 'csc': @@ -1155,7 +1155,7 @@ def _get_missing_features_info(self, X): n_missing = np.bincount(imputer_mask.indices, minlength=X.shape[1]) - if not self.sparse: + if self.sparse is False: imputer_mask = imputer_mask.toarray() elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() @@ -1165,7 +1165,7 @@ def _get_missing_features_info(self, X): if self.features in ('missing-only', 'not-constant'): n_missing = imputer_mask.sum(axis=0) - if self.sparse: + if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) if self.features == 'all': @@ -1265,7 +1265,8 @@ def transform(self, X): "in transform but have no missing values " "in fit.".format(features_diff_fit_trans)) - imputer_mask = imputer_mask[:, self.features_] + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] return imputer_mask From b0f4fdd742cc8866e6b89bcbb0ab1f489481a288 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 15:16:34 +0100 Subject: [PATCH 6/7] 
what's new --- doc/whats_new/v0.21.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index f6d037faf4695..66e6fe495f97b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -215,6 +215,10 @@ Support for Python 3.4 and below has been officially dropped. features with only missing values. :issue:`13491` by :user:`Jérémie du Boisberranger `. +- |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the + non-zero non-missing values used to become explicit False in the transformed data. + :issue:`13491` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.isotonic` ....................... From c58c342cda0536c350e2605526a2bf4d1bce5ba6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 25 Mar 2019 18:03:52 +0100 Subject: [PATCH 7/7] support precomputed mask --- doc/whats_new/v0.21.rst | 5 +++++ sklearn/impute.py | 27 ++++++++++++++++++++++++--- sklearn/tests/test_impute.py | 26 ++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 66e6fe495f97b..d123eb37611d6 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -219,6 +219,11 @@ Support for Python 3.4 and below has been officially dropped. non-zero non-missing values used to become explicit False in the transformed data. :issue:`13491` by :user:`Jérémie du Boisberranger `. +- |ENH| Add another possible value (`MissingIndicator.precomputed`) for the + `missing_values` parameter of the :class:`MissingIndicator` which allows + passing ``X`` as a precomputed mask of the missing values. :issue:`` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.isotonic` ....................... 
diff --git a/sklearn/impute.py b/sklearn/impute.py index 13afd0f3d0930..227a569d9dd22 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1052,10 +1052,14 @@ class MissingIndicator(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : number, string, np.nan (default) or None + missing_values : number, string, np.nan, None or \ +MissingIndicator.precomputed (default=np.nan) The placeholder for the missing values. All occurrences of `missing_values` will be indicated (True in the output array), the other values will be marked as False. + If MissingIndicator.precomputed, then ``X`` must be a boolean array or + sparse matrix and will be interpreted as a precomputed mask of the + missing values. features : {"missing-only", "all", "not-constant"}, optional Whether the imputer mask should represent all or a subset of @@ -1108,6 +1112,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): [False, False]]) """ + precomputed = object() def __init__(self, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): @@ -1137,8 +1142,17 @@ def _get_missing_features_info(self, X): """ if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) + if self.missing_values is self.precomputed: + mask = X.data + else: + mask = _get_mask(X.data, self.missing_values) + else: + if self.missing_values is self.precomputed: + mask = X + else: + mask = _get_mask(X, self.missing_values) + if sparse.issparse(X): # The imputer mask will be constructed with the same sparse format # as X. 
sparse_constructor = (sparse.csr_matrix if X.format == 'csr' @@ -1160,7 +1174,7 @@ def _get_missing_features_info(self, X): elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() else: - imputer_mask = _get_mask(X, self.missing_values) + imputer_mask = mask if self.features in ('missing-only', 'not-constant'): n_missing = imputer_mask.sum(axis=0) @@ -1179,6 +1193,13 @@ def _get_missing_features_info(self, X): return imputer_mask, features_indices def _validate_input(self, X): + if self.missing_values is self.precomputed: + if X.dtype.kind != "b": + raise ValueError("Expected a boolean mask.") + + return check_array(X, accept_sparse=('csc'), dtype=np.bool, + force_all_finite=True) + if not is_scalar_nan(self.missing_values): force_all_finite = True else: diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index b39c6de3a72c8..03654163bdaa4 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1153,3 +1153,29 @@ def test_missing_indicator_sparse_no_explicit_zeros(): nnz = Xt.getnnz() assert nnz == expected_nnz + + +@pytest.mark.parametrize("features, expected_Xt", + [("all", [[False, True, False], + [False, True, True]]), + ("missing-only", [[True, False], + [True, True]]), + ("not-constant", [[False], + [True]])]) +@pytest.mark.parametrize("array_constr", [np.array, sparse.csc_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("sparse_output", [True, False]) +def test_missing_indicator_precomputed_mask(features, expected_Xt, + array_constr, sparse_output): + # Test missing indicator with precomputed mask. + X_mask = array_constr([[False, True, False], + [False, True, True]]) + + mi = MissingIndicator(features=features, sparse=sparse_output, + missing_values=MissingIndicator.precomputed) + Xt = mi.fit_transform(X_mask) + + if sparse.issparse(Xt): + Xt = Xt.toarray() + + assert_array_equal(Xt, expected_Xt)