diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index db4aa1b3250a3..d123eb37611d6 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -210,6 +210,20 @@ Support for Python 3.4 and below has been officially dropped. raising an exception if input is sparse add `missing_values` property is set to 0. :issue:`13240` by :user:`Bartosz Telenczuk `. +- |ENH| Add another possible value for the `features` parameter of + :class:`MissingIndicator` to drop features with no missing values as well as + features with only missing values. :issue:`13491` by + :user:`Jérémie du Boisberranger `. + +- |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the + non-zero non-missing values used to become explicit False in the transformed + data. :issue:`13491` by :user:`Jérémie du Boisberranger `. + +- |ENH| Add another possible value (`MissingIndicator.precomputed`) for the + `missing_values` parameter of the :class:`MissingIndicator` which allows + passing ``X`` as a precomputed mask of the missing values. :issue:`` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.isotonic` ....................... diff --git a/sklearn/impute.py b/sklearn/impute.py index ea4e8663d0313..227a569d9dd22 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -1052,18 +1052,24 @@ class MissingIndicator(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : number, string, np.nan (default) or None + missing_values : number, string, np.nan, None or \ +MissingIndicator.precomputed (default=np.nan) The placeholder for the missing values. All occurrences of `missing_values` will be indicated (True in the output array), the other values will be marked as False. + If MissingIndicator.precomputed, then ``X`` must be a boolean array or + sparse matrix and will be interpreted as a precomputed mask of the + missing values. 
- features : str, optional + features : {"missing-only", "all", "not-constant"}, optional Whether the imputer mask should represent all or a subset of features. - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. - If "all", the imputer mask will represent all features. + - If "not-constant", the imputer mask will represent features + containing missing values but not containing only missing values. sparse : boolean or "auto", optional Whether the imputer mask format should be sparse or dense. @@ -1106,6 +1112,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin): [False, False]]) """ + precomputed = object() def __init__(self, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): @@ -1135,8 +1142,17 @@ def _get_missing_features_info(self, X): """ if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) + if self.missing_values is self.precomputed: + mask = X.data + else: + mask = _get_mask(X.data, self.missing_values) + else: + if self.missing_values is self.precomputed: + mask = X + else: + mask = _get_mask(X, self.missing_values) + if sparse.issparse(X): # The imputer mask will be constructed with the same sparse format # as X. 
sparse_constructor = (sparse.csr_matrix if X.format == 'csr' @@ -1144,28 +1160,46 @@ def _get_missing_features_info(self, X): imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) + imputer_mask.eliminate_zeros() - missing_values_mask = imputer_mask.copy() - missing_values_mask.eliminate_zeros() - features_with_missing = ( - np.flatnonzero(np.diff(missing_values_mask.indptr)) - if missing_values_mask.format == 'csc' - else np.unique(missing_values_mask.indices)) + if self.features in ('missing-only', 'not-constant'): + if imputer_mask.format == 'csc': + n_missing = np.diff(imputer_mask.indptr) + else: + n_missing = np.bincount(imputer_mask.indices, + minlength=X.shape[1]) if self.sparse is False: imputer_mask = imputer_mask.toarray() elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() else: - imputer_mask = _get_mask(X, self.missing_values) - features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0)) + imputer_mask = mask + + if self.features in ('missing-only', 'not-constant'): + n_missing = imputer_mask.sum(axis=0) if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) - return imputer_mask, features_with_missing + if self.features == 'all': + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + if self.features == 'not-constant': + features_indices = np.intersect1d( + features_indices, np.flatnonzero(n_missing - X.shape[0])) + + return imputer_mask, features_indices def _validate_input(self, X): + if self.missing_values is self.precomputed: + if X.dtype.kind != "b": + raise ValueError("Expected a boolean mask.") + + return check_array(X, accept_sparse=('csc'), dtype=np.bool, + force_all_finite=True) + if not is_scalar_nan(self.missing_values): force_all_finite = True else: @@ -1207,18 +1241,17 @@ def fit(self, X, y=None): X = self._validate_input(X) self._n_features = X.shape[1] - if self.features not in ('missing-only', 
'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) + if self.features not in ('missing-only', 'all', 'not-constant'): + raise ValueError("'features' has to be either 'missing-only', " + "'all' or 'not-constant'. Got {} instead." + .format(self.features)) if not ((isinstance(self.sparse, str) and self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("'sparse' has to be a boolean or 'auto'. " "Got {!r} instead.".format(self.sparse)) - self.features_ = (self._get_missing_features_info(X)[1] - if self.features == 'missing-only' - else np.arange(self._n_features)) + self.features_ = self._get_missing_features_info(X)[1] return self @@ -1246,15 +1279,14 @@ def transform(self, X): imputer_mask, features = self._get_missing_features_info(X) - if self.features == "missing-only": + if self.features in ("missing-only", "not-constant"): features_diff_fit_trans = np.setdiff1d(features, self.features_) if (self.error_on_new and features_diff_fit_trans.size > 0): raise ValueError("The features {} have missing values " "in transform but have no missing values " "in fit.".format(features_diff_fit_trans)) - if (self.features_.size > 0 and - self.features_.size < self._n_features): + if self.features_.size < self._n_features: imputer_mask = imputer_mask[:, self.features_] return imputer_mask diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 59bbb091afbea..03654163bdaa4 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -919,7 +919,7 @@ def test_iterative_imputer_early_stopping(): 'have missing values in transform but have no missing values in fit'), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]), {'features': 'random', 'sparse': 'auto'}, - "'features' has to be either 'missing-only' or 'all'"), + "'features' has to be either 'missing-only', 'all' or 'not-constant'"), (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 
2]]), {'features': 'all', 'sparse': 'random'}, "'sparse' has to be a boolean or 'auto'"), @@ -1119,3 +1119,63 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor, with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(X) + + +@pytest.mark.parametrize("array_constr", + [np.array, sparse.csr_matrix, sparse.csc_matrix], + ids=["dense", "sparse_csr", "sparse_csc"]) +def test_missing_indicator_drop_full_missing(array_constr): + # Check that missing indicator with features="not-constant" drops columns + # with no missing values as well as columns full of missing values. + X = array_constr([[0, np.nan, 0], + [0, np.nan, np.nan]]) + + expected_Xt = array_constr([[False], + [True]]) + + mi = MissingIndicator(features="not-constant") + Xt = mi.fit_transform(X) + + assert_allclose_dense_sparse(Xt, expected_Xt) + + +def test_missing_indicator_sparse_no_explicit_zeros(): + # Check that non missing values don't become explicit zeros in the mask + # generated by missing indicator when X is sparse. + X = sparse.csr_matrix([[0, 1, 2], + [1, 2, 0], + [2, 0, 1]]) + expected_nnz = 3 # 3 missing values + + mi = MissingIndicator(features='all', missing_values=1) + Xt = mi.fit_transform(X) + + nnz = Xt.getnnz() + + assert nnz == expected_nnz + + +@pytest.mark.parametrize("features, expected_Xt", + [("all", [[False, True, False], + [False, True, True]]), + ("missing-only", [[True, False], + [True, True]]), + ("not-constant", [[False], + [True]])]) +@pytest.mark.parametrize("array_constr", [np.array, sparse.csc_matrix], + ids=["dense", "sparse"]) +@pytest.mark.parametrize("sparse_output", [True, False]) +def test_missing_indicator_precomputed_mask(features, expected_Xt, + array_constr, sparse_output): + # Test missing indicator with precomputed mask. 
+ X_mask = array_constr([[False, True, False], + [False, True, True]]) + + mi = MissingIndicator(features=features, sparse=sparse_output, + missing_values=MissingIndicator.precomputed) + Xt = mi.fit_transform(X_mask) + + if sparse.issparse(Xt): + Xt = Xt.toarray() + + assert_array_equal(Xt, expected_Xt)