Skip to content

[MRG] ENH add an option to drop full missing features in MissingIndicator #13491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

9 changes: 9 additions & 0 deletions doc/whats_new/v0.21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,15 @@ Support for Python 3.4 and below has been officially dropped.
raising an exception if input is sparse and `missing_values` property
is set to 0. :issue:`13240` by :user:`Bartosz Telenczuk <btel>`.

- |ENH| Add another possible value, ``"some-missing"``, for the `features`
parameter of :class:`MissingIndicator` to drop features with no missing
values as well as features with only missing values. :issue:`13491` by
:user:`Jérémie du Boisberranger <jeremiedbb>`.

- |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the
non-zero non-missing values used to become explicit False in the transformed
data. :issue:`13491` by :user:`Jérémie du Boisberranger <jeremiedbb>`.

:mod:`sklearn.isotonic`
.......................

Expand Down
56 changes: 34 additions & 22 deletions sklearn/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,13 +1057,15 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
`missing_values` will be indicated (True in the output array), the
other values will be marked as False.

features : str, optional
features : {"missing-only", "all", "some-missing"}, optional
Whether the imputer mask should represent all or a subset of
features.

- If "missing-only" (default), the imputer mask will only represent
features containing missing values during fit time.
- If "all", the imputer mask will represent all features.
- If "some-missing", the imputer mask will represent features
containing missing values but not containing only missing values.

sparse : boolean or "auto", optional
Whether the imputer mask format should be sparse or dense.
Expand All @@ -1074,9 +1076,12 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
- If False, the imputer mask will be a numpy array.

error_on_new : boolean, optional
If True (default), transform will raise an error when there are
If True (default), transform will raise an error when there are either
features with missing values in transform that have no missing values
in fit. This is applicable only when ``features="missing-only"``.
in fit (only applicable if
``features in ("missing-only", "some-missing")``), or features with non
missing values in transform that have only missing values in fit
(only applicable if ``features="some-missing"``).

Attributes
----------
Expand Down Expand Up @@ -1144,26 +1149,33 @@ def _get_missing_features_info(self, X):
imputer_mask = sparse_constructor(
(mask, X.indices.copy(), X.indptr.copy()),
shape=X.shape, dtype=bool)
imputer_mask.eliminate_zeros()

missing_values_mask = imputer_mask.copy()
missing_values_mask.eliminate_zeros()
features_with_missing = (
np.flatnonzero(np.diff(missing_values_mask.indptr))
if missing_values_mask.format == 'csc'
else np.unique(missing_values_mask.indices))
if self.features in ('missing-only', 'some-missing'):
n_missing = imputer_mask.getnnz(axis=0)

if self.sparse is False:
imputer_mask = imputer_mask.toarray()
elif imputer_mask.format == 'csr':
imputer_mask = imputer_mask.tocsc()
else:
imputer_mask = _get_mask(X, self.missing_values)
features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0))

if self.features in ('missing-only', 'some-missing'):
n_missing = imputer_mask.sum(axis=0)

if self.sparse is True:
imputer_mask = sparse.csc_matrix(imputer_mask)

return imputer_mask, features_with_missing
if self.features == 'all':
features_indices = np.arange(X.shape[1])
elif self.features == 'missing-only':
features_indices = np.flatnonzero(n_missing)
else:
features_indices = np.flatnonzero(
np.logical_and(n_missing < X.shape[0], n_missing > 0))

return imputer_mask, features_indices

def _validate_input(self, X):
if not is_scalar_nan(self.missing_values):
Expand Down Expand Up @@ -1207,18 +1219,17 @@ def fit(self, X, y=None):
X = self._validate_input(X)
self._n_features = X.shape[1]

if self.features not in ('missing-only', 'all'):
raise ValueError("'features' has to be either 'missing-only' or "
"'all'. Got {} instead.".format(self.features))
if self.features not in ('missing-only', 'all', 'some-missing'):
raise ValueError("'features' has to be one of 'missing-only', "
"'all' or 'some-missing'. Got {} instead."
.format(self.features))

if not ((isinstance(self.sparse, str) and
self.sparse == "auto") or isinstance(self.sparse, bool)):
raise ValueError("'sparse' has to be a boolean or 'auto'. "
"Got {!r} instead.".format(self.sparse))

self.features_ = (self._get_missing_features_info(X)[1]
if self.features == 'missing-only'
else np.arange(self._n_features))
self.features_ = self._get_missing_features_info(X)[1]

return self

Expand Down Expand Up @@ -1246,15 +1257,16 @@ def transform(self, X):

imputer_mask, features = self._get_missing_features_info(X)

if self.features == "missing-only":
if self.features in ("missing-only", "some-missing"):
features_diff_fit_trans = np.setdiff1d(features, self.features_)
if (self.error_on_new and features_diff_fit_trans.size > 0):
raise ValueError("The features {} have missing values "
raise ValueError("The features {} either have missing values "
"in transform but have no missing values "
"in fit.".format(features_diff_fit_trans))
"in fit, or have non missing values in "
"transform but have only missing values in "
"fit".format(features_diff_fit_trans))

if (self.features_.size > 0 and
self.features_.size < self._n_features):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed the first condition. If we want only the features with missing values and there aren't any, then the mask should be empty. Before, it would return the mask of all features.

if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]

return imputer_mask
Expand Down
36 changes: 35 additions & 1 deletion sklearn/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,9 +917,12 @@ def test_iterative_imputer_early_stopping():
[(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]),
{'features': 'missing-only', 'sparse': 'auto'},
'have missing values in transform but have no missing values in fit'),
(np.array([[-1, 1], [-1, -1]]), np.array([[-1, 1], [1, 1]]),
{'features': 'some-missing', 'sparse': 'auto'},
'have missing values in transform but have no missing values in fit'),
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
{'features': 'random', 'sparse': 'auto'},
"'features' has to be either 'missing-only' or 'all'"),
"'features' has to be one of 'missing-only', 'all' or 'some-missing'"),
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
{'features': 'all', 'sparse': 'random'},
"'sparse' has to be a boolean or 'auto'"),
Expand Down Expand Up @@ -1119,3 +1122,34 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor,

with pytest.raises(ValueError, match=err_msg):
imputer.fit_transform(X)


@pytest.mark.parametrize(
    "array_constr",
    [np.array, sparse.csr_matrix, sparse.csc_matrix],
    ids=["dense", "sparse_csr", "sparse_csc"])
def test_missing_indicator_drop_full_missing(array_constr):
    # With features="some-missing", the indicator must discard both the
    # columns that contain no missing value and the columns that contain
    # nothing but missing values.
    #
    # Column 0 has no NaN, column 1 is entirely NaN, column 2 has a single
    # NaN in its second row: only column 2 is expected to be kept.
    data = array_constr([[0, np.nan, 0],
                         [0, np.nan, np.nan]])
    expected_mask = array_constr([[False],
                                  [True]])

    indicator = MissingIndicator(features="some-missing")
    observed_mask = indicator.fit_transform(data)

    assert_allclose_dense_sparse(observed_mask, expected_mask)


def test_missing_indicator_sparse_no_explicit_zeros():
    # For sparse input, values that are not missing must not end up stored
    # as explicit zeros in the mask produced by the missing indicator.
    sparse_input = sparse.csr_matrix([[0, 1, 2],
                                      [1, 2, 0],
                                      [2, 0, 1]])

    indicator = MissingIndicator(features='all', missing_values=1)
    mask = indicator.fit_transform(sparse_input)

    # If every stored entry is True, the number of stored entries equals
    # the sum of the mask; any explicit False would break the equality.
    n_stored = mask.getnnz()
    n_true = mask.sum()
    assert n_stored == n_true