Skip to content

[WIP] ENH Allow precomputed mask as input for MissingIndicator #13514

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions doc/whats_new/v0.21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,20 @@ Support for Python 3.4 and below has been officially dropped.
raising an exception if input is sparse and `missing_values` property
is set to 0. :issue:`13240` by :user:`Bartosz Telenczuk <btel>`.

- |ENH| Add another possible value for the `features` parameter of
:class:`MissingIndicator` to drop features with no missing values as well as
features with only missing values. :issue:`13491` by
:user:`Jérémie du Boisberranger <jeremiedbb>`.

- |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the
non-zero missing values used to become explicit False in the transformed data.
:issue:`13491` by :user:`Jérémie du Boisberranger <jeremiedbb>`.

- |ENH| Add another possible value (`MissingIndicator.precomputed`) for the
`missing_values` parameter of the :class:`MissingIndicator` which allows
passing ``X`` as a precomputed mask of the missing values. :issue:`13514` by
:user:`Jérémie du Boisberranger <jeremiedbb>`.

:mod:`sklearn.isotonic`
.......................

Expand Down
74 changes: 53 additions & 21 deletions sklearn/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1052,18 +1052,24 @@ class MissingIndicator(BaseEstimator, TransformerMixin):

Parameters
----------
missing_values : number, string, np.nan (default) or None
missing_values : number, string, np.nan, None or \
MissingIndicator.precomputed (default=np.nan)
The placeholder for the missing values. All occurrences of
`missing_values` will be indicated (True in the output array), the
other values will be marked as False.
If MissingIndicator.precomputed, then ``X`` must be a boolean array or
sparse matrix and will be interpreted as a precomputed mask of the
missing values.

features : str, optional
features : {"missing-only", "all", "not-constant"}, optional
Whether the imputer mask should represent all or a subset of
features.

- If "missing-only" (default), the imputer mask will only represent
features containing missing values during fit time.
- If "all", the imputer mask will represent all features.
- If "not-constant", the imputer mask will represent features
containing missing values but not containing only missing values.

sparse : boolean or "auto", optional
Whether the imputer mask format should be sparse or dense.
Expand Down Expand Up @@ -1106,6 +1112,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
[False, False]])

"""
precomputed = object()

def __init__(self, missing_values=np.nan, features="missing-only",
sparse="auto", error_on_new=True):
Expand Down Expand Up @@ -1135,37 +1142,64 @@ def _get_missing_features_info(self, X):

"""
if sparse.issparse(X):
mask = _get_mask(X.data, self.missing_values)
if self.missing_values is self.precomputed:
mask = X.data
else:
mask = _get_mask(X.data, self.missing_values)
else:
if self.missing_values is self.precomputed:
mask = X
else:
mask = _get_mask(X, self.missing_values)

if sparse.issparse(X):
# The imputer mask will be constructed with the same sparse format
# as X.
sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
else sparse.csc_matrix)
imputer_mask = sparse_constructor(
(mask, X.indices.copy(), X.indptr.copy()),
shape=X.shape, dtype=bool)
imputer_mask.eliminate_zeros()

missing_values_mask = imputer_mask.copy()
missing_values_mask.eliminate_zeros()
features_with_missing = (
np.flatnonzero(np.diff(missing_values_mask.indptr))
if missing_values_mask.format == 'csc'
else np.unique(missing_values_mask.indices))
if self.features in ('missing-only', 'not-constant'):
if imputer_mask.format == 'csc':
n_missing = np.diff(imputer_mask.indptr)
else:
n_missing = np.bincount(imputer_mask.indices,
minlength=X.shape[1])

if self.sparse is False:
imputer_mask = imputer_mask.toarray()
elif imputer_mask.format == 'csr':
imputer_mask = imputer_mask.tocsc()
else:
imputer_mask = _get_mask(X, self.missing_values)
features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0))
imputer_mask = mask

if self.features in ('missing-only', 'not-constant'):
n_missing = imputer_mask.sum(axis=0)

if self.sparse is True:
imputer_mask = sparse.csc_matrix(imputer_mask)

return imputer_mask, features_with_missing
if self.features == 'all':
features_indices = np.arange(X.shape[1])
else:
features_indices = np.flatnonzero(n_missing)
if self.features == 'not-constant':
features_indices = np.intersect1d(
features_indices, np.flatnonzero(n_missing - X.shape[0]))

return imputer_mask, features_indices

def _validate_input(self, X):
if self.missing_values is self.precomputed:
if X.dtype.kind != "b":
raise ValueError("Expected a boolean mask.")

return check_array(X, accept_sparse=('csc'), dtype=np.bool,
force_all_finite=True)

if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
Expand Down Expand Up @@ -1207,18 +1241,17 @@ def fit(self, X, y=None):
X = self._validate_input(X)
self._n_features = X.shape[1]

if self.features not in ('missing-only', 'all'):
raise ValueError("'features' has to be either 'missing-only' or "
"'all'. Got {} instead.".format(self.features))
if self.features not in ('missing-only', 'all', 'not-constant'):
raise ValueError("'features' has to be either 'missing-only', "
"'all' or 'not-constant'. Got {} instead."
.format(self.features))

if not ((isinstance(self.sparse, str) and
self.sparse == "auto") or isinstance(self.sparse, bool)):
raise ValueError("'sparse' has to be a boolean or 'auto'. "
"Got {!r} instead.".format(self.sparse))

self.features_ = (self._get_missing_features_info(X)[1]
if self.features == 'missing-only'
else np.arange(self._n_features))
self.features_ = self._get_missing_features_info(X)[1]

return self

Expand Down Expand Up @@ -1246,15 +1279,14 @@ def transform(self, X):

imputer_mask, features = self._get_missing_features_info(X)

if self.features == "missing-only":
if self.features in ("missing-only", "not-constant"):
features_diff_fit_trans = np.setdiff1d(features, self.features_)
if (self.error_on_new and features_diff_fit_trans.size > 0):
raise ValueError("The features {} have missing values "
"in transform but have no missing values "
"in fit.".format(features_diff_fit_trans))

if (self.features_.size > 0 and
self.features_.size < self._n_features):
if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]

return imputer_mask
Expand Down
62 changes: 61 additions & 1 deletion sklearn/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,7 @@ def test_iterative_imputer_early_stopping():
'have missing values in transform but have no missing values in fit'),
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
{'features': 'random', 'sparse': 'auto'},
"'features' has to be either 'missing-only' or 'all'"),
"'features' has to be either 'missing-only', 'all' or 'not-constant'"),
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
{'features': 'all', 'sparse': 'random'},
"'sparse' has to be a boolean or 'auto'"),
Expand Down Expand Up @@ -1119,3 +1119,63 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor,

with pytest.raises(ValueError, match=err_msg):
imputer.fit_transform(X)


@pytest.mark.parametrize(
    "array_constr",
    [np.array, sparse.csr_matrix, sparse.csc_matrix],
    ids=["dense", "sparse_csr", "sparse_csc"],
)
def test_missing_indicator_drop_full_missing(array_constr):
    # With features="not-constant", the indicator must discard both the
    # columns that have no missing value and the columns that are entirely
    # missing.
    #
    # Column 0 has no NaN, column 1 is all NaN, and column 2 has a single NaN
    # in its second row, so only column 2 is expected in the output.
    X = array_constr([[0, np.nan, 0],
                      [0, np.nan, np.nan]])

    indicator = MissingIndicator(features="not-constant")
    Xt = indicator.fit_transform(X)

    # The expected mask is built with the same container type as the input.
    expected_Xt = array_constr([[False],
                                [True]])
    assert_allclose_dense_sparse(Xt, expected_Xt)


def test_missing_indicator_sparse_no_explicit_zeros():
    # When X is sparse, entries that are not missing must not be stored as
    # explicit zeros in the mask returned by the missing indicator.
    X = sparse.csr_matrix([[0, 1, 2],
                           [1, 2, 0],
                           [2, 0, 1]])

    indicator = MissingIndicator(features='all', missing_values=1)
    Xt = indicator.fit_transform(X)

    # The placeholder value 1 occurs exactly three times in X, so the mask
    # should store exactly three entries.
    expected_nnz = 3
    assert Xt.getnnz() == expected_nnz


@pytest.mark.parametrize(
    "features, expected_Xt",
    [
        ("all", [[False, True, False],
                 [False, True, True]]),
        ("missing-only", [[True, False],
                          [True, True]]),
        ("not-constant", [[False],
                          [True]]),
    ],
)
@pytest.mark.parametrize(
    "array_constr",
    [np.array, sparse.csc_matrix],
    ids=["dense", "sparse"],
)
@pytest.mark.parametrize("sparse_output", [True, False])
def test_missing_indicator_precomputed_mask(features, expected_Xt,
                                            array_constr, sparse_output):
    # The indicator is handed a boolean mask directly (dense or CSC) through
    # missing_values=MissingIndicator.precomputed, and the output is checked
    # for every `features` mode and for both sparse and dense output.
    X_mask = array_constr([[False, True, False],
                           [False, True, True]])

    indicator = MissingIndicator(
        features=features,
        sparse=sparse_output,
        missing_values=MissingIndicator.precomputed,
    )
    Xt = indicator.fit_transform(X_mask)

    # Densify sparse output so a single array comparison covers both cases.
    Xt_dense = Xt.toarray() if sparse.issparse(Xt) else Xt

    assert_array_equal(Xt_dense, expected_Xt)