Skip to content

DEP Adding a warning in the SimpleImputer when strategy mode is constant and keep_empty_features is False #29950

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
4 changes: 4 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.impute/29950.api.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- Add a warning in :class:`impute.SimpleImputer` when `keep_empty_features=False` and
`strategy="constant"`. In this case empty features are not dropped and this behaviour
will change in 1.8.
By :user:`Arthur Courselle <ArthurCourselle>` and :user:`Simon Riou <simon-riou>`
29 changes: 29 additions & 0 deletions sklearn/impute/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,11 @@ class SimpleImputer(_BaseImputer):

.. versionadded:: 1.2

.. versionchanged:: 1.6
Currently, when `keep_empty_features=False` and `strategy="constant"`,
empty features are not dropped. This behaviour will change in version
1.8. Set `keep_empty_features=True` to preserve this behaviour.

Attributes
----------
statistics_ : array of shape (n_features,)
Expand Down Expand Up @@ -458,6 +463,19 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value):
statistics = np.empty(X.shape[1])

if strategy == "constant":
# TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic
# for empty features to drop them later.
if not self.keep_empty_features and any(
[all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])]
):
warnings.warn(
"Currently, when `keep_empty_feature=False` and "
'`strategy="constant"`, empty features are not dropped. '
"This behaviour will change in version 1.8. Set "
"`keep_empty_feature=True` to preserve this behaviour.",
FutureWarning,
)

# for constant strategy, self.statistics_ is used to store
# fill_value in each column
statistics.fill(fill_value)
Expand Down Expand Up @@ -548,6 +566,17 @@ def _dense_fit(self, X, strategy, missing_values, fill_value):

# Constant
elif strategy == "constant":
# TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic
# for empty features to drop them later.
if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any():
warnings.warn(
"Currently, when `keep_empty_feature=False` and "
'`strategy="constant"`, empty features are not dropped. '
"This behaviour will change in version 1.8. Set "
"`keep_empty_feature=True` to preserve this behaviour.",
FutureWarning,
)

# for constant strategy, self.statistics_ is used to store
# fill_value in each column
return np.full(X.shape[1], fill_value, dtype=X.dtype)
Expand Down
29 changes: 26 additions & 3 deletions sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,16 +636,38 @@ def _initial_imputation(self, X, in_fit=False):

X_missing_mask = _get_mask(X, self.missing_values)
mask_missing_values = X_missing_mask.copy()

# TODO (1.8): remove this once the deprecation is removed. In the meantime,
# we need to catch the warning to avoid false positives.
catch_warning = (
self.initial_strategy == "constant" and not self.keep_empty_features
)

if self.initial_imputer_ is None:
self.initial_imputer_ = SimpleImputer(
missing_values=self.missing_values,
strategy=self.initial_strategy,
fill_value=self.fill_value,
keep_empty_features=self.keep_empty_features,
).set_output(transform="default")
X_filled = self.initial_imputer_.fit_transform(X)

# TODO (1.8): remove this once the deprecation is removed to keep only
# the code in the else case.
if catch_warning:
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
X_filled = self.initial_imputer_.fit_transform(X)
else:
X_filled = self.initial_imputer_.fit_transform(X)
else:
X_filled = self.initial_imputer_.transform(X)
# TODO (1.8): remove this once the deprecation is removed to keep only
# the code in the else case.
if catch_warning:
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
X_filled = self.initial_imputer_.transform(X)
else:
X_filled = self.initial_imputer_.transform(X)

if in_fit:
self._is_empty_feature = np.all(mask_missing_values, axis=0)
Expand All @@ -659,7 +681,8 @@ def _initial_imputation(self, X, in_fit=False):
# The constant strategy has a specific behavior and preserves empty
# features even with ``keep_empty_features=False``. We need to drop
# the column for consistency.
# TODO: remove this `if` branch once the following issue is addressed:
# TODO (1.8): remove this `if` branch once the following issue is
# addressed:
# https://github.com/scikit-learn/scikit-learn/issues/29827
X_filled = X_filled[:, ~self._is_empty_feature]

Expand Down
52 changes: 47 additions & 5 deletions sklearn/impute/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,18 +410,24 @@ def test_imputation_constant_error_invalid_type(X_data, missing_value):
imputer.fit_transform(X)


# TODO (1.8): check that `keep_empty_features=False` drops the
# empty features due to the behaviour change.
def test_imputation_constant_integer():
    """Check constant-strategy imputation on an integer array.

    ``-1`` marks missing entries and every one of them must be replaced by
    ``fill_value=0``. The last column of ``X`` contains only missing values.
    """
    X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])

    X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]])

    # `keep_empty_features=True` is passed explicitly: the expected output
    # still contains the all-missing last column, filled with `fill_value`.
    imputer = SimpleImputer(
        missing_values=-1, strategy="constant", fill_value=0, keep_empty_features=True
    )
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)


# TODO (1.8): check that `keep_empty_features=False` drops the
# empty features due to the behaviour change.
@pytest.mark.parametrize("array_constructor", CSR_CONTAINERS + [np.asarray])
def test_imputation_constant_float(array_constructor):
# Test imputation using the constant strategy on floats
Expand All @@ -442,12 +448,16 @@ def test_imputation_constant_float(array_constructor):

X_true = array_constructor(X_true)

imputer = SimpleImputer(strategy="constant", fill_value=-1)
imputer = SimpleImputer(
strategy="constant", fill_value=-1, keep_empty_features=True
)
X_trans = imputer.fit_transform(X)

assert_allclose_dense_sparse(X_trans, X_true)


# TODO (1.8): check that `keep_empty_features=False` drops the
# empty features due to the behaviour change.
@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0])
def test_imputation_constant_object(marker):
# Test imputation using the constant strategy on objects
Expand All @@ -472,13 +482,18 @@ def test_imputation_constant_object(marker):
)

imputer = SimpleImputer(
missing_values=marker, strategy="constant", fill_value="missing"
missing_values=marker,
strategy="constant",
fill_value="missing",
keep_empty_features=True,
)
X_trans = imputer.fit_transform(X)

assert_array_equal(X_trans, X_true)


# TODO (1.8): check that `keep_empty_features=False` drops the
# empty features due to the behaviour change.
@pytest.mark.parametrize("dtype", [object, "category"])
def test_imputation_constant_pandas(dtype):
# Test imputation using the constant strategy on pandas df
Expand All @@ -498,7 +513,7 @@ def test_imputation_constant_pandas(dtype):
dtype=object,
)

imputer = SimpleImputer(strategy="constant")
imputer = SimpleImputer(strategy="constant", keep_empty_features=True)
X_trans = imputer.fit_transform(df)

assert_array_equal(X_trans, X_true)
Expand Down Expand Up @@ -1514,6 +1529,26 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
)


@pytest.mark.parametrize(
    "initial_strategy", ["mean", "median", "most_frequent", "constant"]
)
def test_iterative_imputer_keep_empty_features(initial_strategy):
    """Check `IterativeImputer` with `keep_empty_features=True`.

    For each initial strategy, the feature that contains only missing values
    (the second column of `X`) is expected to be all zeros in the output of
    both `fit_transform` and a subsequent `transform`.
    """
    X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]])
    imputer = IterativeImputer(
        initial_strategy=initial_strategy, keep_empty_features=True
    )

    # `fit_transform` must run first so that `transform` sees a fitted imputer.
    for method_name in ("fit_transform", "transform"):
        X_imputed = getattr(imputer, method_name)(X)
        assert_allclose(X_imputed[:, 1], 0)


# TODO (1.8): check that `keep_empty_features=False` drops the
# empty features due to the behaviour change.
def test_iterative_imputer_constant_fill_value():
"""Check that we propagate properly the parameter `fill_value`."""
X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
Expand All @@ -1524,6 +1559,7 @@ def test_iterative_imputer_constant_fill_value():
initial_strategy="constant",
fill_value=fill_value,
max_iter=0,
keep_empty_features=True,
)
imputer.fit_transform(X)
assert_array_equal(imputer.initial_imputer_.statistics_, fill_value)
Expand Down Expand Up @@ -1722,7 +1758,13 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat
)

for method in ["fit_transform", "transform"]:
X_imputed = getattr(imputer, method)(X)
# TODO(1.8): Remove the condition and still call getattr(imputer, method)(X)
if method.startswith("fit") and not keep_empty_features:
warn_msg = '`strategy="constant"`, empty features are not dropped. '
with pytest.warns(FutureWarning, match=warn_msg):
X_imputed = getattr(imputer, method)(X)
else:
X_imputed = getattr(imputer, method)(X)
assert X_imputed.shape == X.shape
constant_feature = (
X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]
Expand Down