Skip to content

FIX make sure IterativeImputer does not skip iterative process when keep_empty_features=True #29779

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e38593a
Fix Iterative Imputation not Triggered when Keep Empty Features
arifqodari Sep 3, 2024
43d61eb
Merge branch 'main' into fix_iterative_imputer_keep_empty_features
arifqodari Sep 4, 2024
d5eddf6
Added Two More Tests for IterativeImputer
arifqodari Sep 5, 2024
1f49868
Fix Applied Only in the Initial Imputation Step
arifqodari Sep 5, 2024
e01bd1c
Merge branch 'main' into fix_iterative_imputer_keep_empty_features
arifqodari Sep 5, 2024
47800e0
Added Changelog
arifqodari Sep 5, 2024
04c94f2
Remove Trailing Whitespace and Unused Lines
arifqodari Sep 5, 2024
b37cbdc
Added More Tests with Various Initial Strategies
arifqodari Sep 6, 2024
498a346
Fix Error in IterativeImputer When Initial Strategy Constant
arifqodari Sep 6, 2024
33dcfbe
Fix Lint Issues
arifqodari Sep 6, 2024
59ca7ca
Merge branch 'main' into fix_iterative_imputer_keep_empty_features
arifqodari Sep 9, 2024
d0fc4c3
Merge branch 'main' into fix_iterative_imputer_keep_empty_features
arifqodari Sep 9, 2024
05da983
Merge branch 'main' into fix_iterative_imputer_keep_empty_features
arifqodari Sep 9, 2024
4ff968b
Move Changelog to 1.6
arifqodari Sep 10, 2024
8a71a26
Better Variable Naming in Tests
arifqodari Sep 11, 2024
af88e79
Use Simple Handcrafted Matrix Instead of Generated Randomly
arifqodari Sep 11, 2024
6d7e858
Black Formatting
arifqodari Sep 11, 2024
31cf853
Simplify Test Iterative Imputer with Empty Features
arifqodari Sep 12, 2024
3d04df9
Fix Missing Value Masking
arifqodari Sep 12, 2024
0e1f067
Remove Redundant Test
arifqodari Sep 12, 2024
a3bbbda
Add Test Data along with Few More Tests
arifqodari Sep 12, 2024
daf1adc
Update docstring for Test without Empty Feature
arifqodari Sep 12, 2024
291edcf
Update docstring for Test with Empty Feature
arifqodari Sep 12, 2024
004df0c
Parametrize on X_test for test_iterative_imputer_with_empty_features
arifqodari Sep 13, 2024
63048e4
Merge branch 'main' into fix_iterative_imputer_keep_empty_features
arifqodari Sep 13, 2024
a573512
Fix Typo in Docstring
arifqodari Sep 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.6.rst
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,10 @@ Changelog
computing the mean value for uniform weights.
:pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`.

- |Fix| Fixed :class:`impute.IterativeImputer` to make sure that it does not skip
the iterative process when `keep_empty_features` is set to `True`.
:pr:`29779` by :user:`Arif Qodari <arifqodari>`.

:mod:`sklearn.linear_model`
...........................

Expand Down
21 changes: 15 additions & 6 deletions sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,19 +646,28 @@ def _initial_imputation(self, X, in_fit=False):
else:
X_filled = self.initial_imputer_.transform(X)

valid_mask = np.flatnonzero(
np.logical_not(np.isnan(self.initial_imputer_.statistics_))
)
if in_fit:
self._is_empty_feature = np.all(mask_missing_values, axis=0)

if not self.keep_empty_features:
# drop empty features
Xt = X[:, valid_mask]
mask_missing_values = mask_missing_values[:, valid_mask]
Xt = X[:, ~self._is_empty_feature]
mask_missing_values = mask_missing_values[:, ~self._is_empty_feature]

if self.initial_imputer_.get_params()["strategy"] == "constant":
# The constant strategy has a specific behavior and preserves empty
# features even with ``keep_empty_features=False``. We need to drop
# the column for consistency.
# TODO: remove this `if` branch once the following issue is addressed:
# https://github.com/scikit-learn/scikit-learn/issues/29827
X_filled = X_filled[:, ~self._is_empty_feature]

else:
# mark empty features as not missing and keep the original
# imputation
mask_missing_values[:, valid_mask] = True
mask_missing_values[:, self._is_empty_feature] = False
Xt = X
Xt[:, self._is_empty_feature] = X_filled[:, self._is_empty_feature]

return Xt, X_filled, mask_missing_values, X_missing_mask

Expand Down
85 changes: 67 additions & 18 deletions sklearn/impute/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1513,24 +1513,6 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
)


@pytest.mark.parametrize(
    "initial_strategy", ["mean", "median", "most_frequent", "constant"]
)
def test_iterative_imputer_keep_empty_features(initial_strategy):
    """Check that a kept empty feature is imputed with zeros.

    Column 1 of the input contains only missing values. With
    `keep_empty_features=True`, this column is expected to be filled with 0
    by both `fit_transform` and `transform`, for every initial strategy.
    """
    data = np.array([[1, np.nan, 2], [3, np.nan, np.nan]])
    estimator = IterativeImputer(
        keep_empty_features=True, initial_strategy=initial_strategy
    )

    # `fit_transform` must run first so that `transform` sees a fitted imputer.
    for impute in (estimator.fit_transform, estimator.transform):
        assert_allclose(impute(data)[:, 1], 0)


def test_iterative_imputer_constant_fill_value():
"""Check that we propagate properly the parameter `fill_value`."""
X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
Expand Down Expand Up @@ -1786,3 +1768,70 @@ def test_simple_imputer_constant_fill_value_casting():
)
X_trans = imputer.fit_transform(X_float32)
assert X_trans.dtype == X_float32.dtype


@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"])
def test_iterative_imputer_no_empty_features(strategy):
    """Check the behaviour of `keep_empty_features` with no empty features.

    When no feature is entirely missing, the imputed output should be the
    same whatever the value of `keep_empty_features`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/29375
    """
    # Every column has exactly one missing entry, so no column is empty.
    X = np.array([[np.nan, 0, 1], [2, np.nan, 3], [4, 5, np.nan]])

    def make_imputer(keep_empty_features):
        return IterativeImputer(
            initial_strategy=strategy,
            fill_value=1,
            keep_empty_features=keep_empty_features,
        )

    dropping_imputer = make_imputer(False)
    keeping_imputer = make_imputer(True)

    imputed_when_dropping = dropping_imputer.fit_transform(X)
    imputed_when_keeping = keeping_imputer.fit_transform(X)
    assert_allclose(imputed_when_dropping, imputed_when_keeping)


@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"])
@pytest.mark.parametrize(
    "X_test",
    [
        np.array([[1, 2, 3, 4], [5, 6, 7, 8]]),  # without empty feature
        np.array([[np.nan, 2, 3, 4], [np.nan, 6, 7, 8]]),  # empty feature at column 0
        np.array([[1, 2, 3, np.nan], [5, 6, 7, np.nan]]),  # empty feature at column 3
    ],
)
def test_iterative_imputer_with_empty_features(strategy, X_test):
    """Check the behaviour of `keep_empty_features` in the presence of empty features.

    With `keep_empty_features=True`, the empty feature will be imputed with the
    value defined by the initial imputation.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/29375
    """
    # Column 0 of the training data is entirely missing (the empty feature).
    X_train = np.array(
        [[np.nan, np.nan, 0, 1], [np.nan, 2, np.nan, 3], [np.nan, 4, 5, np.nan]]
    )

    # Fit on the training data, then transform the test data, once per
    # setting of `keep_empty_features` (drop first, keep second).
    results = {}
    for keep in (False, True):
        imputer = IterativeImputer(
            initial_strategy=strategy, fill_value=0, keep_empty_features=keep
        )
        results[keep] = (imputer.fit_transform(X_train), imputer.transform(X_test))

    train_dropped, test_dropped = results[False]
    train_kept, test_kept = results[True]

    # Apart from the retained empty column, both settings must agree, and the
    # retained column must hold the expected value 0.
    assert_allclose(train_dropped, train_kept[:, 1:])
    assert_allclose(train_kept[:, 0], 0)

    # `transform` must yield as many columns as `fit_transform` did.
    assert train_dropped.shape[1] == test_dropped.shape[1]
    assert train_kept.shape[1] == test_kept.shape[1]