diff --git a/doc/whats_new/upcoming_changes/sklearn.impute/29950.api.rst b/doc/whats_new/upcoming_changes/sklearn.impute/29950.api.rst new file mode 100644 index 0000000000000..27ac9e06ac320 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.impute/29950.api.rst @@ -0,0 +1,4 @@ +- Add a warning in :class:`impute.SimpleImputer` when `keep_empty_features=False` and + `strategy="constant"`. In this case empty features are not dropped and this behaviour + will change in 1.8. + By :user:`Arthur Courselle ` and :user:`Simon Riou ` \ No newline at end of file diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index aecc235ecd6ac..faf1f9e23b678 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -225,6 +225,11 @@ class SimpleImputer(_BaseImputer): .. versionadded:: 1.2 + .. versionchanged:: 1.6 + Currently, when `keep_empty_features=False` and `strategy="constant"`, + empty features are not dropped. This behaviour will change in version + 1.8. Set `keep_empty_features=True` to preserve this behaviour. + Attributes ---------- statistics_ : array of shape (n_features,) @@ -458,6 +463,19 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): statistics = np.empty(X.shape[1]) if strategy == "constant": + # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic + # for empty features to drop them later. + if not self.keep_empty_features and any( + [all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])] + ): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. 
Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) + # for constant strategy, self.statistics_ is used to store # fill_value in each column statistics.fill(fill_value) @@ -548,6 +566,17 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": + # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic + # for empty features to drop them later. + if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any(): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) + # for constant strategy, self.statistcs_ is used to store # fill_value in each column return np.full(X.shape[1], fill_value, dtype=X.dtype) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index a471ca9313add..86723c8245d44 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -636,6 +636,13 @@ def _initial_imputation(self, X, in_fit=False): X_missing_mask = _get_mask(X, self.missing_values) mask_missing_values = X_missing_mask.copy() + + # TODO (1.8): remove this once the deprecation is removed. In the meantime, + # we need to catch the warning to avoid false positives. + catch_warning = ( + self.initial_strategy == "constant" and not self.keep_empty_features + ) + if self.initial_imputer_ is None: self.initial_imputer_ = SimpleImputer( missing_values=self.missing_values, @@ -643,9 +650,24 @@ def _initial_imputation(self, X, in_fit=False): fill_value=self.fill_value, keep_empty_features=self.keep_empty_features, ).set_output(transform="default") - X_filled = self.initial_imputer_.fit_transform(X) + + # TODO (1.8): remove this once the deprecation is removed to keep only + # the code in the else case. 
+ if catch_warning: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.fit_transform(X) else: - X_filled = self.initial_imputer_.transform(X) + # TODO (1.8): remove this once the deprecation is removed to keep only + # the code in the else case. + if catch_warning: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + X_filled = self.initial_imputer_.transform(X) + else: + X_filled = self.initial_imputer_.transform(X) if in_fit: self._is_empty_feature = np.all(mask_missing_values, axis=0) @@ -659,7 +681,8 @@ def _initial_imputation(self, X, in_fit=False): # The constant strategy has a specific behavior and preserve empty # features even with ``keep_empty_features=False``. We need to drop # the column for consistency. - # TODO: remove this `if` branch once the following issue is addressed: + # TODO (1.8): remove this `if` branch once the following issue is + # addressed: # https://github.com/scikit-learn/scikit-learn/issues/29827 X_filled = X_filled[:, ~self._is_empty_feature] diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index df8715327163d..b92e8ecd8f01f 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -410,18 +410,24 @@ def test_imputation_constant_error_invalid_type(X_data, missing_value): imputer.fit_transform(X) +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. 
def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]]) - imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) + imputer = SimpleImputer( + missing_values=-1, strategy="constant", fill_value=0, keep_empty_features=True + ) X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true) +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. @pytest.mark.parametrize("array_constructor", CSR_CONTAINERS + [np.asarray]) def test_imputation_constant_float(array_constructor): # Test imputation using the constant strategy on floats @@ -442,12 +448,16 @@ def test_imputation_constant_float(array_constructor): X_true = array_constructor(X_true) - imputer = SimpleImputer(strategy="constant", fill_value=-1) + imputer = SimpleImputer( + strategy="constant", fill_value=-1, keep_empty_features=True + ) X_trans = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans, X_true) +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. @pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) def test_imputation_constant_object(marker): # Test imputation using the constant strategy on objects @@ -472,13 +482,18 @@ def test_imputation_constant_object(marker): ) imputer = SimpleImputer( - missing_values=marker, strategy="constant", fill_value="missing" + missing_values=marker, + strategy="constant", + fill_value="missing", + keep_empty_features=True, ) X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true) +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. 
@pytest.mark.parametrize("dtype", [object, "category"]) def test_imputation_constant_pandas(dtype): # Test imputation using the constant strategy on pandas df @@ -498,7 +513,7 @@ def test_imputation_constant_pandas(dtype): dtype=object, ) - imputer = SimpleImputer(strategy="constant") + imputer = SimpleImputer(strategy="constant", keep_empty_features=True) X_trans = imputer.fit_transform(df) assert_array_equal(X_trans, X_true) @@ -1514,6 +1529,26 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat): ) +@pytest.mark.parametrize( + "initial_strategy", ["mean", "median", "most_frequent", "constant"] +) +def test_iterative_imputer_keep_empty_features(initial_strategy): + """Check the behaviour of the iterative imputer with different initial strategy + and keeping empty features (i.e. features containing only missing values). + """ + X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]]) + + imputer = IterativeImputer( + initial_strategy=initial_strategy, keep_empty_features=True + ) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed[:, 1], 0) + X_imputed = imputer.transform(X) + assert_allclose(X_imputed[:, 1], 0) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. 
def test_iterative_imputer_constant_fill_value(): """Check that we propagate properly the parameter `fill_value`.""" X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) @@ -1524,6 +1559,7 @@ def test_iterative_imputer_constant_fill_value(): initial_strategy="constant", fill_value=fill_value, max_iter=0, + keep_empty_features=True, ) imputer.fit_transform(X) assert_array_equal(imputer.initial_imputer_.statistics_, fill_value) @@ -1722,7 +1758,13 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat ) for method in ["fit_transform", "transform"]: - X_imputed = getattr(imputer, method)(X) + # TODO(1.8): Remove the condition and still call getattr(imputer, method)(X) + if method.startswith("fit") and not keep_empty_features: + warn_msg = '`strategy="constant"`, empty features are not dropped. ' + with pytest.warns(FutureWarning, match=warn_msg): + X_imputed = getattr(imputer, method)(X) + else: + X_imputed = getattr(imputer, method)(X) assert X_imputed.shape == X.shape constant_feature = ( X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]