scikit-learn · adrinjalali · Oct 29, 2024 · Sep 27, 2024 · Sep 27, 2024 · Oct 15, 2024
diff --git a/doc/whats_new/upcoming_changes/sklearn.impute/29950.api.rst b/doc/whats_new/upcoming_changes/sklearn.impute/29950.api.rst
@@ -0,0 +1,4 @@
+- Add a warning in :class:`impute.SimpleImputer` when `keep_empty_features=False` and
+  `strategy="constant"`. In this case empty features are currently not dropped; this
+  behaviour will change in 1.8.
+  By :user:`Arthur Courselle <ArthurCourselle>` and :user:`Simon Riou <simon-riou>`
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
@@ -225,6 +225,11 @@ class SimpleImputer(_BaseImputer):
 
         .. versionadded:: 1.2
 
+        .. versionchanged:: 1.6
+            Currently, when `keep_empty_features=False` and `strategy="constant"`,
+            empty features are not dropped. This behaviour will change in version
+            1.8. Set `keep_empty_features=True` to preserve this behaviour.
+
     Attributes
     ----------
     statistics_ : array of shape (n_features,)
@@ -458,6 +463,19 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value):
         statistics = np.empty(X.shape[1])
 
         if strategy == "constant":
+            # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic
+            # for empty features to drop them later.
+            if not self.keep_empty_features and any(
+                [all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])]
+            ):
+                warnings.warn(
+                    "Currently, when `keep_empty_feature=False` and "
+                    '`strategy="constant"`, empty features are not dropped. '
+                    "This behaviour will change in version 1.8. Set "
+                    "`keep_empty_feature=True` to preserve this behaviour.",
+                    FutureWarning,
+                )
+
             # for constant strategy, self.statistics_ is used to store
             # fill_value in each column
             statistics.fill(fill_value)
@@ -548,6 +566,17 @@ def _dense_fit(self, X, strategy, missing_values, fill_value):
 
         # Constant
         elif strategy == "constant":
+            # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic
+            # for empty features to drop them later.
+            if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any():
+                warnings.warn(
+                    "Currently, when `keep_empty_feature=False` and "
+                    '`strategy="constant"`, empty features are not dropped. '
+                    "This behaviour will change in version 1.8. Set "
+                    "`keep_empty_feature=True` to preserve this behaviour.",
+                    FutureWarning,
+                )
+
             # for constant strategy, self.statistcs_ is used to store
             # fill_value in each column
             return np.full(X.shape[1], fill_value, dtype=X.dtype)

diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
@@ -636,16 +636,38 @@ def _initial_imputation(self, X, in_fit=False):
 
         X_missing_mask = _get_mask(X, self.missing_values)
         mask_missing_values = X_missing_mask.copy()
+
+        # TODO (1.8): remove this once the deprecation is removed. In the meantime,
+        # we need to catch the warning to avoid false positives.
+        catch_warning = (
+            self.initial_strategy == "constant" and not self.keep_empty_features
+        )
+
         if self.initial_imputer_ is None:
             self.initial_imputer_ = SimpleImputer(
                 missing_values=self.missing_values,
                 strategy=self.initial_strategy,
                 fill_value=self.fill_value,
                 keep_empty_features=self.keep_empty_features,
             ).set_output(transform="default")
-            X_filled = self.initial_imputer_.fit_transform(X)
+
+            # TODO (1.8): remove this once the deprecation is removed to keep only
+            # the code in the else case.
+            if catch_warning:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", FutureWarning)
+                    X_filled = self.initial_imputer_.fit_transform(X)
+            else:
+                X_filled = self.initial_imputer_.fit_transform(X)
         else:
-            X_filled = self.initial_imputer_.transform(X)
+            # TODO (1.8): remove this once the deprecation is removed to keep only
+            # the code in the else case.
+            if catch_warning:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", FutureWarning)
+                    X_filled = self.initial_imputer_.transform(X)
+            else:
+                X_filled = self.initial_imputer_.transform(X)
 
         if in_fit:
             self._is_empty_feature = np.all(mask_missing_values, axis=0)
@@ -659,7 +681,8 @@ def _initial_imputation(self, X, in_fit=False):
                 # The constant strategy has a specific behavior and preserve empty
                 # features even with ``keep_empty_features=False``. We need to drop
                 # the column for consistency.
-                # TODO: remove this `if` branch once the following issue is addressed:
+                # TODO (1.8): remove this `if` branch once the following issue is
+                # addressed:
                 # https://github.com/scikit-learn/scikit-learn/issues/29827
                 X_filled = X_filled[:, ~self._is_empty_feature]
 

diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py
@@ -410,18 +410,24 @@ def test_imputation_constant_error_invalid_type(X_data, missing_value):
         imputer.fit_transform(X)
 
 
+# TODO (1.8): check that `keep_empty_features=False` drops the
+# empty features due to the behaviour change.
 def test_imputation_constant_integer():
     # Test imputation using the constant strategy on integers
     X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
 
     X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]])
 
-    imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0)
+    imputer = SimpleImputer(
+        missing_values=-1, strategy="constant", fill_value=0, keep_empty_features=True
+    )
     X_trans = imputer.fit_transform(X)
 
     assert_array_equal(X_trans, X_true)
 
 
+# TODO (1.8): check that `keep_empty_features=False` drops the
+# empty features due to the behaviour change.
 @pytest.mark.parametrize("array_constructor", CSR_CONTAINERS + [np.asarray])
 def test_imputation_constant_float(array_constructor):
     # Test imputation using the constant strategy on floats
@@ -442,12 +448,16 @@ def test_imputation_constant_float(array_constructor):
 
     X_true = array_constructor(X_true)
 
-    imputer = SimpleImputer(strategy="constant", fill_value=-1)
+    imputer = SimpleImputer(
+        strategy="constant", fill_value=-1, keep_empty_features=True
+    )
     X_trans = imputer.fit_transform(X)
 
     assert_allclose_dense_sparse(X_trans, X_true)
 
 
+# TODO (1.8): check that `keep_empty_features=False` drops the
+# empty features due to the behaviour change.
 @pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0])
 def test_imputation_constant_object(marker):
     # Test imputation using the constant strategy on objects
@@ -472,13 +482,18 @@ def test_imputation_constant_object(marker):
     )
 
     imputer = SimpleImputer(
-        missing_values=marker, strategy="constant", fill_value="missing"
+        missing_values=marker,
+        strategy="constant",
+        fill_value="missing",
+        keep_empty_features=True,
     )
     X_trans = imputer.fit_transform(X)
 
     assert_array_equal(X_trans, X_true)
 
 
+# TODO (1.8): check that `keep_empty_features=False` drops the
+# empty features due to the behaviour change.
 @pytest.mark.parametrize("dtype", [object, "category"])
 def test_imputation_constant_pandas(dtype):
     # Test imputation using the constant strategy on pandas df
@@ -498,7 +513,7 @@ def test_imputation_constant_pandas(dtype):
         dtype=object,
     )
 
-    imputer = SimpleImputer(strategy="constant")
+    imputer = SimpleImputer(strategy="constant", keep_empty_features=True)
     X_trans = imputer.fit_transform(df)
 
     assert_array_equal(X_trans, X_true)
@@ -1514,6 +1529,26 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
     )
 
 
+@pytest.mark.parametrize(
+    "initial_strategy", ["mean", "median", "most_frequent", "constant"]
+)
+def test_iterative_imputer_keep_empty_features(initial_strategy):
+    """Check the behaviour of the iterative imputer with different initial strategies
+    when keeping empty features (i.e. features containing only missing values).
+    """
+    X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]])
+
+    imputer = IterativeImputer(
+        initial_strategy=initial_strategy, keep_empty_features=True
+    )
+    X_imputed = imputer.fit_transform(X)
+    assert_allclose(X_imputed[:, 1], 0)
+    X_imputed = imputer.transform(X)
+    assert_allclose(X_imputed[:, 1], 0)
+
+
+# TODO (1.8): check that `keep_empty_features=False` drops the
+# empty features due to the behaviour change.
 def test_iterative_imputer_constant_fill_value():
     """Check that we propagate properly the parameter `fill_value`."""
     X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
@@ -1524,6 +1559,7 @@ def test_iterative_imputer_constant_fill_value():
         initial_strategy="constant",
         fill_value=fill_value,
         max_iter=0,
+        keep_empty_features=True,
     )
     imputer.fit_transform(X)
     assert_array_equal(imputer.initial_imputer_.statistics_, fill_value)
@@ -1722,7 +1758,13 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat
     )
 
     for method in ["fit_transform", "transform"]:
-        X_imputed = getattr(imputer, method)(X)
+        # TODO(1.8): Remove the condition and still call getattr(imputer, method)(X)
+        if method.startswith("fit") and not keep_empty_features:
+            warn_msg = '`strategy="constant"`, empty features are not dropped. '
+            with pytest.warns(FutureWarning, match=warn_msg):
+                X_imputed = getattr(imputer, method)(X)
+        else:
+            X_imputed = getattr(imputer, method)(X)
         assert X_imputed.shape == X.shape
         constant_feature = (
             X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0]