scikit-learn · jeremiedbb · Mar 21, 2019 · Mar 21, 2019 · Mar 21, 2019 · Mar 25, 2019
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -225,6 +225,15 @@ Support for Python 3.4 and below has been officially dropped.
   raising an exception if input is sparse add `missing_values` property
   is set to 0. :issue:`13240` by :user:`Bartosz Telenczuk <btel>`.
 
+- |ENH| Added a new ``"some-missing"`` option for the `features` parameter of
+  :class:`MissingIndicator`, which drops features with no missing values as
+  well as features with only missing values. :issue:`13491` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
+- |Fix| Fixed a bug in :class:`MissingIndicator` when ``X`` is sparse. All the
+  non-zero non-missing values used to become explicit False in the transformed
+  data. :issue:`13491` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.isotonic`
 .......................
 

diff --git a/sklearn/impute.py b/sklearn/impute.py
@@ -1057,13 +1057,15 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
         `missing_values` will be indicated (True in the output array), the
         other values will be marked as False.
 
-    features : str, optional
+    features : {"missing-only", "all", "some-missing"}, optional
         Whether the imputer mask should represent all or a subset of
         features.
 
         - If "missing-only" (default), the imputer mask will only represent
           features containing missing values during fit time.
         - If "all", the imputer mask will represent all features.
+        - If "some-missing", the imputer mask will only represent features
+          containing some, but not only, missing values during fit time.
 
     sparse : boolean or "auto", optional
         Whether the imputer mask format should be sparse or dense.
@@ -1074,9 +1076,12 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
         - If False, the imputer mask will be a numpy array.
 
     error_on_new : boolean, optional
-        If True (default), transform will raise an error when there are
+        If True (default), transform will raise an error when there are either
         features with missing values in transform that have no missing values
-        in fit. This is applicable only when ``features="missing-only"``.
+        in fit (only applicable if
+        ``features in ("missing-only", "some-missing")``), or features with
+        non-missing values in transform that have only missing values in fit
+        (only applicable if ``features="some-missing"``).
 
     Attributes
     ----------
@@ -1144,26 +1149,33 @@ def _get_missing_features_info(self, X):
             imputer_mask = sparse_constructor(
                 (mask, X.indices.copy(), X.indptr.copy()),
                 shape=X.shape, dtype=bool)
+            imputer_mask.eliminate_zeros()
 
-            missing_values_mask = imputer_mask.copy()
-            missing_values_mask.eliminate_zeros()
-            features_with_missing = (
-                np.flatnonzero(np.diff(missing_values_mask.indptr))
-                if missing_values_mask.format == 'csc'
-                else np.unique(missing_values_mask.indices))
+            if self.features in ('missing-only', 'some-missing'):
+                n_missing = imputer_mask.getnnz(axis=0)
 
             if self.sparse is False:
                 imputer_mask = imputer_mask.toarray()
             elif imputer_mask.format == 'csr':
                 imputer_mask = imputer_mask.tocsc()
         else:
             imputer_mask = _get_mask(X, self.missing_values)
-            features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0))
+
+            if self.features in ('missing-only', 'some-missing'):
+                n_missing = imputer_mask.sum(axis=0)
 
             if self.sparse is True:
                 imputer_mask = sparse.csc_matrix(imputer_mask)
 
-        return imputer_mask, features_with_missing
+        if self.features == 'all':
+            features_indices = np.arange(X.shape[1])
+        elif self.features == 'missing-only':
+            features_indices = np.flatnonzero(n_missing)
+        else:
+            features_indices = np.flatnonzero(
+                np.logical_and(n_missing < X.shape[0], n_missing > 0))
+
+        return imputer_mask, features_indices
 
     def _validate_input(self, X):
         if not is_scalar_nan(self.missing_values):
@@ -1207,18 +1219,17 @@ def fit(self, X, y=None):
         X = self._validate_input(X)
         self._n_features = X.shape[1]
 
-        if self.features not in ('missing-only', 'all'):
-            raise ValueError("'features' has to be either 'missing-only' or "
-                             "'all'. Got {} instead.".format(self.features))
+        if self.features not in ('missing-only', 'all', 'some-missing'):
+            raise ValueError("'features' has to be one of 'missing-only', "
+                             "'all' or 'some-missing'. Got {} instead."
+                             .format(self.features))
 
         if not ((isinstance(self.sparse, str) and
                 self.sparse == "auto") or isinstance(self.sparse, bool)):
             raise ValueError("'sparse' has to be a boolean or 'auto'. "
                              "Got {!r} instead.".format(self.sparse))
 
-        self.features_ = (self._get_missing_features_info(X)[1]
-                          if self.features == 'missing-only'
-                          else np.arange(self._n_features))
+        self.features_ = self._get_missing_features_info(X)[1]
 
         return self
 
@@ -1246,15 +1257,16 @@ def transform(self, X):
 
         imputer_mask, features = self._get_missing_features_info(X)
 
-        if self.features == "missing-only":
+        if self.features in ("missing-only", "some-missing"):
             features_diff_fit_trans = np.setdiff1d(features, self.features_)
             if (self.error_on_new and features_diff_fit_trans.size > 0):
-                raise ValueError("The features {} have missing values "
+                raise ValueError("The features {} either have missing values "
                                  "in transform but have no missing values "
-                                 "in fit.".format(features_diff_fit_trans))
+                                 "in fit, or have non missing values in "
+                                 "transform but have only missing values in "
+                                 "fit".format(features_diff_fit_trans))
 
-            if (self.features_.size > 0 and
-                    self.features_.size < self._n_features):
+            if self.features_.size < self._n_features:
                 imputer_mask = imputer_mask[:, self.features_]
 
         return imputer_mask

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
@@ -917,9 +917,12 @@ def test_iterative_imputer_early_stopping():
     [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]),
       {'features': 'missing-only', 'sparse': 'auto'},
       'have missing values in transform but have no missing values in fit'),
+     (np.array([[-1, 1], [-1, -1]]), np.array([[-1, 1], [1, 1]]),
+      {'features': 'some-missing', 'sparse': 'auto'},
+      'have missing values in transform but have no missing values in fit'),
      (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
       {'features': 'random', 'sparse': 'auto'},
-      "'features' has to be either 'missing-only' or 'all'"),
+      "'features' has to be one of 'missing-only', 'all' or 'some-missing'"),
      (np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
       {'features': 'all', 'sparse': 'random'},
       "'sparse' has to be a boolean or 'auto'"),
@@ -1119,3 +1122,34 @@ def test_inconsistent_dtype_X_missing_values(imputer_constructor,
 
     with pytest.raises(ValueError, match=err_msg):
         imputer.fit_transform(X)
+
+
+@pytest.mark.parametrize("array_constr",
+                         [np.array, sparse.csr_matrix, sparse.csc_matrix],
+                         ids=["dense", "sparse_csr", "sparse_csc"])
+def test_missing_indicator_drop_full_missing(array_constr):
+    # Check that missing indicator with features="some-missing" drops columns
+    # with no missing values as well as columns full of missing values.
+    X = array_constr([[0, np.nan, 0],
+                      [0, np.nan, np.nan]])
+
+    expected_Xt = array_constr([[False],
+                                [True]])
+
+    mi = MissingIndicator(features="some-missing")
+    Xt = mi.fit_transform(X)
+
+    assert_allclose_dense_sparse(Xt, expected_Xt)
+
+
+def test_missing_indicator_sparse_no_explicit_zeros():
+    # Check that non missing values don't become explicit zeros in the mask
+    # generated by missing indicator when X is sparse.
+    X = sparse.csr_matrix([[0, 1, 2],
+                           [1, 2, 0],
+                           [2, 0, 1]])
+
+    mi = MissingIndicator(features='all', missing_values=1)
+    Xt = mi.fit_transform(X)
+
+    assert Xt.getnnz() == Xt.sum()