ENH make some meta-estimators lenient towards missing values #17987

Merged 23 commits on Aug 26, 2020
13 changes: 13 additions & 0 deletions doc/whats_new/v0.24.rst
@@ -361,6 +361,12 @@ Changelog
  validity of the input is now delegated to the base estimator.
  :pr:`17233` by :user:`Zolisa Bleki <zoj613>`.

- |Enhancement| :class:`multiclass.OneVsOneClassifier` now accepts
  inputs with missing values. Hence, estimators that can handle
  missing values (for instance, a pipeline with an imputation step)
  can be used as the estimator for multiclass wrappers.
  :pr:`17987` by :user:`Venkatachalam N <venkyyuvy>`.
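
  For illustration only (not part of this diff), a minimal sketch of the
  new behavior, assuming this patch is applied::

      import numpy as np
      from sklearn.datasets import load_iris
      from sklearn.impute import SimpleImputer
      from sklearn.linear_model import LogisticRegression
      from sklearn.multiclass import OneVsOneClassifier
      from sklearn.pipeline import make_pipeline

      X, y = load_iris(return_X_y=True)
      X[::10, 0] = np.nan  # introduce missing values

      # the imputer inside the pipeline handles the NaNs, so the OvO
      # wrapper no longer rejects the input upfront
      ovo = OneVsOneClassifier(make_pipeline(SimpleImputer(),
                                             LogisticRegression()))
      ovo.fit(X, y).score(X, y)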

:mod:`sklearn.multioutput`
..........................

@@ -369,6 +375,13 @@ Changelog
  :pr:`18124` by :user:`Gus Brocchini <boldloop>` and
  :user:`Amanda Dsouza <amy12xx>`.

- |Enhancement| :class:`multioutput.MultiOutputClassifier` and
  :class:`multioutput.MultiOutputRegressor` now accept inputs with
  missing values. Hence, estimators that can handle missing values
  (for instance, a pipeline with an imputation step, or the
  HistGradientBoosting estimators) can be used as the estimator for
  multioutput wrappers.
  :pr:`17987` by :user:`Venkatachalam N <venkyyuvy>`.
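
  For illustration only (not part of this diff), a minimal sketch using an
  estimator with native missing-value support (the experimental import is
  still required at the time of this release)::

      import numpy as np
      from sklearn.experimental import enable_hist_gradient_boosting  # noqa
      from sklearn.ensemble import HistGradientBoostingRegressor
      from sklearn.multioutput import MultiOutputRegressor

      rng = np.random.RandomState(0)
      X, Y = rng.randn(100, 4), rng.randn(100, 2)
      X[rng.rand(*X.shape) < 0.1] = np.nan  # sprinkle missing values

      # HistGradientBoostingRegressor handles NaN natively, so the
      # multioutput wrapper simply passes the data through
      est = MultiOutputRegressor(HistGradientBoostingRegressor())
      est.fit(X, Y).predict(X)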

:mod:`sklearn.naive_bayes`
..........................

34 changes: 33 additions & 1 deletion sklearn/ensemble/tests/test_common.py
@@ -1,3 +1,4 @@
import numpy as np
import pytest

from sklearn.base import clone
@@ -6,14 +7,20 @@

from sklearn.datasets import make_classification
from sklearn.datasets import make_regression

from sklearn.datasets import load_iris, load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor

X, y = load_iris(return_X_y=True)

X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
"X, y, estimator",
@@ -170,3 +177,28 @@ def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    estimator.set_params(lr='drop')
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)


@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [(StackingClassifier, LogisticRegression, X, y),
     (StackingRegressor, LinearRegression, X_r, y_r),
     (VotingClassifier, LogisticRegression, X, y),
     (VotingRegressor, LinearRegression, X_r, y_r)]
)
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble,
                                                       Estimator, X, y):
    # check that the Voting and Stacking predictors delegate the validation
    # of missing values to the underlying estimator.
    X = X.copy()
    mask = np.random.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[('pipe1', pipe), ('pipe2', pipe)])
    ensemble.fit(X, y).score(X, y)
12 changes: 7 additions & 5 deletions sklearn/multiclass.py
Original file line number Diff line number Diff line change
@@ -294,7 +294,7 @@ def partial_fit(self, X, y, classes=None):
        if _check_partial_fit_first_call(self, classes):
            if not hasattr(self.estimator, "partial_fit"):
                raise ValueError(("Base estimator {0}, doesn't have "
                                  "partial_fit method").format(self.estimator))
            self.estimators_ = [clone(self.estimator) for _ in range
                                (self.n_classes_)]

@@ -307,8 +307,8 @@ def partial_fit(self, X, y, classes=None):

        if len(np.setdiff1d(y, self.classes_)):
            raise ValueError(("Mini-batch contains {0} while classes " +
                              "must be subset of {1}").format(np.unique(y),
                                                              self.classes_))

        Y = self.label_binarizer_.transform(y)
        Y = Y.tocsc()
@@ -578,7 +578,8 @@ def fit(self, X, y):
        -------
        self
        """
-        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
+        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'],
+                                   force_all_finite=False)
        check_classification_targets(y)

        self.classes_ = np.unique(y)
@@ -635,7 +636,8 @@ def partial_fit(self, X, y, classes=None):
                             "must be subset of {1}".format(np.unique(y),
                                                            self.classes_))

-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
+                         force_all_finite=False)
        check_classification_targets(y)
        combinations = itertools.combinations(range(self.n_classes_), 2)
        self.estimators_ = Parallel(
7 changes: 5 additions & 2 deletions sklearn/multioutput.py
@@ -101,6 +101,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
        self : object
        """
        X, y = check_X_y(X, y,
+                        force_all_finite=False,
                         multi_output=True,
                         accept_sparse=True)

@@ -153,7 +154,9 @@ def fit(self, X, y, sample_weight=None, **fit_params):
            raise ValueError("The base estimator should implement"
                             " a fit method")

-        X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True)
+        X, y = self._validate_data(X, y,
+                                   force_all_finite=False,
+                                   multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(y)
@@ -196,7 +199,7 @@ def predict(self, X):
            raise ValueError("The base estimator should implement"
                             " a predict method")

-        X = check_array(X, accept_sparse=True)
+        X = check_array(X, force_all_finite=False, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(
            delayed(e.predict)(X)
21 changes: 20 additions & 1 deletion sklearn/tests/test_multiclass.py
@@ -29,7 +29,8 @@
                                  SGDClassifier)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
-from sklearn.pipeline import Pipeline
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn import datasets

@@ -776,3 +777,21 @@ def test_pairwise_cross_val_score():
    score_precomputed = cross_val_score(ovr_true, linear_kernel, y)
    score_linear = cross_val_score(ovr_false, X, y)
    assert_array_equal(score_precomputed, score_linear)


@pytest.mark.parametrize("MultiClassClassifier",
                         [OneVsRestClassifier, OneVsOneClassifier])
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiClassClassifier):
    # smoke test to check that the OvR and OvO classifiers delegate the
    # validation of missing values to the underlying pipeline or classifier
    rng = np.random.RandomState(42)
    X, y = iris.data, iris.target
    mask = rng.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    lr = make_pipeline(SimpleImputer(),
                       LogisticRegression(random_state=rng))

    MultiClassClassifier(lr).fit(X, y).score(X, y)
28 changes: 25 additions & 3 deletions sklearn/tests/test_multioutput.py
@@ -31,6 +31,8 @@
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer


def test_multi_target_regression():
@@ -302,7 +304,7 @@ def test_multiclass_multioutput_estimator():
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert (list(multi_class_svc_.predict(X)) ==
                list(predictions[:, i]))


def test_multiclass_multioutput_estimator_predict_proba():
@@ -463,7 +465,7 @@ def test_classifier_chain_vs_independent_models():
    Y_pred_chain = chain.predict(X_test)

    assert (jaccard_score(Y_test, Y_pred_chain, average='samples') >
            jaccard_score(Y_test, Y_pred_ovr, average='samples'))


@@ -476,7 +478,7 @@ def test_base_chain_fit_and_predict():
    Y_pred = chain.predict(X)
    assert Y_pred.shape == Y.shape
    assert ([c.coef_.size for c in chain.estimators_] ==
            list(range(X.shape[1], X.shape[1] + Y.shape[1])))

    Y_prob = chains[1].predict_proba(X)
    Y_binary = (Y_prob >= .5)
@@ -603,6 +605,26 @@ def fit(self, X, y, **fit_params):
    assert est.sample_weight_ is weight


@pytest.mark.parametrize(
    'MultiOutputEstimator, Estimator',
    [(MultiOutputClassifier, LogisticRegression),
     (MultiOutputRegressor, Ridge)]
)
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiOutputEstimator, Estimator):
    # smoke test to check that the multioutput estimators delegate the
    # validation of missing values to the underlying pipeline, regressor,
    # or classifier
    rng = np.random.RandomState(42)
    X, y = rng.randn(50, 2), rng.binomial(1, 0.5, (50, 3))
    mask = rng.choice([1, 0], X.shape, p=[.01, .99]).astype(bool)
    X[mask] = np.nan

    pipe = make_pipeline(SimpleImputer(), Estimator())
    MultiOutputEstimator(pipe).fit(X, y).score(X, y)


@pytest.mark.parametrize("order_type", [list, np.array, tuple])
def test_classifier_chain_tuple_order(order_type):
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
11 changes: 11 additions & 0 deletions sklearn/tests/test_pipeline.py
@@ -36,6 +36,7 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.impute import SimpleImputer

iris = load_iris()

@@ -1222,6 +1223,16 @@ def transform(self, X, y=None):
    t.fit_transform(X, y, a=0)


def test_pipeline_missing_values_leniency():
    # check that the pipeline delegates the validation of missing values
    # to the underlying transformers and predictors.
    X, y = iris.data, iris.target
    mask = np.random.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), LogisticRegression())
    assert pipe.fit(X, y).score(X, y) > 0.4


def test_feature_union_warns_unknown_transformer_weight():
    # Warn user when transformer_weights contains a key not present in
    # transformer_list