
Commit f688e28

glemaitre authored and jnothman committed
API use 'drop' to disable estimators in voting (scikit-learn#13780)
1 parent f1995b2 commit f688e28

3 files changed: +58 −32 lines changed

doc/whats_new/v0.21.rst

Lines changed: 6 additions & 0 deletions
@@ -324,6 +324,12 @@ Support for Python 3.4 and below has been officially dropped.
   of the estimators was set to ``None`` and ``sample_weight`` was not ``None``.
   :pr:`13779` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- |API| :class:`ensemble.VotingClassifier` and
+  :class:`ensemble.VotingRegressor` accept ``'drop'`` to disable an estimator
+  in addition to ``None`` to be consistent with other estimators (i.e.,
+  :class:`pipeline.FeatureUnion` and :class:`compose.ColumnTransformer`).
+  :pr:`13780` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.externals`
 ........................
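A minimal sketch of the behavior this changelog entry describes (the dataset and estimator choices are illustrative, not part of the commit):

from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
eclf = VotingClassifier(
    estimators=[('lr', LogisticRegression(solver='lbfgs')),
                ('nb', GaussianNB())],
    voting='soft')
# 'drop' now disables an estimator, mirroring pipeline.FeatureUnion and
# compose.ColumnTransformer; None keeps working as before.
eclf.set_params(nb='drop').fit(X, y)
assert len(eclf.estimators_) == 1  # only the logistic regression was fitted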

sklearn/ensemble/tests/test_voting.py

Lines changed: 28 additions & 12 deletions
@@ -342,12 +342,25 @@ def test_sample_weight():
     assert_array_equal(eclf3.predict(X), clf1.predict(X))
     assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))
 
+    # check that an error is raised and indicative if sample_weight is not
+    # supported.
     clf4 = KNeighborsClassifier()
     eclf3 = VotingClassifier(estimators=[
         ('lr', clf1), ('svc', clf3), ('knn', clf4)],
         voting='soft')
-    msg = ('Underlying estimator \'knn\' does not support sample weights.')
-    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
+    msg = ('Underlying estimator KNeighborsClassifier does not support '
+           'sample weights.')
+    with pytest.raises(ValueError, match=msg):
+        eclf3.fit(X, y, sample_weight)
+
+    # check that _parallel_fit_estimator will raise the right error
+    # it should raise the original error if this is not linked to sample_weight
+    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
+        def fit(self, X, y, sample_weight):
+            raise TypeError('Error unrelated to sample_weight.')
+    clf = ClassifierErrorFit()
+    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
+        clf.fit(X, y, sample_weight=sample_weight)
 
 
 def test_sample_weight_kwargs():
@@ -404,8 +417,10 @@ def test_set_params():
 @pytest.mark.filterwarnings('ignore: Default solver will be changed')  # 0.22
 @pytest.mark.filterwarnings('ignore: Default multi_class will')  # 0.22
 @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
-def test_set_estimator_none():
-    """VotingClassifier set_params should be able to set estimators as None"""
+@pytest.mark.parametrize("drop", [None, 'drop'])
+def test_set_estimator_none(drop):
+    """VotingClassifier set_params should be able to set estimators as None or
+    drop"""
     # Test predict
     clf1 = LogisticRegression(random_state=123)
     clf2 = RandomForestClassifier(random_state=123)
@@ -417,22 +432,22 @@ def test_set_estimator_none():
     eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                          ('nb', clf3)],
                              voting='hard', weights=[1, 1, 0.5])
-    eclf2.set_params(rf=None).fit(X, y)
+    eclf2.set_params(rf=drop).fit(X, y)
     assert_array_equal(eclf1.predict(X), eclf2.predict(X))
 
-    assert dict(eclf2.estimators)["rf"] is None
+    assert dict(eclf2.estimators)["rf"] is drop
     assert len(eclf2.estimators_) == 2
     assert all(isinstance(est, (LogisticRegression, GaussianNB))
                for est in eclf2.estimators_)
-    assert eclf2.get_params()["rf"] is None
+    assert eclf2.get_params()["rf"] is drop
 
     eclf1.set_params(voting='soft').fit(X, y)
     eclf2.set_params(voting='soft').fit(X, y)
     assert_array_equal(eclf1.predict(X), eclf2.predict(X))
     assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
-    msg = 'All estimators are None. At least one is required!'
+    msg = 'All estimators are None or "drop". At least one is required!'
     assert_raise_message(
-        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)
+        ValueError, msg, eclf2.set_params(lr=drop, rf=drop, nb=drop).fit, X, y)
 
     # Test soft voting transform
     X1 = np.array([[1], [2]])
@@ -444,7 +459,7 @@ def test_set_estimator_none():
     eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                              voting='soft', weights=[1, 0.5],
                              flatten_transform=False)
-    eclf2.set_params(rf=None).fit(X1, y1)
+    eclf2.set_params(rf=drop).fit(X1, y1)
     assert_array_almost_equal(eclf1.transform(X1),
                               np.array([[[0.7, 0.3], [0.3, 0.7]],
                                         [[1., 0.], [0., 1.]]]))
@@ -522,12 +537,13 @@ def test_transform():
         [('lr', LinearRegression()),
          ('rf', RandomForestRegressor(n_estimators=5))]))]
 )
-def test_none_estimator_with_weights(X, y, voter):
+@pytest.mark.parametrize("drop", [None, 'drop'])
+def test_none_estimator_with_weights(X, y, voter, drop):
     # check that an estimator can be set to None and passing some weight
     # regression test for
     # https://github.com/scikit-learn/scikit-learn/issues/13777
     voter.fit(X, y, sample_weight=np.ones(y.shape))
-    voter.set_params(lr=None)
+    voter.set_params(lr=drop)
     voter.fit(X, y, sample_weight=np.ones(y.shape))
     y_pred = voter.predict(X)
     assert y_pred.shape == y.shape

sklearn/ensemble/voting.py

Lines changed: 24 additions & 20 deletions
@@ -30,7 +30,15 @@
 def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
     """Private function used to fit an estimator within a job."""
     if sample_weight is not None:
-        estimator.fit(X, y, sample_weight=sample_weight)
+        try:
+            estimator.fit(X, y, sample_weight=sample_weight)
+        except TypeError as exc:
+            if "unexpected keyword argument 'sample_weight'" in str(exc):
+                raise ValueError(
+                    "Underlying estimator {} does not support sample weights."
+                    .format(estimator.__class__.__name__)
+                ) from exc
+            raise
     else:
         estimator.fit(X, y)
     return estimator
@@ -53,8 +61,8 @@ def _weights_not_none(self):
         """Get the weights of not `None` estimators"""
         if self.weights is None:
             return None
-        return [w for est, w in zip(self.estimators,
-                                    self.weights) if est[1] is not None]
+        return [w for est, w in zip(self.estimators, self.weights)
+                if est[1] not in (None, 'drop')]
 
     def _predict(self, X):
         """Collect results from clf.predict calls. """
@@ -76,26 +84,22 @@ def fit(self, X, y, sample_weight=None):
                 '; got %d weights, %d estimators'
                 % (len(self.weights), len(self.estimators)))
 
-        if sample_weight is not None:
-            for name, step in self.estimators:
-                if step is None:
-                    continue
-                if not has_fit_parameter(step, 'sample_weight'):
-                    raise ValueError('Underlying estimator \'%s\' does not'
-                                     ' support sample weights.' % name)
-
         names, clfs = zip(*self.estimators)
         self._validate_names(names)
 
-        n_isnone = np.sum([clf is None for _, clf in self.estimators])
+        n_isnone = np.sum(
+            [clf in (None, 'drop') for _, clf in self.estimators]
+        )
         if n_isnone == len(self.estimators):
-            raise ValueError('All estimators are None. At least one is '
-                             'required!')
+            raise ValueError(
+                'All estimators are None or "drop". At least one is required!'
+            )
 
         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_parallel_fit_estimator)(clone(clf), X, y,
                                              sample_weight=sample_weight)
-            for clf in clfs if clf is not None)
+            for clf in clfs if clf not in (None, 'drop')
+        )
 
         self.named_estimators_ = Bunch()
         for k, e in zip(self.estimators, self.estimators_):
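Since any name in ``estimators`` may now map to ``None`` or ``'drop'``, fit() raises as soon as nothing is left to train. A minimal sketch (toy data illustrative):

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

eclf = VotingClassifier(estimators=[('lr', LogisticRegression(solver='lbfgs'))])
try:
    # dropping the only estimator leaves nothing to fit
    eclf.set_params(lr='drop').fit([[0.], [1.]], [0, 1])
except ValueError as exc:
    print(exc)  # All estimators are None or "drop". At least one is required!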
@@ -149,8 +153,8 @@ class VotingClassifier(_BaseVoting, ClassifierMixin):
     estimators : list of (string, estimator) tuples
         Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
         of those original estimators that will be stored in the class attribute
-        ``self.estimators_``. An estimator can be set to `None` using
-        ``set_params``.
+        ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'``
+        using ``set_params``.
 
     voting : str, {'hard', 'soft'} (default='hard')
         If 'hard', uses predicted class labels for majority rule voting.
@@ -381,9 +385,9 @@ class VotingRegressor(_BaseVoting, RegressorMixin):
     Parameters
     ----------
     estimators : list of (string, estimator) tuples
-        Invoking the ``fit`` method on the ``VotingRegressor`` will fit
-        clones of those original estimators that will be stored in the class
-        attribute ``self.estimators_``. An estimator can be set to `None`
+        Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
+        of those original estimators that will be stored in the class attribute
+        ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'``
         using ``set_params``.
 
     weights : array-like, shape (n_regressors,), optional (default=`None`)
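The regressor documents the same contract as the classifier; a minimal sketch (data and estimators are illustrative):

import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

X = np.array([[1.], [2.], [3.], [4.]])
y = np.array([2., 4., 6., 8.])
vr = VotingRegressor(estimators=[('lr', LinearRegression()),
                                 ('tree', DecisionTreeRegressor())])
# 'drop' is now accepted alongside None to disable an estimator
vr.set_params(tree='drop').fit(X, y)
print(vr.predict(X))  # predictions come from the remaining estimator(s)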
