FIX accept meta-estimator in SelfTrainingClassifier (#19126)

glemaitre · thomasjpfan · ogrisel · jeremiedbb · commit 78e1530c323c · 2021-01-19T11:28:01.000+01:00
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -2,6 +2,23 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_0_24_1:
+
+Version 0.24.1
+==============
+
+Changelog
+---------
+
+:mod:`sklearn.semi_supervised`
+..............................
+
+- |Fix| :class:`semi_supervised.SelfTrainingClassifier` now accepts
+  meta-estimators (e.g. :class:`ensemble.StackingClassifier`). The validation
+  of this estimator is done on the fitted estimator, once we know the existence
+  of the method `predict_proba`.
+  :pr:`19126` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 .. _changes_0_24:
 
 Version 0.24.0
diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py
@@ -205,10 +205,10 @@ def fit(self, X, y):
                 X[safe_mask(X, has_label)],
                 self.transduction_[has_label])
 
-            if self.n_iter_ == 1:
-                # Only validate in the first iteration so that n_iter=0 is
-                # equivalent to the base_estimator itself.
-                _validate_estimator(self.base_estimator)
+            # Validate the fitted estimator since `predict_proba` can be
+            # delegated to an underlying "final" fitted estimator as
+            # generally done in meta-estimators or pipelines.
+            _validate_estimator(self.base_estimator_)
 
             # Predict on the unlabeled samples
             prob = self.base_estimator_.predict_proba(
diff --git a/sklearn/semi_supervised/tests/test_self_training.py b/sklearn/semi_supervised/tests/test_self_training.py
@@ -4,14 +4,16 @@
 from numpy.testing import assert_array_equal
 import pytest
 
+from sklearn.ensemble import StackingClassifier
 from sklearn.exceptions import NotFittedError
-from sklearn.semi_supervised import SelfTrainingClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_iris, make_blobs
 from sklearn.metrics import accuracy_score
 
+from sklearn.semi_supervised import SelfTrainingClassifier
+
 # Author: Oliver Rausch <rauscho@ethz.ch>
 # License: BSD 3 clause
 
@@ -318,3 +320,26 @@ def test_k_best_selects_best():
 
     for row in most_confident_svc.tolist():
         assert row in added_by_st
+
+
+def test_base_estimator_meta_estimator():
+    # Check that a meta-estimator relying on an estimator implementing
+    # `predict_proba` will work even if it does not expose this method before
+    # being fitted.
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/19119
+
+    base_estimator = StackingClassifier(
+        estimators=[
+            ("svc_1", SVC(probability=True)), ("svc_2", SVC(probability=True)),
+        ],
+        final_estimator=SVC(probability=True), cv=2
+    )
+
+    # make sure that the `base_estimator` does not expose `predict_proba`
+    # without being fitted
+    assert not hasattr(base_estimator, "predict_proba")
+
+    clf = SelfTrainingClassifier(base_estimator=base_estimator)
+    clf.fit(X_train, y_train_missing_labels)
+    clf.predict_proba(X_test)