diff --git a/doc/whats_new/upcoming_changes/sklearn.multiclass/31228.fix.rst b/doc/whats_new/upcoming_changes/sklearn.multiclass/31228.fix.rst new file mode 100644 index 0000000000000..a22932c4ff397 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.multiclass/31228.fix.rst @@ -0,0 +1,4 @@ +- The method `predict_proba` of :class:`sklearn.multiclass.OneVsRestClassifier` now + avoids division by zero when normalizing the predicted probabilities matrix. + By :user:`Luis M. B. Varona `, :user:`Marc Bresson `, and + :user:`Jérémie du Boisberranger ` diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index fa86201fb1d89..257c9a40b7741 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -553,8 +553,10 @@ def predict_proba(self, X): Y = np.concatenate(((1 - Y), Y), axis=1) if not self.multilabel_: - # Then, probabilities should be normalized to 1. - Y /= np.sum(Y, axis=1)[:, np.newaxis] + # Then, (nonzero) sample probability distributions should be normalized. + row_sums = np.sum(Y, axis=1)[:, np.newaxis] + np.divide(Y, row_sums, out=Y, where=row_sums != 0) # Avoid division by 0 + return Y @available_if(_estimators_has("decision_function")) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 566b8f535c9cb..b814a62fbc064 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -6,6 +6,7 @@ from numpy.testing import assert_allclose from sklearn import datasets, svm +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.datasets import load_breast_cancer from sklearn.exceptions import NotFittedError from sklearn.impute import SimpleImputer @@ -429,6 +430,39 @@ def test_ovr_single_label_predict_proba(): assert not (pred - Y_pred).any() +def test_ovr_single_label_predict_proba_zero_row(): + class NaiveBinaryClassifier(BaseEstimator, ClassifierMixin): + def fit(self, X, y): + self.classes_ = np.unique(y) + return self + + def predict_proba(self, X): + ones = 
np.ones((len(X), len(self.classes_))) + # Probability of being the positive class is 0 when all features are 0 + ones[:, 1] = np.any(X, axis=1).astype(int) + return ones + + base_clf = NaiveBinaryClassifier() + X, Y = iris.data, iris.target # Three-class problem with 150 samples + X_train, Y_train = X[:80], Y[:80] + X_test = X[80:] + + zero_indices = np.random.choice(np.arange(len(X_test)), size=5, replace=False) + X_test[zero_indices] = 0 # Change 5 random samples in the test set to be all zeros + + clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) + Y_proba = clf.predict_proba( + X_test + ) # Our classifier predicts 0 for the zero samples + + zero_indices = np.repeat(zero_indices, len(clf.classes_)) + nonzero_indices = np.setdiff1d(np.arange(len(Y_proba)), zero_indices) + # Nonzero sample probability distributions should be normalized to sum to 1 + assert_almost_equal(np.sum(Y_proba[nonzero_indices], axis=1), 1.0) + # Zero-confidence samples should remain as-is and not be normalized + assert_array_equal(Y_proba[zero_indices], 0) + + def test_ovr_multilabel_decision_function(): X, Y = datasets.make_multilabel_classification( n_samples=100, @@ -597,13 +631,15 @@ def test_ovo_decision_function(): # mostly tied predictions: assert set(votes[:, class_idx]).issubset(set([0.0, 1.0, 2.0])) - # The OVO decision function on the other hand is able to resolve + # The OVO decision function, on the other hand, is able to resolve # most of the ties on this data as it combines both the vote counts # and the aggregated confidence levels of the binary classifiers # to compute the aggregate decision function. The iris dataset # has 150 samples with a couple of duplicates. 
The OvO decisions - # can resolve most of the ties: - assert len(np.unique(decisions[:, class_idx])) > 146 + # can resolve most of the ties; just to be safe, we allow for up to 9 + # redundant values (i.e., more than 140 unique values), although there + # will probably be even fewer than that: + assert len(np.unique(decisions[:, class_idx])) > 140 def test_ovo_gridsearch():