@@ -1,3 +1,4 @@
+from copy import deepcopy
 import pickle
 import tempfile
 import shutil
@@ -16,9 +17,18 @@
 from sklearn.utils._testing import ignore_warnings

 from sklearn.base import BaseEstimator
-from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score,
-                             log_loss, precision_score, recall_score,
-                             jaccard_score)
+from sklearn.metrics import (
+    average_precision_score,
+    brier_score_loss,
+    f1_score,
+    fbeta_score,
+    jaccard_score,
+    log_loss,
+    precision_score,
+    r2_score,
+    recall_score,
+    roc_auc_score,
+)
 from sklearn.metrics import cluster as cluster_module
 from sklearn.metrics import check_scoring
 from sklearn.metrics._scorer import (_PredictScorer, _passthrough_scorer,
@@ -618,6 +628,8 @@ def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count,
     mock_est.predict = predict_func
     mock_est.predict_proba = predict_proba_func
     mock_est.decision_function = decision_function_func
+    # add the classes that would be found during fit
+    mock_est.classes_ = np.array([0, 1])

     scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers)
     multi_scorer = _MultimetricScorer(**scorer_dict)
@@ -747,3 +759,211 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
     msg = "'Perceptron' object has no attribute 'predict_proba'"
     with pytest.raises(AttributeError, match=msg):
         scorer(lr, X, y)
+
+
+@pytest.fixture
+def string_labeled_classification_problem():
+    """Train a classifier on a binary problem with a string target.
+
+    The classifier is trained on a binary classification problem where the
+    minority class of interest has a string label that is intentionally not
+    the greatest class label in lexicographic order. In this case, "cancer"
+    is the positive label, and `classifier.classes_` is
+    `["cancer", "not cancer"]`.
+
+    In addition, the dataset is imbalanced to better identify problems when
+    using non-symmetric performance metrics such as f1-score, average
+    precision and so on.
+
+    Returns
+    -------
+    classifier : estimator object
+        Trained classifier on the binary problem.
+    X_test : ndarray of shape (n_samples, n_features)
+        Data to be used as testing set in tests.
+    y_test : ndarray of shape (n_samples,), dtype=object
+        Binary target where labels are strings.
+    y_pred : ndarray of shape (n_samples,), dtype=object
+        Prediction of `classifier` when predicting for `X_test`.
+    y_pred_proba : ndarray of shape (n_samples, 2), dtype=np.float64
+        Probabilities of `classifier` when predicting for `X_test`.
+    y_pred_decision : ndarray of shape (n_samples,), dtype=np.float64
+        Decision function values of `classifier` when predicting on `X_test`.
+    """
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.utils import shuffle
+
+    X, y = load_breast_cancer(return_X_y=True)
+    # create a highly imbalanced classification task
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, stratify=y, random_state=0,
+    )
+    classifier = LogisticRegression().fit(X_train, y_train)
+    y_pred = classifier.predict(X_test)
+    y_pred_proba = classifier.predict_proba(X_test)
+    y_pred_decision = classifier.decision_function(X_test)
+
+    return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision
+
+
+def test_average_precision_pos_label(string_labeled_classification_problem):
+    # check that _ThresholdScorer will lead to the right score when passing
+    # `pos_label`. Currently, only `average_precision_score` is defined to
+    # be such a scorer.
+    clf, X_test, y_test, _, y_pred_proba, y_pred_decision = \
+        string_labeled_classification_problem
+
+    pos_label = "cancer"
+    # we need to select the positive column or reverse the decision values
+    y_pred_proba = y_pred_proba[:, 0]
+    y_pred_decision = y_pred_decision * -1
+    assert clf.classes_[0] == pos_label
+
+    # check that when calling the scoring function, probability estimates and
+    # decision values lead to the same results
+    ap_proba = average_precision_score(
+        y_test, y_pred_proba, pos_label=pos_label
+    )
+    ap_decision_function = average_precision_score(
+        y_test, y_pred_decision, pos_label=pos_label
+    )
+    assert ap_proba == pytest.approx(ap_decision_function)
+
+    # create a scorer which requires passing a `pos_label` and
+    # check that it fails if `pos_label` is not provided
+    average_precision_scorer = make_scorer(
+        average_precision_score, needs_threshold=True,
+    )
+    err_msg = "pos_label=1 is not a valid label. It should be one of "
+    with pytest.raises(ValueError, match=err_msg):
+        average_precision_scorer(clf, X_test, y_test)
+
+    # otherwise, the scorer should give the same results as calling the
+    # scoring function
+    average_precision_scorer = make_scorer(
+        average_precision_score, needs_threshold=True, pos_label=pos_label
+    )
+    ap_scorer = average_precision_scorer(clf, X_test, y_test)
+
+    assert ap_scorer == pytest.approx(ap_proba)
+
+    # The above scorer call is using `clf.decision_function`. We will force
+    # it to use `clf.predict_proba`.
+    clf_without_predict_proba = deepcopy(clf)
+
+    def _predict_proba(self, X):
+        raise NotImplementedError
+
+    clf_without_predict_proba.predict_proba = partial(
+        _predict_proba, clf_without_predict_proba
+    )
+    # sanity check
+    with pytest.raises(NotImplementedError):
+        clf_without_predict_proba.predict_proba(X_test)
+
+    ap_scorer = average_precision_scorer(
+        clf_without_predict_proba, X_test, y_test
+    )
+    assert ap_scorer == pytest.approx(ap_proba)
+
+
+def test_brier_score_loss_pos_label(string_labeled_classification_problem):
+    # check that _ProbaScorer leads to the right score when `pos_label` is
+    # provided. Currently only the `brier_score_loss` is defined to be such
+    # a scorer.
+    clf, X_test, y_test, _, y_pred_proba, _ = \
+        string_labeled_classification_problem
+
+    pos_label = "cancer"
+    assert clf.classes_[0] == pos_label
+
+    # brier score loss is symmetric
+    brier_pos_cancer = brier_score_loss(
+        y_test, y_pred_proba[:, 0], pos_label="cancer"
+    )
+    brier_pos_not_cancer = brier_score_loss(
+        y_test, y_pred_proba[:, 1], pos_label="not cancer"
+    )
+    assert brier_pos_cancer == pytest.approx(brier_pos_not_cancer)
+
+    brier_scorer = make_scorer(
+        brier_score_loss, needs_proba=True, pos_label=pos_label,
+    )
+    assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer)
+
+
+@pytest.mark.parametrize(
+    "score_func", [f1_score, precision_score, recall_score, jaccard_score]
+)
+def test_non_symmetric_metric_pos_label(
+    score_func, string_labeled_classification_problem
+):
+    # check that _PredictScorer leads to the right score when `pos_label` is
+    # provided. We check for all supported metrics.
+    # Note: At some point we may end up having "scorer tags".
+    clf, X_test, y_test, y_pred, _, _ = string_labeled_classification_problem
+
+    pos_label = "cancer"
+    assert clf.classes_[0] == pos_label
+
+    score_pos_cancer = score_func(y_test, y_pred, pos_label="cancer")
+    score_pos_not_cancer = score_func(y_test, y_pred, pos_label="not cancer")
+
+    assert score_pos_cancer != pytest.approx(score_pos_not_cancer)
+
+    scorer = make_scorer(score_func, pos_label=pos_label)
+    assert scorer(clf, X_test, y_test) == pytest.approx(score_pos_cancer)
+
+
+@pytest.mark.parametrize(
+    "scorer",
+    [
+        make_scorer(
+            average_precision_score, needs_threshold=True, pos_label="xxx"
+        ),
+        make_scorer(brier_score_loss, needs_proba=True, pos_label="xxx"),
+        make_scorer(f1_score, pos_label="xxx")
+    ],
+    ids=["ThresholdScorer", "ProbaScorer", "PredictScorer"],
+)
+def test_scorer_select_proba_error(scorer):
+    # check that we raise the proper error when passing an unknown
+    # pos_label
+    X, y = make_classification(
+        n_classes=2, n_informative=3, n_samples=20, random_state=0
+    )
+    lr = LogisticRegression().fit(X, y)
+    assert scorer._kwargs["pos_label"] not in np.unique(y).tolist()
+
+    err_msg = "is not a valid label"
+    with pytest.raises(ValueError, match=err_msg):
+        scorer(lr, X, y)
+
+
+def test_scorer_no_op_multiclass_select_proba():
+    # check that calling a ProbaScorer on a multiclass problem does not raise
+    # even if `y_true` would be binary during the scoring.
+    # `_select_proba_binary` should not be called in this case.
+    X, y = make_classification(
+        n_classes=3, n_informative=3, n_samples=20, random_state=0
+    )
+    lr = LogisticRegression().fit(X, y)
+
+    mask_last_class = y == lr.classes_[-1]
+    X_test, y_test = X[~mask_last_class], y[~mask_last_class]
+    assert_array_equal(np.unique(y_test), lr.classes_[:-1])
+
+    scorer = make_scorer(
+        roc_auc_score, needs_proba=True, multi_class="ovo", labels=lr.classes_,
+    )
+    scorer(lr, X_test, y_test)