DOC improve the cost-sensitive learning example #29149

Merged
145 changes: 64 additions & 81 deletions examples/model_selection/plot_cost_sensitive_learning.py
@@ -172,7 +172,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
return np.sum(cm * gain_matrix)


scoring["cost_gain"] = make_scorer(
scoring["credit_gain"] = make_scorer(
credit_gain_score, neg_label=neg_label, pos_label=pos_label
)
# %%
@@ -247,7 +247,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
# However, we recall that the original aim was to minimize the cost (or maximize the
# gain) as defined by the business metric. We can compute the value of the business
# metric:
print(f"Business defined metric: {scoring['cost_gain'](model, X_test, y_test)}")
print(f"Business defined metric: {scoring['credit_gain'](model, X_test, y_test)}")

# %%
# At this stage we don't know if any other cut-off can lead to a greater gain. To find
@@ -272,7 +272,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):

tuned_model = TunedThresholdClassifierCV(
estimator=model,
scoring=scoring["cost_gain"],
scoring=scoring["credit_gain"],
store_cv_results=True, # necessary to inspect all results
)
tuned_model.fit(X_train, y_train)
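
# %%
# Since we passed `store_cv_results=True`, the score obtained for each
# candidate threshold can be inspected after fitting. The snippet below is an
# illustrative sketch (not part of the original diff); it assumes that
# `cv_results_` exposes the "thresholds" and "scores" arrays, as in recent
# scikit-learn releases.
import matplotlib.pyplot as plt

results = tuned_model.cv_results_
_, ax = plt.subplots()
ax.plot(results["thresholds"], results["scores"])
ax.set_xlabel("Decision threshold")
ax.set_ylabel("Objective score (credit gain)")
_ = ax.set_title("Objective score as a function of the decision threshold")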
@@ -379,7 +379,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
#
# We can now check if choosing this cut-off point leads to a better score on the testing
# set:
print(f"Business defined metric: {scoring['cost_gain'](tuned_model, X_test, y_test)}")
print(f"Business defined metric: {scoring['credit_gain'](tuned_model, X_test, y_test)}")

# %%
# We observe that tuning the decision threshold almost improves our business gains
@@ -487,7 +487,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
fraud = target == 1
amount_fraud = data["Amount"][fraud]
_, ax = plt.subplots()
ax.hist(amount_fraud, bins=100)
ax.hist(amount_fraud, bins=30)
ax.set_title("Amount of fraud transaction")
_ = ax.set_xlabel("Amount (€)")

@@ -500,20 +500,18 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
# a gain of 2% of the amount of the transaction. However, accepting a fraudulent
# transaction results in a loss of the amount of the transaction. As stated in [2]_, the
# gain and loss related to refusals (of fraudulent and legitimate transactions) are not
# trivial to define. Here, we define that a refusal of a legitimate transaction is
# estimated to a loss of 5€ while the refusal of a fraudulent transaction is estimated
# to a gain of 50€ and the amount of the transaction. Therefore, we define the
# following function to compute the total benefit of a given decision:
# trivial to define. Here, we define that a refusal of a legitimate transaction
# is estimated at a loss of 5€ while the refusal of a fraudulent transaction is
# estimated at a gain of 50€. Therefore, we define the following function to
# compute the total benefit of a given decision:


def business_metric(y_true, y_pred, amount):
mask_true_positive = (y_true == 1) & (y_pred == 1)
mask_true_negative = (y_true == 0) & (y_pred == 0)
mask_false_positive = (y_true == 0) & (y_pred == 1)
mask_false_negative = (y_true == 1) & (y_pred == 0)
fraudulent_refuse = (mask_true_positive.sum() * 50) + amount[
mask_true_positive
].sum()
Comment on lines -514 to -516

Member:

Not sure why the amount here is removed; when I gave a talk on this, this was a nice point to make.

Member:

Because you don't gain the amount when rejecting a fraudulent case :). Indeed, you are just not losing it.

The amount itself only contributes when accepting the claim, by taking a proportional amount.

Member:

Yes, but we have the false negative counterpart. Removing this puts massive pressure on making sure we have no false negatives, and will be somewhat okay with true positives being low, since the amounts are usually a lot larger than 50€.

Member Author:

The updated PR puts our example cost matrix better in line with the one proposed in Elkan's 2001 paper on cost-sensitive learning (the fixed costs and gains are not the same but the variable components are compatible). Here is an excerpt of the relevant paragraph (part of section 1.2):

[image: excerpt from section 1.2 of Elkan (2001)]

I don't understand why we would put a gain that is proportional to the amount in the fraudulent_refuse case. If you catch a fraudster, nobody will pay the bank the amount of the transaction the fraudster would have otherwise stolen.

Member:

Fair enough. It would be nice to have this chart in the example actually; it makes things quite clear.

Member Author:

I'll keep that in mind for a future iteration on this example.

fraudulent_refuse = mask_true_positive.sum() * 50
fraudulent_accept = -amount[mask_false_negative].sum()
legitimate_refuse = mask_false_positive.sum() * -5
legitimate_accept = (amount[mask_true_negative] * 0.02).sum()
@@ -540,7 +538,6 @@ def business_metric(y_true, y_pred, amount):
amount = credit_card.frame["Amount"].to_numpy()

# %%
# We first start to train a dummy classifier to have some baseline results.
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test, amount_train, amount_test = (
@@ -550,50 +547,44 @@ def business_metric(y_true, y_pred, amount):
)

# %%
# We first evaluate some baseline policies to serve as reference. Recall that
# class "0" is the legitimate class and class "1" is the fraudulent class.
from sklearn.dummy import DummyClassifier

easy_going_classifier = DummyClassifier(strategy="constant", constant=0)
easy_going_classifier.fit(data_train, target_train)
benefit_cost = business_scorer(
easy_going_classifier, data_test, target_test, amount=amount_test
always_accept_policy = DummyClassifier(strategy="constant", constant=0)
always_accept_policy.fit(data_train, target_train)
benefit = business_scorer(
always_accept_policy, data_test, target_test, amount=amount_test
)
print(f"Benefit/cost of our easy-going classifier: {benefit_cost:,.2f}€")
print(f"Benefit of the 'always accept' policy: {benefit:,.2f}€")

# %%
# A classifier that predict all transactions as legitimate would create a profit of
# around 220,000.€ We make the same evaluation for a classifier that predicts all
# A policy that considers all transactions as legitimate would create a profit of
# around 220,000€. We make the same evaluation for a policy that considers all
# transactions as fraudulent.
intolerant_classifier = DummyClassifier(strategy="constant", constant=1)
intolerant_classifier.fit(data_train, target_train)
benefit_cost = business_scorer(
intolerant_classifier, data_test, target_test, amount=amount_test
always_reject_policy = DummyClassifier(strategy="constant", constant=1)
always_reject_policy.fit(data_train, target_train)
benefit = business_scorer(
always_reject_policy, data_test, target_test, amount=amount_test
)
print(f"Benefit/cost of our intolerant classifier: {benefit_cost:,.2f}€")
print(f"Benefit of the 'always reject' policy: {benefit:,.2f}€")

# %%
# Such a classifier create a loss of around 670,000.€ A predictive model should allow
# us to make a profit larger than 220,000.€ It is interesting to compare this business
# metric with another "standard" statistical metric such as the balanced accuracy.
from sklearn.metrics import get_scorer

balanced_accuracy_scorer = get_scorer("balanced_accuracy")
print(
"Balanced accuracy of our easy-going classifier: "
f"{balanced_accuracy_scorer(easy_going_classifier, data_test, target_test):.3f}"
)
print(
"Balanced accuracy of our intolerant classifier: "
f"{balanced_accuracy_scorer(intolerant_classifier, data_test, target_test):.3f}"
)

# %%
# This is not a surprise that the balanced accuracy is at 0.5 for both classifiers.
# However, we need to be careful in the rest of the evaluation: we potentially can
# obtain a model with a decent balanced accuracy that does not make any profit.
# In this case, the model would be harmful for our business.
# Such a policy would entail a catastrophic loss: around 670,000€. This is
# expected since the vast majority of the transactions are legitimate and the
# policy would refuse them at a non-trivial cost.
#
# A predictive model that adapts the accept/reject decisions on a
# per-transaction basis should ideally allow us to make a profit larger than
# the 220,000€ of the best of our constant baseline policies.
#
# Let's now create a predictive model using a logistic regression without tuning the
# decision threshold.
# We start with a logistic regression model with the default decision threshold
# at 0.5. Here we tune the hyperparameter `C` of the logistic regression with a
# proper scoring rule (the log loss) to ensure that the model's probabilistic
# predictions returned by its `predict_proba` method are as accurate as
# possible, irrespective of the value chosen for the decision threshold.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
@@ -604,21 +595,19 @@ def business_metric(y_true, y_pred, amount):
model = GridSearchCV(logistic_regression, param_grid, scoring="neg_log_loss").fit(
data_train, target_train
)
model

# %%
print(
"Benefit/cost of our logistic regression: "
"Benefit of logistic regression with default threshold: "
f"{business_scorer(model, data_test, target_test, amount=amount_test):,.2f}€"
)
print(
"Balanced accuracy of our logistic regression: "
f"{balanced_accuracy_scorer(model, data_test, target_test):.3f}"
)

# %%
# By observing the balanced accuracy, we see that our predictive model is learning
# some associations between the features and the target. The business metric also shows
# that our model is beating the baseline in terms of profit and it would be already
# beneficial to use it instead of ignoring the fraud detection problem.
# The business metric shows that our predictive model with a default decision
# threshold is already beating the baseline in terms of profit, and it would
# already be beneficial to use it to accept or reject transactions instead of
# accepting all transactions.
#
# Tuning the decision threshold
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -642,30 +631,21 @@ def business_metric(y_true, y_pred, amount):
# automatically dispatching this metadata to the underlying scorer.
tuned_model.fit(data_train, target_train, amount=amount_train)
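
# %%
# For reference, the following is a minimal sketch (not shown in the visible
# part of the diff) of how a scorer can declare that it needs the `amount`
# metadata so that it is routed to the business metric during the internal
# cross-validation; it assumes a scikit-learn version with metadata routing
# support.
import sklearn
from sklearn.metrics import make_scorer

sklearn.set_config(enable_metadata_routing=True)
business_scorer = make_scorer(business_metric).set_score_request(amount=True)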

# %%
# We observe that the tuned decision threshold is far away from the default 0.5:
print(f"Tuned decision threshold: {tuned_model.best_threshold_:.2f}")

# %%
print(
"Benefit/cost of our logistic regression: "
"Benefit of logistic regression with a tuned threshold: "
f"{business_scorer(tuned_model, data_test, target_test, amount=amount_test):,.2f}€"
)
print(
"Balanced accuracy of our logistic regression: "
f"{balanced_accuracy_scorer(tuned_model, data_test, target_test):.3f}"
)

# %%
# We observe that tuning the decision threshold increases the expected profit of
# deploying our model as estimated by the business metric.
# Eventually, the balanced accuracy also increased. Note that it might not always be
# the case because the statistical metric is not necessarily a surrogate of the
# business metric. It is therefore important, whenever possible, optimize the decision
# threshold with respect to the business metric.
#
# Finally, the estimate of the business metric itself can be unreliable, in
# particular when the number of data points in the minority class is so small.
# Any business impact estimated by cross-validation of a business metric on
# historical data (offline evaluation) should ideally be confirmed by A/B testing
# on live data (online evaluation). Note however that A/B testing models is
# beyond the scope of the scikit-learn library itself.
# We observe that tuning the decision threshold increases the expected profit
# when deploying our model, as indicated by the business metric. It is
# therefore valuable, whenever possible, to optimize the decision threshold
# with respect to the business metric.
#
# Manually setting the decision threshold instead of tuning it
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -684,19 +664,22 @@ def business_metric(y_true, y_pred, amount):
from sklearn.model_selection import FixedThresholdClassifier

model_fixed_threshold = FixedThresholdClassifier(
estimator=model, threshold=tuned_model.best_threshold_
estimator=model, threshold=tuned_model.best_threshold_, prefit=True
).fit(data_train, target_train)

# %%
business_score = business_scorer(
model_fixed_threshold, data_test, target_test, amount=amount_test
)
print(f"Benefit/cost of our logistic regression: {business_score:,.2f}€")
print(
"Balanced accuracy of our logistic regression: "
f"{balanced_accuracy_scorer(model_fixed_threshold, data_test, target_test):.3f}"
)
print(f"Benefit of logistic regression with a tuned threshold: {business_score:,.2f}€")

# %%
# We observe that we obtained the exact same results but the fitting process was much
# faster since we did not perform any search.
# We observe that we obtained the exact same results but the fitting process
# was much faster since we did not perform any hyper-parameter search.
#
# Finally, the estimate of the (average) business metric itself can be unreliable, in
# particular when the number of data points in the minority class is very small.
# Any business impact estimated by cross-validation of a business metric on
# historical data (offline evaluation) should ideally be confirmed by A/B testing
# on live data (online evaluation). Note however that A/B testing models is
# beyond the scope of the scikit-learn library itself.