FIX CalibratedClassifierCV to handle correctly sample_weight when ensemble=False #20638
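The tests added in this PR assert that `sample_weight` passed to `CalibratedClassifierCV.fit` is actually used by the calibrator, including when `ensemble=False`. The following is a minimal usage sketch pieced together from the test code in the diff below; the dataset sizes, weight values, and score threshold are illustrative choices, not values mandated by the fix.

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_blobs
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Imbalanced two-class dataset: 100 samples in class 0, 1000 in class 1.
X, y = make_blobs((100, 1000), center_box=(-1, 1), random_state=42)
# Weights that compensate for the class imbalance.
sample_weight = np.where(y == 0, 0.9, 0.1)

X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X, y, sample_weight, stratify=y, random_state=42
)

# Scale manually; see the FIXME in the tests about Pipeline and sample_weight.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ensemble=False is the configuration addressed by this PR: the single
# calibrator fitted on cross-validated predictions must also see the weights.
calibrated_clf = CalibratedClassifierCV(
    LinearSVC(random_state=42), method="sigmoid", ensemble=False
)
calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
proba = calibrated_clf.predict_proba(X_test)[:, 1]

# When the weights are taken into account, the weighted Brier score stays low.
print(brier_score_loss(y_test, proba, sample_weight=sw_test))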
Changes from all commits
@@ -166,6 +166,12 @@ def test_sample_weight(data, method, ensemble):
    X_train, y_train, sw_train = X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(
        X_train
    )  # compute mean, std and transform training data as well
Review comment: remove the comment
Review comment: can now remove this comment as well
    X_test = scaler.transform(X_test)

    base_estimator = LinearSVC(random_state=42)
    calibrated_clf = CalibratedClassifierCV(
        base_estimator, method=method, ensemble=ensemble

@@ -182,6 +188,68 @@ def test_sample_weight(data, method, ensemble):
    assert diff > 0.1


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
def test_sample_weight_class_imbalanced(method, ensemble):
"""Use an imbalanced dataset to check that `sample_weight` is taken into | ||||||||||||||||||||||||
account in the calibration estimator.""" | ||||||||||||||||||||||||
X, y = make_blobs((100, 1000), center_box=(-1, 1), random_state=42) | ||||||||||||||||||||||||

    # Compute weights to compensate for the unbalance of the dataset
    weights = np.array([0.9, 0.1])
    sample_weight = weights[(y == 1).astype(int)]

    X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
        X, y, sample_weight, stratify=y, random_state=42
    )

    # FIXME: ideally we should create a `Pipeline` with the `StandardScaler`
    # followed by the `LinearSVC`. However, `Pipeline` does not expose
    # `sample_weight` and it will be silently ignored.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    base_estimator = LinearSVC(random_state=42)
    calibrated_clf = CalibratedClassifierCV(
        base_estimator, method=method, ensemble=ensemble
    )
    calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
    predictions = calibrated_clf.predict_proba(X_test)[:, 1]

    assert brier_score_loss(y_test, predictions, sample_weight=sw_test) < 0.2
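A short aside on the FIXME note in the test above, as a hedged sketch rather than anything this PR changes: when a `Pipeline` is fitted directly, a fit parameter such as `sample_weight` can be routed to a named step with the `step__param` syntax; the FIXME's point is that `CalibratedClassifierCV.fit` cannot perform that routing on the caller's behalf, which is why the test scales the data manually. The step names below are illustrative.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X, y = make_blobs((100, 1000), center_box=(-1, 1), random_state=42)
sample_weight = np.where(y == 0, 0.9, 0.1)

pipe = Pipeline([("scaler", StandardScaler()), ("svc", LinearSVC(random_state=42))])
# The "svc__" prefix routes the weights to the LinearSVC step only.
pipe.fit(X, y, svc__sample_weight=sample_weight)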


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
def test_sample_weight_class_imbalanced_ensemble_equivalent(method):
    X, y = make_blobs((100, 1000), center_box=(-1, 1), random_state=42)
Review comment: could you add a small docstring mentioning what we try to achieve here

    # Compute weights to compensate for the unbalance of the dataset
    sample_weight = 9 * (y == 0) + 1
Review comment: could you make the same change

    X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
        X, y, sample_weight, stratify=y, random_state=42
    )

    scaler = StandardScaler()
Review comment: you can add a comment as before
    X_train = scaler.fit_transform(
        X_train
    )  # compute mean, std and transform training data as well
Review comment: remove the comment
    X_test = scaler.transform(X_test)

    predictions = []
    for ensemble in [True, False]:
        base_estimator = LinearSVC(random_state=42)
        calibrated_clf = CalibratedClassifierCV(
            base_estimator, method=method, ensemble=ensemble
        )
        calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
        predictions.append(calibrated_clf.predict_proba(X_test)[:, 1])

    diff = np.linalg.norm(predictions[0] - predictions[1])
    assert diff < 1.5


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
def test_parallel_execution(data, method, ensemble):