Skip to content

Commit c1fa16f

Browse files
ogrisel and amueller
authored and committed
ENH no need for tie breaking jitter in calibration
The isotonic regression routine now implements deterministic tie-breaking by default.
1 parent ab556be commit c1fa16f

File tree

2 files changed

+5
-26
lines changed

2 files changed

+5
-26
lines changed

sklearn/calibration.py

Lines changed: 4 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@
1818

1919
from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
2020
from .preprocessing import LabelBinarizer
21-
from .utils import check_random_state
2221
from .utils import check_X_y, check_array, indexable, column_or_1d
2322
from .utils.validation import check_is_fitted
2423
from .isotonic import IsotonicRegression
@@ -59,9 +58,6 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
5958
If "prefit" is passed, it is assumed that base_estimator has been
6059
fitted already and all data is used for calibration.
6160
62-
random_state : int, RandomState instance or None (default=None)
63-
Used to randomly break ties when method is 'isotonic'.
64-
6561
Attributes
6662
----------
6763
classes_ : array, shape (n_classes)
@@ -86,12 +82,10 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
8682
.. [4] Predicting Good Probabilities with Supervised Learning,
8783
A. Niculescu-Mizil & R. Caruana, ICML 2005
8884
"""
89-
def __init__(self, base_estimator=None, method='sigmoid', cv=3,
90-
random_state=None):
85+
def __init__(self, base_estimator=None, method='sigmoid', cv=3):
9186
self.base_estimator = base_estimator
9287
self.method = method
9388
self.cv = cv
94-
self.random_state = random_state
9589

9690
def fit(self, X, y, sample_weight=None):
9791
"""Fit the calibrated model
@@ -116,7 +110,6 @@ def fit(self, X, y, sample_weight=None):
116110
X, y = indexable(X, y)
117111
lb = LabelBinarizer().fit(y)
118112
self.classes_ = lb.classes_
119-
random_state = check_random_state(self.random_state)
120113

121114
# Check that we each cross-validation fold can have at least one
122115
# example per class
@@ -136,7 +129,7 @@ def fit(self, X, y, sample_weight=None):
136129

137130
if self.cv == "prefit":
138131
calibrated_classifier = _CalibratedClassifier(
139-
base_estimator, method=self.method, random_state=random_state)
132+
base_estimator, method=self.method)
140133
if sample_weight is not None:
141134
calibrated_classifier.fit(X, y, sample_weight)
142135
else:
@@ -164,8 +157,7 @@ def fit(self, X, y, sample_weight=None):
164157
this_estimator.fit(X[train], y[train])
165158

166159
calibrated_classifier = _CalibratedClassifier(
167-
this_estimator, method=self.method,
168-
random_state=random_state)
160+
this_estimator, method=self.method)
169161
if sample_weight is not None:
170162
calibrated_classifier.fit(X[test], y[test],
171163
sample_weight[test])
@@ -242,9 +234,6 @@ class _CalibratedClassifier(object):
242234
corresponds to Platt's method or 'isotonic' which is a
243235
non-parameteric approach based on isotonic regression.
244236
245-
random_state : int, RandomState instance or None (default=None)
246-
Used to randomly break ties when method is 'isotonic'.
247-
248237
References
249238
----------
250239
.. [1] Obtaining calibrated probability estimates from decision trees
@@ -259,11 +248,9 @@ class _CalibratedClassifier(object):
259248
.. [4] Predicting Good Probabilities with Supervised Learning,
260249
A. Niculescu-Mizil & R. Caruana, ICML 2005
261250
"""
262-
def __init__(self, base_estimator, method='sigmoid',
263-
random_state=None):
251+
def __init__(self, base_estimator, method='sigmoid'):
264252
self.base_estimator = base_estimator
265253
self.method = method
266-
self.random_state = random_state
267254

268255
def _preproc(self, X):
269256
n_classes = len(self.classes_)
@@ -312,13 +299,6 @@ def fit(self, X, y, sample_weight=None):
312299
for k, this_df in zip(idx_pos_class, df.T):
313300
if self.method == 'isotonic':
314301
calibrator = IsotonicRegression(out_of_bounds='clip')
315-
# XXX: isotonic regression cannot deal correctly with
316-
# situations in which multiple inputs are identical but
317-
# have different outputs. Since this is not untypical
318-
# when calibrating, we add some small random jitter to
319-
# the inputs.
320-
jitter = self.random_state.normal(0, 1e-10, this_df.shape[0])
321-
this_df = this_df + jitter
322302
elif self.method == 'sigmoid':
323303
calibrator = _SigmoidCalibration()
324304
else:

sklearn/tests/test_calibration.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -114,8 +114,7 @@ def test_sample_weight_warning():
114114

115115
for method in ['sigmoid', 'isotonic']:
116116
base_estimator = LinearSVC(random_state=42)
117-
calibrated_clf = CalibratedClassifierCV(base_estimator, method=method,
118-
random_state=42)
117+
calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
119118
# LinearSVC does not currently support sample weights but they
120119
# can still be used for the calibration step (with a warning)
121120
msg = "LinearSVC does not support sample_weight."

0 commit comments

Comments (0)