Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,9 @@ Bug fixes
to make it consistent with the documentation and
``decision_function``. By Artem Sobolev.

- Fixed handling of ties in :class:`isotonic.IsotonicRegression`.
We now use the weighted average of targets (secondary method). By
`Andreas Müller`_ and `Michael Bommarito <http://bommaritollc.com/>`_.

API changes summary
-------------------
Expand Down
2,837 changes: 1,757 additions & 1,080 deletions sklearn/_isotonic.c

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions sklearn/_isotonic.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,56 @@ def _isotonic_regression(np.ndarray[DOUBLE, ndim=1] y,
break

return solution


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _make_unique(np.ndarray[dtype=np.float64_t] X,
np.ndarray[dtype=np.float64_t] y,
np.ndarray[dtype=np.float64_t] sample_weights):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to add a docstring to explain that the number of samples is reduced whenever there is a tie in X: the matching y values are averaged using the weights and the sample_weights are accumulated to conserve the original info.

It is my understanding that X should be sorted before calling this utility, this should also be made explicit in the docstring or in an inline comment to improve understandability of the code.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

"""Average targets for duplicate X, drop duplicates.

Aggregates duplicate X values into a single X value where
the target y is a (sample_weighted) average of the individual
targets.

Assumes that X is ordered, so that all duplicates follow each other.
"""
unique_values = len(np.unique(X))
if unique_values == len(X):
return X, y, sample_weights
cdef np.ndarray[dtype=np.float64_t] y_out = np.empty(unique_values)
cdef np.ndarray[dtype=np.float64_t] x_out = np.empty(unique_values)
cdef np.ndarray[dtype=np.float64_t] weights_out = np.empty(unique_values)

cdef float current_x = X[0]
cdef float current_y = 0
cdef float current_weight = 0
cdef float y_old = 0
cdef int i = 0
cdef int current_count = 0
cdef int j
cdef float x
cdef int n_samples = len(X)
for j in range(n_samples):
x = X[j]
if x != current_x:
# next unique value
x_out[i] = current_x
weights_out[i] = current_weight / current_count
y_out[i] = current_y / current_weight
i += 1
current_x = x
current_weight = sample_weights[j]
current_y = y[j] * sample_weights[j]
current_count = 1
else:
current_weight += sample_weights[j]
current_y += y[j] * sample_weights[j]
current_count += 1

x_out[i] = current_x
weights_out[i] = current_weight / current_count
y_out[i] = current_y / current_weight
return x_out, y_out, weights_out
28 changes: 4 additions & 24 deletions sklearn/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
from .preprocessing import LabelBinarizer
from .utils import check_random_state
from .utils import check_X_y, check_array, indexable, column_or_1d
from .utils.validation import check_is_fitted
from .isotonic import IsotonicRegression
Expand Down Expand Up @@ -59,9 +58,6 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
If "prefit" is passed, it is assumed that base_estimator has been
fitted already and all data is used for calibration.

random_state : int, RandomState instance or None (default=None)
Used to randomly break ties when method is 'isotonic'.

Attributes
----------
classes_ : array, shape (n_classes)
Expand All @@ -86,12 +82,10 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
.. [4] Predicting Good Probabilities with Supervised Learning,
A. Niculescu-Mizil & R. Caruana, ICML 2005
"""
def __init__(self, base_estimator=None, method='sigmoid', cv=3,
random_state=None):
def __init__(self, base_estimator=None, method='sigmoid', cv=3):
self.base_estimator = base_estimator
self.method = method
self.cv = cv
self.random_state = random_state

def fit(self, X, y, sample_weight=None):
"""Fit the calibrated model
Expand All @@ -116,7 +110,6 @@ def fit(self, X, y, sample_weight=None):
X, y = indexable(X, y)
lb = LabelBinarizer().fit(y)
self.classes_ = lb.classes_
random_state = check_random_state(self.random_state)

# Check that we each cross-validation fold can have at least one
# example per class
Expand All @@ -136,7 +129,7 @@ def fit(self, X, y, sample_weight=None):

if self.cv == "prefit":
calibrated_classifier = _CalibratedClassifier(
base_estimator, method=self.method, random_state=random_state)
base_estimator, method=self.method)
if sample_weight is not None:
calibrated_classifier.fit(X, y, sample_weight)
else:
Expand Down Expand Up @@ -164,8 +157,7 @@ def fit(self, X, y, sample_weight=None):
this_estimator.fit(X[train], y[train])

calibrated_classifier = _CalibratedClassifier(
this_estimator, method=self.method,
random_state=random_state)
this_estimator, method=self.method)
if sample_weight is not None:
calibrated_classifier.fit(X[test], y[test],
sample_weight[test])
Expand Down Expand Up @@ -242,9 +234,6 @@ class _CalibratedClassifier(object):
corresponds to Platt's method or 'isotonic' which is a
non-parameteric approach based on isotonic regression.

random_state : int, RandomState instance or None (default=None)
Used to randomly break ties when method is 'isotonic'.

References
----------
.. [1] Obtaining calibrated probability estimates from decision trees
Expand All @@ -259,11 +248,9 @@ class _CalibratedClassifier(object):
.. [4] Predicting Good Probabilities with Supervised Learning,
A. Niculescu-Mizil & R. Caruana, ICML 2005
"""
def __init__(self, base_estimator, method='sigmoid',
random_state=None):
def __init__(self, base_estimator, method='sigmoid'):
self.base_estimator = base_estimator
self.method = method
self.random_state = random_state

def _preproc(self, X):
n_classes = len(self.classes_)
Expand Down Expand Up @@ -312,13 +299,6 @@ def fit(self, X, y, sample_weight=None):
for k, this_df in zip(idx_pos_class, df.T):
if self.method == 'isotonic':
calibrator = IsotonicRegression(out_of_bounds='clip')
# XXX: isotonic regression cannot deal correctly with
# situations in which multiple inputs are identical but
# have different outputs. Since this is not untypical
# when calibrating, we add some small random jitter to
# the inputs.
jitter = self.random_state.normal(0, 1e-10, this_df.shape[0])
this_df = this_df + jitter
elif self.method == 'sigmoid':
calibrator = _SigmoidCalibration()
else:
Expand Down
70 changes: 27 additions & 43 deletions sklearn/isotonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from scipy.stats import spearmanr
from .base import BaseEstimator, TransformerMixin, RegressorMixin
from .utils import as_float_array, check_array, check_consistent_length
from ._isotonic import _isotonic_regression
from .utils.fixes import astype
from ._isotonic import _isotonic_regression, _make_unique
import warnings
import math

Expand Down Expand Up @@ -200,12 +201,24 @@ class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
The stepwise interpolating function that covers the domain
X_.

Notes
-----
Ties are broken using the secondary method from Leeuw, 1977.

References
----------
Isotonic Median Regression: A Linear Programming Approach
Nilotpal Chakravarti
Mathematics of Operations Research
Vol. 14, No. 2 (May, 1989), pp. 303-308

Isotone Optimization in R : Pool-Adjacent-Violators
Algorithm (PAVA) and Active Set Methods
Leeuw, Hornik, Mair
Journal of Statistical Software 2009

Correctness of Kruskal's algorithms for monotone regression with ties
Leeuw, Psychometrica, 1977
"""
def __init__(self, y_min=None, y_max=None, increasing=True,
out_of_bounds='nan'):
Expand All @@ -228,8 +241,12 @@ def _build_f(self, X, y):
.format(self.out_of_bounds))

bounds_error = self.out_of_bounds == "raise"
self.f_ = interpolate.interp1d(X, y, kind='slinear',
bounds_error=bounds_error)
if len(y) == 1:
# single y, constant prediction
self.f_ = lambda x: y.repeat(x.shape)
else:
self.f_ = interpolate.interp1d(X, y, kind='slinear',
bounds_error=bounds_error)

def _build_y(self, X, y, sample_weight):
"""Build the y_ IsotonicRegression."""
Expand All @@ -249,8 +266,13 @@ def _build_y(self, X, y, sample_weight):

order = np.lexsort((y, X))
order_inv = np.argsort(order)
self.X_ = as_float_array(X[order], copy=False)
self.y_ = isotonic_regression(y[order], sample_weight, self.y_min,
if sample_weight is None:
sample_weight = np.ones(len(y))
X, y, sample_weight = [astype(array[order], np.float64, copy=False)
for array in [X, y, sample_weight]]
unique_X, unique_y, unique_sample_weight = _make_unique(X, y, sample_weight)
self.X_ = unique_X
self.y_ = isotonic_regression(unique_y, unique_sample_weight, self.y_min,
self.y_max, increasing=self.increasing_)

return order_inv
Expand Down Expand Up @@ -319,44 +341,6 @@ def transform(self, T):
T = np.clip(T, self.X_min_, self.X_max_)
return self.f_(T)

def fit_transform(self, X, y, sample_weight=None):
"""Fit model and transform y by linear interpolation.

Parameters
----------
X : array-like, shape=(n_samples,)
Training data.

y : array-like, shape=(n_samples,)
Training target.

sample_weight : array-like, shape=(n_samples,), optional, default: None
Weights. If set to None, all weights will be equal to 1 (equal
weights).

Returns
-------
y_ : array, shape=(n_samples,)
The transformed data.

Notes
-----
X doesn't influence the result of `fit_transform`. It is however stored
for future use, as `transform` needs X to interpolate new input
data.
"""
# Build y_
order_inv = self._build_y(X, y, sample_weight)

# Handle the left and right bounds on X
self.X_min_ = np.min(self.X_)
self.X_max_ = np.max(self.X_)

# Build f_
self._build_f(self.X_, self.y_)

return self.y_[order_inv]

def predict(self, T):
"""Predict new data by linear interpolation.

Expand Down
3 changes: 1 addition & 2 deletions sklearn/tests/test_calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ def test_sample_weight_warning():

for method in ['sigmoid', 'isotonic']:
base_estimator = LinearSVC(random_state=42)
calibrated_clf = CalibratedClassifierCV(base_estimator, method=method,
random_state=42)
calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
# LinearSVC does not currently support sample weights but they
# can still be used for the calibration step (with a warning)
msg = "LinearSVC does not support sample_weight."
Expand Down
Loading