Merge pull request #4302 from amueller/isotonic_regression_duplicate_fixes

ogrisel · ogrisel · commit ec84a007f401 · 2015-03-06T17:29:46.000+01:00
[MRG+1] Isotonic regression duplicate fixes
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -333,6 +333,9 @@ Bug fixes
       to make it consistent with the documentation and
       ``decision_function``. By Artem Sobolev.
 
+    - Fixed handling of ties in :class:`isotonic.IsotonicRegression`.
+      We now use the weighted average of targets (secondary method). By
+      `Andreas Müller`_ and `Michael Bommarito <http://bommaritollc.com/>`_.
 
 API changes summary
 -------------------
diff --git a/sklearn/_isotonic.c b/sklearn/_isotonic.c
diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx
@@ -69,3 +69,56 @@ def _isotonic_regression(np.ndarray[DOUBLE, ndim=1] y,
             break
 
     return solution
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def _make_unique(np.ndarray[dtype=np.float64_t] X,
+                  np.ndarray[dtype=np.float64_t] y,
+                  np.ndarray[dtype=np.float64_t] sample_weights):
+    """Average targets for duplicate X, drop duplicates.
+
+    Aggregates duplicate X values into a single X value where
+    the target y is a (sample_weighted) average of the individual
+    targets.
+
+    Assumes that X is ordered, so that all duplicates follow each other.
+    """
+    unique_values = len(np.unique(X))
+    if unique_values == len(X):
+        return X, y, sample_weights
+    cdef np.ndarray[dtype=np.float64_t] y_out = np.empty(unique_values)
+    cdef np.ndarray[dtype=np.float64_t] x_out = np.empty(unique_values)
+    cdef np.ndarray[dtype=np.float64_t] weights_out = np.empty(unique_values)
+
+    cdef float current_x = X[0]
+    cdef float current_y = 0
+    cdef float current_weight = 0
+    cdef float y_old = 0
+    cdef int i = 0
+    cdef int current_count = 0
+    cdef int j
+    cdef float x
+    cdef int n_samples = len(X)
+    for j in range(n_samples):
+        x = X[j]
+        if x != current_x:
+            # next unique value
+            x_out[i] = current_x
+            weights_out[i] = current_weight / current_count
+            y_out[i] = current_y / current_weight
+            i += 1
+            current_x = x
+            current_weight = sample_weights[j]
+            current_y = y[j] * sample_weights[j]
+            current_count = 1
+        else:
+            current_weight += sample_weights[j]
+            current_y += y[j] * sample_weights[j]
+            current_count += 1
+            
+    x_out[i] = current_x
+    weights_out[i] = current_weight / current_count
+    y_out[i] = current_y / current_weight    
+    return x_out, y_out, weights_out
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
@@ -18,7 +18,6 @@
 
 from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
 from .preprocessing import LabelBinarizer
-from .utils import check_random_state
 from .utils import check_X_y, check_array, indexable, column_or_1d
 from .utils.validation import check_is_fitted
 from .isotonic import IsotonicRegression
@@ -59,9 +58,6 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
         If "prefit" is passed, it is assumed that base_estimator has been
         fitted already and all data is used for calibration.
 
-    random_state : int, RandomState instance or None (default=None)
-        Used to randomly break ties when method is 'isotonic'.
-
     Attributes
     ----------
     classes_ : array, shape (n_classes)
@@ -86,12 +82,10 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
     .. [4] Predicting Good Probabilities with Supervised Learning,
            A. Niculescu-Mizil & R. Caruana, ICML 2005
     """
-    def __init__(self, base_estimator=None, method='sigmoid', cv=3,
-                 random_state=None):
+    def __init__(self, base_estimator=None, method='sigmoid', cv=3):
         self.base_estimator = base_estimator
         self.method = method
         self.cv = cv
-        self.random_state = random_state
 
     def fit(self, X, y, sample_weight=None):
         """Fit the calibrated model
@@ -116,7 +110,6 @@ def fit(self, X, y, sample_weight=None):
         X, y = indexable(X, y)
         lb = LabelBinarizer().fit(y)
         self.classes_ = lb.classes_
-        random_state = check_random_state(self.random_state)
 
         # Check that we each cross-validation fold can have at least one
         # example per class
@@ -136,7 +129,7 @@ def fit(self, X, y, sample_weight=None):
 
         if self.cv == "prefit":
             calibrated_classifier = _CalibratedClassifier(
-                base_estimator, method=self.method, random_state=random_state)
+                base_estimator, method=self.method)
             if sample_weight is not None:
                 calibrated_classifier.fit(X, y, sample_weight)
             else:
@@ -164,8 +157,7 @@ def fit(self, X, y, sample_weight=None):
                     this_estimator.fit(X[train], y[train])
 
                 calibrated_classifier = _CalibratedClassifier(
-                    this_estimator, method=self.method,
-                    random_state=random_state)
+                    this_estimator, method=self.method)
                 if sample_weight is not None:
                     calibrated_classifier.fit(X[test], y[test],
                                               sample_weight[test])
@@ -242,9 +234,6 @@ class _CalibratedClassifier(object):
         corresponds to Platt's method or 'isotonic' which is a
         non-parameteric approach based on isotonic regression.
 
-    random_state : int, RandomState instance or None (default=None)
-        Used to randomly break ties when method is 'isotonic'.
-
     References
     ----------
     .. [1] Obtaining calibrated probability estimates from decision trees
@@ -259,11 +248,9 @@ class _CalibratedClassifier(object):
     .. [4] Predicting Good Probabilities with Supervised Learning,
            A. Niculescu-Mizil & R. Caruana, ICML 2005
     """
-    def __init__(self, base_estimator, method='sigmoid',
-                 random_state=None):
+    def __init__(self, base_estimator, method='sigmoid'):
         self.base_estimator = base_estimator
         self.method = method
-        self.random_state = random_state
 
     def _preproc(self, X):
         n_classes = len(self.classes_)
@@ -312,13 +299,6 @@ def fit(self, X, y, sample_weight=None):
         for k, this_df in zip(idx_pos_class, df.T):
             if self.method == 'isotonic':
                 calibrator = IsotonicRegression(out_of_bounds='clip')
-                # XXX: isotonic regression cannot deal correctly with
-                #      situations in which multiple inputs are identical but
-                #      have different outputs. Since this is not untypical
-                #      when calibrating, we add some small random jitter to
-                #      the inputs.
-                jitter = self.random_state.normal(0, 1e-10, this_df.shape[0])
-                this_df = this_df + jitter
             elif self.method == 'sigmoid':
                 calibrator = _SigmoidCalibration()
             else:
diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
@@ -8,7 +8,8 @@
 from scipy.stats import spearmanr
 from .base import BaseEstimator, TransformerMixin, RegressorMixin
 from .utils import as_float_array, check_array, check_consistent_length
-from ._isotonic import _isotonic_regression
+from .utils.fixes import astype
+from ._isotonic import _isotonic_regression, _make_unique
 import warnings
 import math
 
@@ -200,12 +201,24 @@ class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
         The stepwise interpolating function that covers the domain
         X_.
 
+    Notes
+    -----
+    Ties are broken using the secondary method from Leeuw, 1977.
+
     References
     ----------
     Isotonic Median Regression: A Linear Programming Approach
     Nilotpal Chakravarti
     Mathematics of Operations Research
     Vol. 14, No. 2 (May, 1989), pp. 303-308
+
+    Isotone Optimization in R : Pool-Adjacent-Violators
+    Algorithm (PAVA) and Active Set Methods
+    Leeuw, Hornik, Mair
+    Journal of Statistical Software 2009
+
+    Correctness of Kruskal's algorithms for monotone regression with ties
+    Leeuw, Psychometrica, 1977
     """
     def __init__(self, y_min=None, y_max=None, increasing=True,
                  out_of_bounds='nan'):
@@ -228,8 +241,12 @@ def _build_f(self, X, y):
                              .format(self.out_of_bounds))
 
         bounds_error = self.out_of_bounds == "raise"
-        self.f_ = interpolate.interp1d(X, y, kind='slinear',
-                                       bounds_error=bounds_error)
+        if len(y) == 1:
+            # single y, constant prediction
+            self.f_ = lambda x: y.repeat(x.shape)
+        else:
+            self.f_ = interpolate.interp1d(X, y, kind='slinear',
+                                           bounds_error=bounds_error)
 
     def _build_y(self, X, y, sample_weight):
         """Build the y_ IsotonicRegression."""
@@ -249,8 +266,13 @@ def _build_y(self, X, y, sample_weight):
 
         order = np.lexsort((y, X))
         order_inv = np.argsort(order)
-        self.X_ = as_float_array(X[order], copy=False)
-        self.y_ = isotonic_regression(y[order], sample_weight, self.y_min,
+        if sample_weight is None:
+            sample_weight = np.ones(len(y))
+        X, y, sample_weight = [astype(array[order], np.float64, copy=False)
+                               for array in [X, y, sample_weight]]
+        unique_X, unique_y, unique_sample_weight = _make_unique(X, y, sample_weight)
+        self.X_ = unique_X
+        self.y_ = isotonic_regression(unique_y, unique_sample_weight, self.y_min,
                                       self.y_max, increasing=self.increasing_)
 
         return order_inv
@@ -319,44 +341,6 @@ def transform(self, T):
             T = np.clip(T, self.X_min_, self.X_max_)
         return self.f_(T)
 
-    def fit_transform(self, X, y, sample_weight=None):
-        """Fit model and transform y by linear interpolation.
-
-        Parameters
-        ----------
-        X : array-like, shape=(n_samples,)
-            Training data.
-
-        y : array-like, shape=(n_samples,)
-            Training target.
-
-        sample_weight : array-like, shape=(n_samples,), optional, default: None
-            Weights. If set to None, all weights will be equal to 1 (equal
-            weights).
-
-        Returns
-        -------
-        y_ : array, shape=(n_samples,)
-            The transformed data.
-
-        Notes
-        -----
-        X doesn't influence the result of `fit_transform`. It is however stored
-        for future use, as `transform` needs X to interpolate new input
-        data.
-        """
-        # Build y_
-        order_inv = self._build_y(X, y, sample_weight)
-
-        # Handle the left and right bounds on X
-        self.X_min_ = np.min(self.X_)
-        self.X_max_ = np.max(self.X_)
-
-        # Build f_
-        self._build_f(self.X_, self.y_)
-
-        return self.y_[order_inv]
-
     def predict(self, T):
         """Predict new data by linear interpolation.
 
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
@@ -114,8 +114,7 @@ def test_sample_weight_warning():
 
     for method in ['sigmoid', 'isotonic']:
         base_estimator = LinearSVC(random_state=42)
-        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method,
-                                                random_state=42)
+        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
         # LinearSVC does not currently support sample weights but they
         # can still be used for the calibration step (with a warning)
         msg = "LinearSVC does not support sample_weight."
diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py