much faster isotonic regression prediction (involved re-setting interpolation to linear)

jarfa · jarfa · commit 7eff50e97f25 · 2016-02-03T17:38:13.000-05:00
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -88,6 +88,9 @@ Enhancements
    - Add ``sample_weight`` parameter to :func:`metrics.confusion_matrix`.
      By `Bernardo Stein`_.
 
+   - Prediction of out-of-sample events with Isotonic Regression is now much
+     faster (over 1000x in tests with synthetic data). By `Jonathan Arfa`_.
+
 
 Bug fixes
 .........
@@ -127,6 +130,9 @@ API changes summary
    - ``residual_metric`` has been deprecated in :class:`linear_model.RANSACRegressor`.
      Use ``loss`` instead. By `Manoj Kumar`_.
 
+   - Access to public attributes ``.X_`` and ``.y_`` has been deprecated in
+     :class:`isotonic.IsotonicRegression`. By `Jonathan Arfa`_.
+
 
 .. _changes_0_17_1:
 
@@ -4037,3 +4043,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Imaculate: https://github.com/Imaculate
 
 .. _Bernardo Stein: https://github.com/DanielSidhion
+
+.. _Jonathan Arfa: https://github.com/jarfa
diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
@@ -7,7 +7,7 @@
 from scipy import interpolate
 from scipy.stats import spearmanr
 from .base import BaseEstimator, TransformerMixin, RegressorMixin
-from .utils import as_float_array, check_array, check_consistent_length
+from .utils import as_float_array, check_array, check_consistent_length, deprecated
 from .utils.fixes import astype
 from ._isotonic import _isotonic_regression, _make_unique
 import warnings
@@ -234,6 +234,32 @@ def __init__(self, y_min=None, y_max=None, increasing=True,
         self.increasing = increasing
         self.out_of_bounds = out_of_bounds
 
+    @property
+    @deprecated("Attribute ``X_`` is deprecated in version 0.18 and will be removed in version 0.20.")
+    def X_(self):
+        return self._X_
+
+    @X_.setter
+    def X_(self, value):
+        self._X_ = value
+
+    @X_.deleter
+    def X_(self):
+        del self._X_
+
+    @property
+    @deprecated("Attribute ``y_`` is deprecated in version 0.18 and will be removed in version 0.20.")
+    def y_(self):
+        return self._y_
+
+    @y_.setter
+    def y_(self, value):
+        self._y_ = value
+
+    @y_.deleter
+    def y_(self):
+        del self._y_
+
     def _check_fit_data(self, X, y, sample_weight=None):
         if len(X.shape) != 1:
             raise ValueError("X should be a 1d array")
@@ -252,7 +278,7 @@ def _build_f(self, X, y):
             # single y, constant prediction
             self.f_ = lambda x: y.repeat(x.shape)
         else:
-            self.f_ = interpolate.interp1d(X, y, kind='slinear',
+            self.f_ = interpolate.interp1d(X, y, kind='linear',
                                            bounds_error=bounds_error)
 
     def _build_y(self, X, y, sample_weight):
@@ -282,8 +308,8 @@ def _build_y(self, X, y, sample_weight):
         X, y, sample_weight = [astype(array[order], np.float64, copy=False)
                                for array in [X, y, sample_weight]]
         unique_X, unique_y, unique_sample_weight = _make_unique(X, y, sample_weight)
-        self.X_ = unique_X
-        self.y_ = isotonic_regression(unique_y, unique_sample_weight, self.y_min,
+        self._X_ = unique_X
+        self._y_ = isotonic_regression(unique_y, unique_sample_weight, self.y_min,
                                       self.y_max, increasing=self.increasing_)
 
         return order_inv
@@ -317,11 +343,24 @@ def fit(self, X, y, sample_weight=None):
         self._build_y(X, y, sample_weight)
 
         # Handle the left and right bounds on X
-        self.X_min_ = np.min(self.X_)
-        self.X_max_ = np.max(self.X_)
+        self.X_min_ = np.min(self._X_)
+        self.X_max_ = np.max(self._X_)
+
+        # Remove unnecessary points for faster prediction
+        keep_data = np.ones((len(self._y_),), dtype=bool)
+        # Aside from the first and last points, remove every point whose y
+        # value equals that of both the point before and the point after it.
+        keep_data[1:-1] = np.logical_or(
+            np.not_equal(self._y_[1:-1], self._y_[:-2]),
+            np.not_equal(self._y_[1:-1], self._y_[2:])
+            )
+        # We're keeping self.X_ and self.y_ around for backwards compatibility,
+        # but they should be considered deprecated.
+        self._necessary_X_ = self._X_[keep_data]
+        self._necessary_y_ = self._y_[keep_data]
 
         # Build f_
-        self._build_f(self.X_, self.y_)
+        self._build_f(self._necessary_X_, self._necessary_y_)
 
         return self
 
@@ -381,4 +420,4 @@ def __setstate__(self, state):
         We need to rebuild the interpolation function.
         """
         self.__dict__.update(state)
-        self._build_f(self.X_, self.y_)
+        self._build_f(self._necessary_X_, self._necessary_y_)
diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py
@@ -346,3 +346,35 @@ def test_isotonic_zero_weight_loop():
 
     # This will hang in failure case.
     regression.fit(x, y, sample_weight=w)
+
+
+def test_fast_predict():
+    # Test that the faster prediction change doesn't affect out-of-sample
+    # predictions: https://github.com/scikit-learn/scikit-learn/pull/6206
+    rng = np.random.RandomState(123)
+    N = 10**3
+    # X values drawn uniformly from the half-open interval [-10, 10)
+    training_X = 20.0 * rng.rand(N) - 10
+    training_Y = np.less(
+        rng.rand(N),
+        1.0 / (1.0 + np.exp(-training_X))
+        ).astype('int64')
+    
+    slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
+    fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
+    
+    # Fit with ALL input data, not just the necessary points. This mirrors
+    # the .fit() method but skips the removal of unnecessary points.
+    slow_model._build_y(training_X, training_Y, None)
+    slow_model.X_min_ = np.min(slow_model._X_)
+    slow_model.X_max_ = np.max(slow_model._X_)
+    slow_model._build_f(slow_model._X_, slow_model._y_)
+
+    # Fit with just the necessary data
+    fast_model.fit(training_X, training_Y)
+    
+    pred_X = 20.0 * rng.rand(N) - 10
+    pred_slow = slow_model.predict(pred_X)
+    pred_fast = fast_model.predict(pred_X)
+    
+    assert_array_equal(pred_slow, pred_fast)