
[MRG+1] Merge PresortBestSplitter and BestSplitter #5252


Merged: 1 commit, Sep 29, 2015
9 changes: 6 additions & 3 deletions doc/modules/tree.rst
@@ -312,10 +312,13 @@ total cost over the entire trees (by summing the cost at each node) of
Scikit-learn offers a more efficient implementation for the construction of
decision trees. A naive implementation (as above) would recompute the class
label histograms (for classification) or the means (for regression) at for each
-new split point along a given feature. By presorting the feature over all
-relevant samples, and retaining a running label count, we reduce the complexity
+new split point along a given feature. Presorting the feature over all
+relevant samples, and retaining a running label count, will reduce the complexity
at each node to :math:`O(n_{features}\log(n_{samples}))`, which results in a
-total cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))`.
+total cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))`. This is an option
+for all tree based algorithms. By default it is turned on for gradient boosting,
+where in general it makes training faster, but turned off for all other algorithms as
+it tends to slow down training when training deep trees.


Tips on practical use
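To make the running-count idea in the paragraph above concrete, here is an illustrative sketch in plain NumPy (an assumption for exposition, not scikit-learn's Cython internals): after a single O(n log n) presort of one feature, every candidate split point costs only an O(1) update of the class counts.

import numpy as np

feature = np.array([2.5, 0.3, 1.7, 0.9, 3.1])
labels = np.array([1, 0, 0, 1, 1])
n_classes = 2

order = np.argsort(feature)                      # presort: O(n log n)
left = np.zeros(n_classes, dtype=int)            # running label counts
right = np.bincount(labels, minlength=n_classes)

for idx in order[:-1]:                           # O(1) work per split point
    left[labels[idx]] += 1
    right[labels[idx]] -= 1
    # Class histograms of both children are now available without a rescan;
    # a real splitter would evaluate impurity from `left` and `right` here.
    print(feature[idx], left, right)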
5 changes: 5 additions & 0 deletions doc/whats_new.rst
@@ -202,6 +202,11 @@ Enhancements
- Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for
the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.

+- Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor`
+  and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior
+  the same. This allows gradient boosters to turn off presorting when building
+  deep trees or using sparse data. By `Jacob Schreiber`_.

Bug fixes
.........

21 changes: 18 additions & 3 deletions sklearn/ensemble/_gradient_boosting.c

Some generated files are not rendered by default.

141 changes: 96 additions & 45 deletions sklearn/ensemble/gradient_boosting.py
@@ -22,36 +22,48 @@

from __future__ import print_function
from __future__ import division
-from abc import ABCMeta, abstractmethod
-from time import time
-
-import numbers
-import numpy as np
-
-from scipy import stats
+from abc import ABCMeta
+from abc import abstractmethod

from .base import BaseEnsemble
from ..base import BaseEstimator
from ..base import ClassifierMixin
from ..base import RegressorMixin
-from ..utils import check_random_state, check_array, check_X_y, column_or_1d
-from ..utils import check_consistent_length, deprecated
-from ..utils.extmath import logsumexp
-from ..utils.fixes import expit, bincount
-from ..utils.stats import _weighted_percentile
-from ..utils.validation import check_is_fitted, NotFittedError

from ..externals import six
from ..feature_selection.from_model import _LearntSelectorMixin

-from ..tree.tree import DecisionTreeRegressor
-from ..tree._tree import DTYPE, TREE_LEAF
-from ..tree._splitter import PresortBestSplitter
-from ..tree._criterion import FriedmanMSE

from ._gradient_boosting import predict_stages
from ._gradient_boosting import predict_stage
from ._gradient_boosting import _random_sample_mask

+import numbers
+import numpy as np
+
+from scipy import stats
+from scipy.sparse import csc_matrix
+from scipy.sparse import csr_matrix
+from scipy.sparse import issparse
+
+from time import time
+from ..tree.tree import DecisionTreeRegressor
+from ..tree._tree import DTYPE
+from ..tree._tree import TREE_LEAF
+
+from ..utils import check_random_state
+from ..utils import check_array
+from ..utils import check_X_y
+from ..utils import column_or_1d
+from ..utils import check_consistent_length
+from ..utils import deprecated
+from ..utils.extmath import logsumexp
+from ..utils.fixes import expit
+from ..utils.fixes import bincount
+from ..utils.stats import _weighted_percentile
+from ..utils.validation import check_is_fitted
+from ..utils.validation import NotFittedError

class QuantileEstimator(BaseEstimator):
"""An estimator predicting the alpha-quantile of the training targets."""
@@ -711,7 +723,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
min_samples_leaf, min_weight_fraction_leaf,
max_depth, init, subsample, max_features,
random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
-warm_start=False):
+warm_start=False, presort='auto'):

self.n_estimators = n_estimators
self.learning_rate = learning_rate
@@ -728,11 +740,12 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
self.verbose = verbose
self.max_leaf_nodes = max_leaf_nodes
self.warm_start = warm_start
+self.presort = presort

self.estimators_ = np.empty((0, 0), dtype=np.object)

def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
-criterion, splitter, random_state):
+random_state, X_idx_sorted, X_csc=None, X_csr=None):
Member:
Here it would be better to have X_fit and X_predict instead of X, X_csc and X_csr.

Member Author:
I disagree. X_csc and X_csr are explicit about what they are. If I saw X_fit and X_predict in the wild, I'd assume they were different datasets. Here they are the same dataset.

"""Fit another stage of ``n_classes_`` trees to the boosting model. """

assert sample_mask.dtype == np.bool
@@ -748,27 +761,37 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,

# induce regression tree on residuals
tree = DecisionTreeRegressor(
-criterion=criterion,
-splitter=splitter,
+criterion='friedman_mse',
+splitter='best',
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
max_leaf_nodes=self.max_leaf_nodes,
-random_state=random_state)
+random_state=random_state,
+presort=self.presort)

if self.subsample < 1.0:
# no inplace multiplication!
sample_weight = sample_weight * sample_mask.astype(np.float64)

-tree.fit(X, residual, sample_weight=sample_weight,
-         check_input=False)
+if X_csc is not None:
+    tree.fit(X_csc, residual, sample_weight=sample_weight,
+             check_input=False, X_idx_sorted=X_idx_sorted)
+else:
+    tree.fit(X, residual, sample_weight=sample_weight,
+             check_input=False, X_idx_sorted=X_idx_sorted)

# update tree leaves
-loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
-                             sample_weight, sample_mask,
-                             self.learning_rate, k=k)
+if X_csr is not None:
+    loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
+                                 sample_weight, sample_mask,
+                                 self.learning_rate, k=k)
+else:
+    loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
+                                 sample_weight, sample_mask,
+                                 self.learning_rate, k=k)

# add tree to ensemble
self.estimators_[i, k] = tree
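A note on the design choice above (an illustration, not part of the diff): CSC gives fast column access, which suits scanning features for splits while fitting, and CSR gives fast row access, which suits per-sample prediction in update_terminal_regions. A minimal sketch of the two conversions, assuming a toy SciPy input:

import numpy as np
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, issparse

X = coo_matrix(np.eye(4))  # hypothetical sparse input; any format works

# Converted once per fit, outside the boosting loop (see _fit_stages below),
# so every stage reuses the format suited to its access pattern.
X_csc = csc_matrix(X) if issparse(X) else None  # column access: split finding
X_csr = csr_matrix(X) if issparse(X) else None  # row access: leaf updates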
@@ -944,7 +967,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
self._clear_state()

# Check input
-X, y = check_X_y(X, y, dtype=DTYPE)
+X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
n_samples, self.n_features = X.shape
if sample_weight is None:
sample_weight = np.ones(n_samples, dtype=np.float32)
@@ -981,9 +1004,25 @@ def fit(self, X, y, sample_weight=None, monitor=None):
y_pred = self._decision_function(X)
self._resize_state()

+X_idx_sorted = None
+presort = self.presort
+# Allow presort to be 'auto', which means True if the dataset is dense,
+# otherwise it will be False.
+if presort == 'auto' and issparse(X):
+    presort = False
+elif presort == 'auto':
+    presort = True
+
+if self.presort == True:
+    if issparse(X):
Member:
This is not necessary given the line above.

Member:
This could indeed be simplified:

if presort == True:
    if issparse(X):
        ....
    else:
        ...

+        raise ValueError("Presorting is not supported for sparse matrices.")
+    else:
+        X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                         dtype=np.int32)

# fit the boosting stages
n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-begin_at_stage, monitor)
+begin_at_stage, monitor, X_idx_sorted)
# change shape of arrays after fit (early-stopping or additional ests)
if n_stages != self.estimators_.shape[0]:
self.estimators_ = self.estimators_[:n_stages]
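For intuition, a small worked example (plain NumPy, not from the diff) of what X_idx_sorted holds: column j lists the row indices that would sort feature j in ascending order, stored in Fortran (column-major) layout so the tree code can walk one feature's column contiguously.

import numpy as np

X = np.array([[3.0, 10.0],
              [1.0, 30.0],
              [2.0, 20.0]])
X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32)
print(X_idx_sorted)
# [[1 0]
#  [2 2]
#  [0 1]]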
@@ -994,7 +1033,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
return self

def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
-begin_at_stage=0, monitor=None):
+begin_at_stage=0, monitor=None, X_idx_sorted=None):
"""Iteratively fits the stages.

For each stage it computes the progress (OOB, train score)
@@ -1015,18 +1054,13 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
else:
min_weight_leaf = 0.

-# init criterion and splitter
-criterion = FriedmanMSE(1)
-splitter = PresortBestSplitter(criterion,
-                               self.max_features_,
-                               self.min_samples_leaf,
-                               min_weight_leaf,
-                               random_state)

if self.verbose:
verbose_reporter = VerboseReporter(self.verbose)
verbose_reporter.init(self, begin_at_stage)

+X_csc = csc_matrix(X) if issparse(X) else None
+X_csr = csr_matrix(X) if issparse(X) else None

# perform boosting iterations
i = begin_at_stage
for i in range(begin_at_stage, self.n_estimators):
@@ -1042,8 +1076,8 @@

# fit next stage of trees
y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
-sample_mask, criterion, splitter,
-random_state)
+sample_mask, random_state, X_idx_sorted,
+X_csc, X_csr)

# track deviance (= loss)
if do_oob:
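The deletion above is the point of this PR: the hand-built PresortBestSplitter is gone, and the presorting code path is now selected inside the tree itself. A hedged sketch of the equivalent post-PR construction (assuming the 0.17-era DecisionTreeRegressor API, where presort was a constructor parameter; it has since been removed from scikit-learn):

from sklearn.tree import DecisionTreeRegressor

# presort=True takes the presorted split search that PresortBestSplitter
# used to implement; presort=False keeps the ordinary BestSplitter path.
tree = DecisionTreeRegressor(criterion='friedman_mse', splitter='best',
                             presort=True)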
@@ -1074,6 +1108,7 @@ def _make_estimator(self, append=True):
def _init_decision_function(self, X):
"""Check input and compute prediction of ``init``. """
self._check_initialized()
+X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
if X.shape[1] != self.n_features:
raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
self.n_features, X.shape[1]))
@@ -1104,7 +1139,9 @@ def decision_function(self, X):
Regression and binary classification produce an array of shape
[n_samples].
"""
-X = check_array(X, dtype=DTYPE, order="C")
+
+self._check_initialized()
+X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
score = self._decision_function(X)
if score.shape[1] == 1:
return score.ravel()
@@ -1318,6 +1355,12 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
If None, the random number generator is the RandomState instance used
by `np.random`.

+presort : bool or 'auto', optional (default='auto')
+    Whether to presort the data to speed up the finding of best splits in
+    fitting. Auto mode by default will use presorting on dense data and
+    default to normal sorting on sparse data. Setting presort to true on
+    sparse data will raise an error.

Attributes
----------
feature_importances_ : array, shape = [n_features]
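A hedged usage sketch of the documented behavior (toy data; the error message is the one raised in fit above; note the presort parameter has since been removed from scikit-learn):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=100, random_state=0)
X_sparse = csr_matrix(X)

# 'auto' (the default) presorts dense X and skips presorting for sparse X.
GradientBoostingClassifier(n_estimators=10).fit(X_sparse, y)

# Forcing presort on sparse input raises.
try:
    GradientBoostingClassifier(n_estimators=10, presort=True).fit(X_sparse, y)
except ValueError as err:
    print(err)  # Presorting is not supported for sparse matrices.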
@@ -1369,7 +1412,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, init=None, random_state=None,
max_features=None, verbose=0,
-max_leaf_nodes=None, warm_start=False):
+max_leaf_nodes=None, warm_start=False,
+presort='auto'):

super(GradientBoostingClassifier, self).__init__(
loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1379,7 +1423,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
max_depth=max_depth, init=init, subsample=subsample,
max_features=max_features,
random_state=random_state, verbose=verbose,
-max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+presort=presort)

def _validate_y(self, y):
self.classes_, y = np.unique(y, return_inverse=True)
@@ -1644,6 +1689,11 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
If None, the random number generator is the RandomState instance used
by `np.random`.

+presort : bool or 'auto', optional (default='auto')
+    Whether to presort the data to speed up the finding of best splits in
+    fitting. Auto mode by default will use presorting on dense data and
+    default to normal sorting on sparse data. Setting presort to true on
+    sparse data will raise an error.

Attributes
----------
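Mirroring the whats_new entry, a hedged sketch of when you might turn presorting off (toy sizes; actual speedups are workload-dependent):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=1000, n_features=20, random_state=0)

# For deep trees the presorting bookkeeping can outweigh its benefit,
# so the PR exposes a switch to disable it.
est = GradientBoostingRegressor(max_depth=8, presort=False).fit(X, y)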
@@ -1693,7 +1743,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, init=None, random_state=None,
max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
-warm_start=False):
+warm_start=False, presort='auto'):

super(GradientBoostingRegressor, self).__init__(
loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
Expand All @@ -1703,7 +1753,8 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
max_depth=max_depth, init=init, subsample=subsample,
max_features=max_features,
random_state=random_state, alpha=alpha, verbose=verbose,
-max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+presort=presort)

def predict(self, X):
"""Predict regression target for X.