
[MRG+1] Merge PresortBestSplitter and BestSplitter #5252


Merged: 1 commit, Sep 29, 2015
9 changes: 6 additions & 3 deletions doc/modules/tree.rst
@@ -312,10 +312,13 @@ total cost over the entire trees (by summing the cost at each node) of
Scikit-learn offers a more efficient implementation for the construction of
decision trees. A naive implementation (as above) would recompute the class
label histograms (for classification) or the means (for regression) at for each
-new split point along a given feature. By presorting the feature over all
-relevant samples, and retaining a running label count, we reduce the complexity
+new split point along a given feature. Presorting the feature over all
+relevant samples, and retaining a running label count, will reduce the complexity
at each node to :math:`O(n_{features}\log(n_{samples}))`, which results in a
-total cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))`.
+total cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))`. This is an option
+for all tree based algorithms. By default it is turned on for gradient boosting,
+where in general it makes training faster, but turned off for all other algorithms as
+it tends to slow down training when training deep trees.


Tips on practical use
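To make the running-count idea in the paragraph above concrete, here is an illustrative sketch in plain NumPy (an assumption for exposition, not scikit-learn's Cython internals): after a single O(n log n) presort of one feature, every candidate split point costs only an O(1) update of the class counts.

import numpy as np

feature = np.array([2.5, 0.3, 1.7, 0.9, 3.1])
labels = np.array([1, 0, 0, 1, 1])
n_classes = 2

order = np.argsort(feature)                      # presort: O(n log n)
left = np.zeros(n_classes, dtype=int)            # running label counts
right = np.bincount(labels, minlength=n_classes)

for idx in order[:-1]:                           # O(1) work per split point
    left[labels[idx]] += 1
    right[labels[idx]] -= 1
    # Class histograms of both children are now available without a rescan;
    # a real splitter would evaluate impurity from `left` and `right` here.
    print(feature[idx], left, right)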
5 changes: 5 additions & 0 deletions doc/whats_new.rst
@@ -202,6 +202,11 @@ Enhancements
- Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for
the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.

+- Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor`
+  and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior
+  the same. This allows gradient boosters to turn off presorting when building
+  deep trees or using sparse data. By `Jacob Schreiber`_.

Bug fixes
.........

21 changes: 18 additions & 3 deletions sklearn/ensemble/_gradient_boosting.c

Some generated files are not rendered by default.

141 changes: 96 additions & 45 deletions sklearn/ensemble/gradient_boosting.py
@@ -22,36 +22,48 @@

from __future__ import print_function
from __future__ import division
-from abc import ABCMeta, abstractmethod
-from time import time
-
-import numbers
-import numpy as np
-
-from scipy import stats
+from abc import ABCMeta
+from abc import abstractmethod

from .base import BaseEnsemble
from ..base import BaseEstimator
from ..base import ClassifierMixin
from ..base import RegressorMixin
-from ..utils import check_random_state, check_array, check_X_y, column_or_1d
-from ..utils import check_consistent_length, deprecated
-from ..utils.extmath import logsumexp
-from ..utils.fixes import expit, bincount
-from ..utils.stats import _weighted_percentile
-from ..utils.validation import check_is_fitted, NotFittedError

from ..externals import six
from ..feature_selection.from_model import _LearntSelectorMixin

-from ..tree.tree import DecisionTreeRegressor
-from ..tree._tree import DTYPE, TREE_LEAF
-from ..tree._splitter import PresortBestSplitter
-from ..tree._criterion import FriedmanMSE

from ._gradient_boosting import predict_stages
from ._gradient_boosting import predict_stage
from ._gradient_boosting import _random_sample_mask

+import numbers
+import numpy as np
+
+from scipy import stats
+from scipy.sparse import csc_matrix
+from scipy.sparse import csr_matrix
+from scipy.sparse import issparse
+
+from time import time
+from ..tree.tree import DecisionTreeRegressor
+from ..tree._tree import DTYPE
+from ..tree._tree import TREE_LEAF
+
+from ..utils import check_random_state
+from ..utils import check_array
+from ..utils import check_X_y
+from ..utils import column_or_1d
+from ..utils import check_consistent_length
+from ..utils import deprecated
+from ..utils.extmath import logsumexp
+from ..utils.fixes import expit
+from ..utils.fixes import bincount
+from ..utils.stats import _weighted_percentile
+from ..utils.validation import check_is_fitted
+from ..utils.validation import NotFittedError

class QuantileEstimator(BaseEstimator):
"""An estimator predicting the alpha-quantile of the training targets."""
@@ -711,7 +723,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
min_samples_leaf, min_weight_fraction_leaf,
max_depth, init, subsample, max_features,
random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
-warm_start=False):
+warm_start=False, presort='auto'):

self.n_estimators = n_estimators
self.learning_rate = learning_rate
@@ -728,11 +740,12 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
self.verbose = verbose
self.max_leaf_nodes = max_leaf_nodes
self.warm_start = warm_start
+self.presort = presort

self.estimators_ = np.empty((0, 0), dtype=np.object)

def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
-criterion, splitter, random_state):
+random_state, X_idx_sorted, X_csc=None, X_csr=None):
Member:
Here it would be better to have X_fit and X_predict instead of X, X_csc and X_csr.

Member Author:
I disagree. X_csc and X_csr are explicit about what they are. If I saw X_fit and X_predict in the wild, I'd assume they were different datasets. Here they are the same dataset.

"""Fit another stage of ``n_classes_`` trees to the boosting model. """

assert sample_mask.dtype == np.bool
@@ -748,27 +761,37 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,

# induce regression tree on residuals
tree = DecisionTreeRegressor(
-criterion=criterion,
-splitter=splitter,
+criterion='friedman_mse',
+splitter='best',
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
max_leaf_nodes=self.max_leaf_nodes,
-random_state=random_state)
+random_state=random_state,
+presort=self.presort)

if self.subsample < 1.0:
# no inplace multiplication!
sample_weight = sample_weight * sample_mask.astype(np.float64)

-tree.fit(X, residual, sample_weight=sample_weight,
-         check_input=False)
+if X_csc is not None:
+    tree.fit(X_csc, residual, sample_weight=sample_weight,
+             check_input=False, X_idx_sorted=X_idx_sorted)
+else:
+    tree.fit(X, residual, sample_weight=sample_weight,
+             check_input=False, X_idx_sorted=X_idx_sorted)

# update tree leaves
-loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
-                             sample_weight, sample_mask,
-                             self.learning_rate, k=k)
+if X_csr is not None:
+    loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
+                                 sample_weight, sample_mask,
+                                 self.learning_rate, k=k)
+else:
+    loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
+                                 sample_weight, sample_mask,
+                                 self.learning_rate, k=k)

# add tree to ensemble
self.estimators_[i, k] = tree
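A note on the design choice above (an illustration, not part of the diff): CSC gives fast column access, which suits scanning features for splits while fitting, and CSR gives fast row access, which suits per-sample prediction in update_terminal_regions. A minimal sketch of the two conversions, assuming a toy SciPy input:

import numpy as np
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, issparse

X = coo_matrix(np.eye(4))  # hypothetical sparse input; any format works

# Converted once per fit, outside the boosting loop (see _fit_stages below),
# so every stage reuses the format suited to its access pattern.
X_csc = csc_matrix(X) if issparse(X) else None  # column access: split finding
X_csr = csr_matrix(X) if issparse(X) else None  # row access: leaf updates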
@@ -944,7 +967,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
self._clear_state()

# Check input
-X, y = check_X_y(X, y, dtype=DTYPE)
+X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
n_samples, self.n_features = X.shape
if sample_weight is None:
sample_weight = np.ones(n_samples, dtype=np.float32)
@@ -981,9 +1004,25 @@ def fit(self, X, y, sample_weight=None, monitor=None):
y_pred = self._decision_function(X)
self._resize_state()

+X_idx_sorted = None
+presort = self.presort
+# Allow presort to be 'auto', which means True if the dataset is dense,
+# otherwise it will be False.
+if presort == 'auto' and issparse(X):
+    presort = False
+elif presort == 'auto':
+    presort = True
+
+if self.presort == True:
+    if issparse(X):
Member:
This is not necessary given the line above.

Member:
This could indeed be simplified:

if presort == True:
    if issparse(X):
        ....
    else:
        ...

+        raise ValueError("Presorting is not supported for sparse matrices.")
+    else:
+        X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                         dtype=np.int32)

# fit the boosting stages
n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-begin_at_stage, monitor)
+begin_at_stage, monitor, X_idx_sorted)
# change shape of arrays after fit (early-stopping or additional ests)
if n_stages != self.estimators_.shape[0]:
self.estimators_ = self.estimators_[:n_stages]
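For intuition, a small worked example (plain NumPy, not from the diff) of what X_idx_sorted holds: column j lists the row indices that would sort feature j in ascending order, stored in Fortran (column-major) layout so the tree code can walk one feature's column contiguously.

import numpy as np

X = np.array([[3.0, 10.0],
              [1.0, 30.0],
              [2.0, 20.0]])
X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32)
print(X_idx_sorted)
# [[1 0]
#  [2 2]
#  [0 1]]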
@@ -994,7 +1033,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
return self

def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
-begin_at_stage=0, monitor=None):
+begin_at_stage=0, monitor=None, X_idx_sorted=None):
"""Iteratively fits the stages.

For each stage it computes the progress (OOB, train score)
@@ -1015,18 +1054,13 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
else:
min_weight_leaf = 0.

-# init criterion and splitter
-criterion = FriedmanMSE(1)
-splitter = PresortBestSplitter(criterion,
-                               self.max_features_,
-                               self.min_samples_leaf,
-                               min_weight_leaf,
-                               random_state)

if self.verbose:
verbose_reporter = VerboseReporter(self.verbose)
verbose_reporter.init(self, begin_at_stage)

+X_csc = csc_matrix(X) if issparse(X) else None
+X_csr = csr_matrix(X) if issparse(X) else None

# perform boosting iterations
i = begin_at_stage
for i in range(begin_at_stage, self.n_estimators):
@@ -1042,8 +1076,8 @@

# fit next stage of trees
y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
-sample_mask, criterion, splitter,
-random_state)
+sample_mask, random_state, X_idx_sorted,
+X_csc, X_csr)

# track deviance (= loss)
if do_oob:
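The deletion above is the point of this PR: the hand-built PresortBestSplitter is gone, and the presorting code path is now selected inside the tree itself. A hedged sketch of the equivalent post-PR construction (assuming the 0.17-era DecisionTreeRegressor API, where presort was a constructor parameter; it has since been removed from scikit-learn):

from sklearn.tree import DecisionTreeRegressor

# presort=True takes the presorted split search that PresortBestSplitter
# used to implement; presort=False keeps the ordinary BestSplitter path.
tree = DecisionTreeRegressor(criterion='friedman_mse', splitter='best',
                             presort=True)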
@@ -1074,6 +1108,7 @@ def _make_estimator(self, append=True):
def _init_decision_function(self, X):
"""Check input and compute prediction of ``init``. """
self._check_initialized()
+X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
if X.shape[1] != self.n_features:
raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
self.n_features, X.shape[1]))
@@ -1104,7 +1139,9 @@ def decision_function(self, X):
Regression and binary classification produce an array of shape
[n_samples].
"""
-X = check_array(X, dtype=DTYPE, order="C")
+
+self._check_initialized()
+X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
score = self._decision_function(X)
if score.shape[1] == 1:
return score.ravel()
@@ -1318,6 +1355,12 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
If None, the random number generator is the RandomState instance used
by `np.random`.

+presort : bool or 'auto', optional (default='auto')
+    Whether to presort the data to speed up the finding of best splits in
+    fitting. Auto mode by default will use presorting on dense data and
+    default to normal sorting on sparse data. Setting presort to true on
+    sparse data will raise an error.

Attributes
----------
feature_importances_ : array, shape = [n_features]
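A hedged usage sketch of the documented behavior (toy data; the error message is the one raised in fit above; note the presort parameter has since been removed from scikit-learn):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=100, random_state=0)
X_sparse = csr_matrix(X)

# 'auto' (the default) presorts dense X and skips presorting for sparse X.
GradientBoostingClassifier(n_estimators=10).fit(X_sparse, y)

# Forcing presort on sparse input raises.
try:
    GradientBoostingClassifier(n_estimators=10, presort=True).fit(X_sparse, y)
except ValueError as err:
    print(err)  # Presorting is not supported for sparse matrices.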
@@ -1369,7 +1412,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, init=None, random_state=None,
max_features=None, verbose=0,
-max_leaf_nodes=None, warm_start=False):
+max_leaf_nodes=None, warm_start=False,
+presort='auto'):

super(GradientBoostingClassifier, self).__init__(
loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1379,7 +1423,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
max_depth=max_depth, init=init, subsample=subsample,
max_features=max_features,
random_state=random_state, verbose=verbose,
-max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+presort=presort)

def _validate_y(self, y):
self.classes_, y = np.unique(y, return_inverse=True)
@@ -1644,6 +1689,11 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
If None, the random number generator is the RandomState instance used
by `np.random`.

+presort : bool or 'auto', optional (default='auto')
+    Whether to presort the data to speed up the finding of best splits in
+    fitting. Auto mode by default will use presorting on dense data and
+    default to normal sorting on sparse data. Setting presort to true on
+    sparse data will raise an error.

Attributes
----------
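Mirroring the whats_new entry, a hedged sketch of when you might turn presorting off (toy sizes; actual speedups are workload-dependent):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=1000, n_features=20, random_state=0)

# For deep trees the presorting bookkeeping can outweigh its benefit,
# so the PR exposes a switch to disable it.
est = GradientBoostingRegressor(max_depth=8, presort=False).fit(X, y)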
@@ -1693,7 +1743,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
min_samples_leaf=1, min_weight_fraction_leaf=0.,
max_depth=3, init=None, random_state=None,
max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
-warm_start=False):
+warm_start=False, presort='auto'):

super(GradientBoostingRegressor, self).__init__(
loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
Expand All @@ -1703,7 +1753,8 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
max_depth=max_depth, init=init, subsample=subsample,
max_features=max_features,
random_state=random_state, alpha=alpha, verbose=verbose,
-max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+presort=presort)

def predict(self, X):
"""Predict regression target for X.