[MRG+1] Merge PresortBestSplitter and BestSplitter #5252
Changed file: `sklearn/ensemble/gradient_boosting.py`
```diff
@@ -22,36 +22,48 @@
 from __future__ import print_function
 from __future__ import division
-from abc import ABCMeta, abstractmethod
-from time import time
-
-import numbers
-import numpy as np
-
-from scipy import stats
+
+from abc import ABCMeta
+from abc import abstractmethod
 
 from .base import BaseEnsemble
 from ..base import BaseEstimator
 from ..base import ClassifierMixin
 from ..base import RegressorMixin
-from ..utils import check_random_state, check_array, check_X_y, column_or_1d
-from ..utils import check_consistent_length, deprecated
-from ..utils.extmath import logsumexp
-from ..utils.fixes import expit, bincount
-from ..utils.stats import _weighted_percentile
-from ..utils.validation import check_is_fitted, NotFittedError
 from ..externals import six
 from ..feature_selection.from_model import _LearntSelectorMixin
 
-from ..tree.tree import DecisionTreeRegressor
-from ..tree._tree import DTYPE, TREE_LEAF
-from ..tree._splitter import PresortBestSplitter
-from ..tree._criterion import FriedmanMSE
-
 from ._gradient_boosting import predict_stages
 from ._gradient_boosting import predict_stage
 from ._gradient_boosting import _random_sample_mask
 
+import numbers
+import numpy as np
+
+from scipy import stats
+from scipy.sparse import csc_matrix
+from scipy.sparse import csr_matrix
+from scipy.sparse import issparse
+
+from time import time
+from ..tree.tree import DecisionTreeRegressor
+from ..tree._tree import DTYPE
+from ..tree._tree import TREE_LEAF
+
+from ..utils import check_random_state
+from ..utils import check_array
+from ..utils import check_X_y
+from ..utils import column_or_1d
+from ..utils import check_consistent_length
+from ..utils import deprecated
+from ..utils.extmath import logsumexp
+from ..utils.fixes import expit
+from ..utils.fixes import bincount
+from ..utils.stats import _weighted_percentile
+from ..utils.validation import check_is_fitted
+from ..utils.validation import NotFittedError
 
 
 class QuantileEstimator(BaseEstimator):
     """An estimator predicting the alpha-quantile of the training targets."""
```
```diff
@@ -711,7 +723,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
                  min_samples_leaf, min_weight_fraction_leaf,
                  max_depth, init, subsample, max_features,
                  random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
-                 warm_start=False):
+                 warm_start=False, presort='auto'):
 
         self.n_estimators = n_estimators
         self.learning_rate = learning_rate
@@ -728,11 +740,12 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
         self.verbose = verbose
         self.max_leaf_nodes = max_leaf_nodes
         self.warm_start = warm_start
+        self.presort = presort
 
         self.estimators_ = np.empty((0, 0), dtype=np.object)
 
     def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
-                   criterion, splitter, random_state):
+                   random_state, X_idx_sorted, X_csc=None, X_csr=None):
        """Fit another stage of ``n_classes_`` trees to the boosting model. """
 
        assert sample_mask.dtype == np.bool
```
```diff
@@ -748,27 +761,37 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
 
             # induce regression tree on residuals
             tree = DecisionTreeRegressor(
-                criterion=criterion,
-                splitter=splitter,
+                criterion='friedman_mse',
+                splitter='best',
                 max_depth=self.max_depth,
                 min_samples_split=self.min_samples_split,
                 min_samples_leaf=self.min_samples_leaf,
                 min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                 max_features=self.max_features,
                 max_leaf_nodes=self.max_leaf_nodes,
-                random_state=random_state)
+                random_state=random_state,
+                presort=self.presort)
 
             if self.subsample < 1.0:
                 # no inplace multiplication!
                 sample_weight = sample_weight * sample_mask.astype(np.float64)
 
-            tree.fit(X, residual, sample_weight=sample_weight,
-                     check_input=False)
+            if X_csc is not None:
+                tree.fit(X_csc, residual, sample_weight=sample_weight,
+                         check_input=False, X_idx_sorted=X_idx_sorted)
+            else:
+                tree.fit(X, residual, sample_weight=sample_weight,
+                         check_input=False, X_idx_sorted=X_idx_sorted)
 
             # update tree leaves
-            loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
-                                         sample_weight, sample_mask,
-                                         self.learning_rate, k=k)
+            if X_csr is not None:
+                loss.update_terminal_regions(tree.tree_, X_csr, y, residual,
+                                             y_pred, sample_weight, sample_mask,
+                                             self.learning_rate, k=k)
+            else:
+                loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
+                                             sample_weight, sample_mask,
+                                             self.learning_rate, k=k)
 
             # add tree to ensemble
             self.estimators_[i, k] = tree
```
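The format split above is deliberate: split finding in `tree.fit` scans one feature (column) at a time, while `update_terminal_regions` routes one sample (row) at a time through the fitted tree. A minimal sketch, not part of the PR, of the access patterns that make CSC the natural fit-side format and CSR the natural predict-side format:

```python
import numpy as np
from scipy.sparse import csc_matrix

rng = np.random.RandomState(0)
X_csc = csc_matrix(rng.binomial(1, 0.1, size=(1000, 50)).astype(np.float64))
X_csr = X_csc.tocsr()

# CSC stores column-contiguous data: slicing feature j is cheap,
# which is what split finding does repeatedly during tree.fit.
feature_j = X_csc[:, 3]

# CSR stores row-contiguous data: slicing sample i is cheap,
# which is what leaf assignment does in update_terminal_regions.
sample_i = X_csr[10, :]
```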
```diff
@@ -944,7 +967,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
             self._clear_state()
 
         # Check input
-        X, y = check_X_y(X, y, dtype=DTYPE)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
         n_samples, self.n_features = X.shape
         if sample_weight is None:
             sample_weight = np.ones(n_samples, dtype=np.float32)
```
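With `accept_sparse=['csr', 'csc', 'coo']`, `fit` now tolerates sparse input instead of rejecting it. A hedged usage sketch against the post-merge API (the data and sizes here are arbitrary choices of mine, not from the PR):

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.RandomState(0)
X = csr_matrix(rng.binomial(1, 0.05, size=(200, 20)).astype(np.float64))
y = rng.normal(size=200)

# presort='auto' resolves to False for sparse X, so this fits without error
est = GradientBoostingRegressor(n_estimators=10, random_state=0)
est.fit(X, y)
print(est.predict(X[:5]))
```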
```diff
@@ -981,9 +1004,25 @@ def fit(self, X, y, sample_weight=None, monitor=None):
             y_pred = self._decision_function(X)
             self._resize_state()
 
+        X_idx_sorted = None
+        presort = self.presort
+        # Allow presort to be 'auto', which means True if the dataset is dense,
+        # otherwise it will be False.
+        if presort == 'auto' and issparse(X):
+            presort = False
+        elif presort == 'auto':
+            presort = True
+
+        if self.presort == True:
+            if issparse(X):
+                raise ValueError("Presorting is not supported for sparse "
+                                 "matrices.")
+            else:
+                X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                                 dtype=np.int32)
+
         # fit the boosting stages
         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-                                    begin_at_stage, monitor)
+                                    begin_at_stage, monitor, X_idx_sorted)
         # change shape of arrays after fit (early-stopping or additional ests)
         if n_stages != self.estimators_.shape[0]:
             self.estimators_ = self.estimators_[:n_stages]
```

Review comment (on `if issparse(X):`): This is not necessary given the line above.

Reply: This could indeed be simplified.
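The `'auto'` resolution above reduces to a small pure function. A standalone sketch of my own (the helper name `resolve_presort` is hypothetical, not in the codebase):

```python
import numpy as np
from scipy.sparse import issparse

def resolve_presort(presort, X):
    if presort == 'auto':
        presort = not issparse(X)  # dense -> True, sparse -> False
    if presort and issparse(X):
        raise ValueError("Presorting is not supported for sparse matrices.")
    if presort:
        # column j of the result lists sample indices sorted by feature j
        return np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32)
    return None

X = np.array([[3.0, 1.0], [1.0, 2.0], [2.0, 0.5]])
print(resolve_presort('auto', X))
# [[1 2]
#  [2 0]
#  [0 1]]
```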
```diff
@@ -994,7 +1033,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         return self
 
     def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
-                    begin_at_stage=0, monitor=None):
+                    begin_at_stage=0, monitor=None, X_idx_sorted=None):
         """Iteratively fits the stages.
 
         For each stage it computes the progress (OOB, train score)
@@ -1015,18 +1054,13 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
         else:
             min_weight_leaf = 0.
 
-        # init criterion and splitter
-        criterion = FriedmanMSE(1)
-        splitter = PresortBestSplitter(criterion,
-                                       self.max_features_,
-                                       self.min_samples_leaf,
-                                       min_weight_leaf,
-                                       random_state)
-
         if self.verbose:
             verbose_reporter = VerboseReporter(self.verbose)
             verbose_reporter.init(self, begin_at_stage)
 
+        X_csc = csc_matrix(X) if issparse(X) else None
+        X_csr = csr_matrix(X) if issparse(X) else None
+
         # perform boosting iterations
         i = begin_at_stage
         for i in range(begin_at_stage, self.n_estimators):
@@ -1042,8 +1076,8 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
 
             # fit next stage of trees
             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
-                                     sample_mask, criterion, splitter,
-                                     random_state)
+                                     sample_mask, random_state, X_idx_sorted,
+                                     X_csc, X_csr)
 
             # track deviance (= loss)
             if do_oob:
```
```diff
@@ -1074,6 +1108,7 @@ def _make_estimator(self, append=True):
     def _init_decision_function(self, X):
         """Check input and compute prediction of ``init``. """
         self._check_initialized()
+        X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
         if X.shape[1] != self.n_features:
             raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
                 self.n_features, X.shape[1]))
@@ -1104,7 +1139,9 @@ def decision_function(self, X):
         Regression and binary classification produce an array of shape
         [n_samples].
         """
-        X = check_array(X, dtype=DTYPE, order="C")
+
+        self._check_initialized()
+        X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
         score = self._decision_function(X)
         if score.shape[1] == 1:
             return score.ravel()
```
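Delegating input checks to `self.estimators_[0, 0]._validate_X_predict` means the prediction path inherits the fitted tree's validation, including its sparse handling. A small sketch (assuming the post-merge API; the dataset is just a convenient built-in) fitting on dense data and scoring sparse input:

```python
from scipy.sparse import csr_matrix
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=500, random_state=0)
clf = GradientBoostingClassifier(n_estimators=10, random_state=0).fit(X, y)

# decision_function now accepts sparse input; the fitted tree's validation
# converts it internally rather than rejecting it up front.
print(clf.decision_function(csr_matrix(X[:3])))
```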
```diff
@@ -1318,6 +1355,12 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
+    presort : bool or 'auto', optional (default='auto')
+        Whether to presort the data to speed up the finding of best splits in
+        fitting. Auto mode by default will use presorting on dense data and
+        default to normal sorting on sparse data. Setting presort to true on
+        sparse data will raise an error.
+
     Attributes
     ----------
     feature_importances_ : array, shape = [n_features]
```
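A hedged usage sketch for the new parameter (problem size and dataset are illustrative choices of mine):

```python
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=1000, random_state=0)

# presort=True forces the presorted-index code path on dense data;
# passing sparse X with presort=True would raise a ValueError instead.
clf = GradientBoostingClassifier(n_estimators=50, presort=True,
                                 random_state=0)
clf.fit(X, y)
print(clf.score(X, y))
```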
```diff
@@ -1369,7 +1412,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
                  max_depth=3, init=None, random_state=None,
                  max_features=None, verbose=0,
-                 max_leaf_nodes=None, warm_start=False):
+                 max_leaf_nodes=None, warm_start=False,
+                 presort='auto'):
 
         super(GradientBoostingClassifier, self).__init__(
             loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1379,7 +1423,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
             max_depth=max_depth, init=init, subsample=subsample,
             max_features=max_features,
             random_state=random_state, verbose=verbose,
-            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+            presort=presort)
 
     def _validate_y(self, y):
         self.classes_, y = np.unique(y, return_inverse=True)
```
```diff
@@ -1644,6 +1689,11 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
+    presort : bool or 'auto', optional (default='auto')
+        Whether to presort the data to speed up the finding of best splits in
+        fitting. Auto mode by default will use presorting on dense data and
+        default to normal sorting on sparse data. Setting presort to true on
+        sparse data will raise an error.
 
     Attributes
     ----------
```
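Since presorting exists purely to speed up split finding, a rough timing comparison is the natural smoke test. A machine-dependent sketch of my own, not a benchmark from the PR:

```python
from time import time
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(5000, 20))
y = rng.normal(size=5000)

# fit the same model with and without presorting and report wall time
for presort in (True, False):
    est = GradientBoostingRegressor(n_estimators=20, presort=presort,
                                    random_state=0)
    t0 = time()
    est.fit(X, y)
    print("presort=%s: %.2fs" % (presort, time() - t0))
```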
```diff
@@ -1693,7 +1743,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
                  max_depth=3, init=None, random_state=None,
                  max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
-                 warm_start=False):
+                 warm_start=False, presort='auto'):
 
         super(GradientBoostingRegressor, self).__init__(
             loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1703,7 +1753,8 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
             max_depth=max_depth, init=init, subsample=subsample,
             max_features=max_features,
             random_state=random_state, alpha=alpha, verbose=verbose,
-            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+            presort='auto')
 
     def predict(self, X):
         """Predict regression target for X.
```
Review comment (on the `_fit_stage` signature): Here it would be better to have X_fit and X_predict instead of X, X_csc and X_csr.

Reply: I disagree. X_csc and X_csr are explicit about what they are. If I saw X_fit and X_predict in the wild, I'd assume they were different datasets. Here they are the same dataset.