[MRG+1] Apply method added to GradientBoosting #5228
Changes to sklearn/ensemble/gradient_boosting.py:
@@ -17,7 +17,7 @@
 """
 # Authors: Peter Prettenhofer, Scott White, Gilles Louppe, Emanuele Olivetti,
-#          Arnaud Joly
+#          Arnaud Joly, Jacob Schreiber
 # License: BSD 3 clause

 from __future__ import print_function
@@ -898,6 +898,13 @@ def _resize_state(self):
     def _is_initialized(self):
         return len(getattr(self, 'estimators_', [])) > 0

+    def _check_initialized(self):
+        """Check that the estimator is initialized, raising an error if not."""
+        if self.estimators_ is None or len(self.estimators_) == 0:
+            raise NotFittedError("Estimator not fitted, call `fit`"
+                                 " before making predictions`.")
+
     def fit(self, X, y, sample_weight=None, monitor=None):
         """Fit the gradient boosting model.
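For illustration only (this snippet is not part of the diff): with this helper in place, calling a post-fit method such as apply before fit is expected to raise NotFittedError, which in the scikit-learn of this era subclasses ValueError (the exact import location of NotFittedError varies by version), so a sketch like the following should behave as shown:

# Sketch: how the unfitted-estimator check surfaces to the user.
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=5)
try:
    clf.apply([[0.0, 1.0]])      # estimator has not been fitted yet
except ValueError as exc:        # NotFittedError is a ValueError subclass here
    print("not fitted:", exc)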
@@ -1067,9 +1074,7 @@ def _make_estimator(self, append=True):

     def _init_decision_function(self, X):
         """Check input and compute prediction of ``init``. """
-        if self.estimators_ is None or len(self.estimators_) == 0:
-            raise NotFittedError("Estimator not fitted, call `fit`"
-                                 " before making predictions`.")
+        self._check_initialized()
         if X.shape[1] != self.n_features:
             raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
                 self.n_features, X.shape[1]))
@@ -1164,9 +1169,7 @@ def feature_importances_(self): | |
------- | ||
feature_importances_ : array, shape = [n_features] | ||
""" | ||
if self.estimators_ is None or len(self.estimators_) == 0: | ||
raise NotFittedError("Estimator not fitted, call `fit` before" | ||
" `feature_importances_`.") | ||
self._check_initialized() | ||
|
||
total_sum = np.zeros((self.n_features, ), dtype=np.float64) | ||
for stage in self.estimators_: | ||
|
@@ -1184,6 +1187,38 @@ def _validate_y(self, y):
         # Default implementation
         return y

+    def apply(self, X):
+        """Apply trees in the ensemble to X, return leaf indices.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape = [n_samples, n_features]
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        X_leaves : array_like, shape = [n_samples, n_estimators, n_classes]
Review comments:

As pointed out in the test below, you should also specify the expected format for binary classification and regression.

Unfortunately this docstring will also be used for the …

@ogrisel With the latest changes at jmschrei@4f7f73e#diff-2229647ab9a84dc25ab3d3d13800cb43R1205 isn't it good enough?

I am in favour of ravelling in the regression case.

I am not sure I understand what you mean by "ravelling in the regression case".

Ok I get it, removing the 3rd dimension for the regression case. I agree, we can override the apply method in … 1- fix the docstring to avoid mentioning n_classes …
+            For each datapoint x in X and for each tree in the ensemble,
+            return the index of the leaf x ends up in in each estimator.
+            In the case of binary classification n_classes is 1.
+        """
+        self._check_initialized()
+        X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
+
+        # n_classes will be equal to 1 in the binary classification or the
+        # regression case.
+        n_estimators, n_classes = self.estimators_.shape
Review comments:

Maybe add an inline comment here that says that …

n_classes is 0; we changed the shape to [n_samples, n_estimators] in a regression context.

It's got to be …
+
+        leaves = np.zeros((X.shape[0], n_estimators, n_classes))
+
+        for i in range(n_estimators):
+            for j in range(n_classes):
+                estimator = self.estimators_[i, j]
+                leaves[:, i, j] = estimator.apply(X, check_input=False)
+
+        return leaves
+
 class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
     """Gradient Boosting for classification.
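As a usage sketch (not part of the diff; dataset and parameter values are arbitrary), the base implementation above returns a 3-D array of leaf indices, with a trailing dimension of 1 for binary problems as the docstring states:

# Sketch of calling the new apply() on a fitted binary classifier.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
clf = GradientBoostingClassifier(n_estimators=10, random_state=0).fit(X, y)

leaves = clf.apply(X)
# One leaf index per (sample, boosting stage, class); binary => trailing dim of 1.
print(leaves.shape)   # (100, 10, 1)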
@@ -1704,3 +1739,25 @@ def staged_predict(self, X):
         """
         for y in self._staged_decision_function(X):
             yield y.ravel()
+
+    def apply(self, X):
+        """Apply trees in the ensemble to X, return leaf indices.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape = [n_samples, n_features]
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        X_leaves : array_like, shape = [n_samples, n_estimators]
+            For each datapoint x in X and for each tree in the ensemble,
+            return the index of the leaf x ends up in in each estimator.
+        """
+        leaves = super(GradientBoostingRegressor, self).apply(X)
+        leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0])
+        return leaves
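The regressor override drops the class dimension, so an analogous sketch (again illustrative only, with arbitrary data) yields a 2-D array:

# Sketch: GradientBoostingRegressor.apply() returns 2-D leaf indices.
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
reg = GradientBoostingRegressor(n_estimators=10, random_state=0).fit(X, y)

leaves = reg.apply(X)
print(leaves.shape)   # (100, 10), no trailing class dimension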
Changes to sklearn/ensemble/tests/test_gradient_boosting.py:
@@ -62,6 +62,9 @@ def test_classification_toy():
     assert np.any(deviance_decrease >= 0.0), \
         "Train deviance does not monotonically decrease."

+    leaves = clf.apply(X)
+    assert_equal(leaves.shape, (6, 10, 1))
+
Review comments:

If we follow the docstring strictly, we should expect a …

The docstring has been updated to say 1 for binary classification.
 def test_parameter_checks():
     # Check input parameter validation.
@@ -182,6 +185,9 @@ def test_boston():
         assert_raises(ValueError, clf.predict, boston.data)
         clf.fit(boston.data, boston.target,
                 sample_weight=sample_weight)
+        leaves = clf.apply(boston.data)
+        assert_equal(leaves.shape, (506, 100))
+
         y_pred = clf.predict(boston.data)
         mse = mean_squared_error(boston.target, y_pred)
         assert mse < 6.0, "Failed with loss %s and " \
@@ -207,6 +213,9 @@ def test_iris():
         assert score > 0.9, "Failed with subsample %.1f " \
             "and score = %f" % (subsample, score)

+        leaves = clf.apply(iris.data)
+        assert_equal(leaves.shape, (150, 100, 3))
+

 def test_regression_synthetic():
     # Test on synthetic regression datasets used in Leo Breiman,
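For the multiclass case exercised by test_iris above, the third dimension is the number of classes. A hedged sketch of that shape, followed by one common downstream use of apply, flattening the leaf indices and one-hot encoding them as features (the encoding step is my own illustration, not something this diff adds):

# Sketch (not part of the diff): multiclass apply() output, then leaf
# one-hot encoding as an example of downstream use.
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

iris = load_iris()
clf = GradientBoostingClassifier(n_estimators=100).fit(iris.data, iris.target)

leaves = clf.apply(iris.data)
print(leaves.shape)                      # (150, 100, 3), as asserted in test_iris

# Collapse (n_estimators, n_classes) into one axis of per-tree leaf ids,
# then encode each leaf as a binary indicator feature.
flat = leaves.reshape(leaves.shape[0], -1)
leaf_features = OneHotEncoder().fit_transform(flat)
print(leaf_features.shape)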
@@ -1012,3 +1021,4 @@ def test_non_uniform_weights_toy_edge_case_clf():
     gb = GradientBoostingClassifier(n_estimators=5)
     gb.fit(X, y, sample_weight=sample_weight)
     assert_array_equal(gb.predict([[1, 0]]), [1])
Review comments:

I think it would be great to do a second plot that zooms on the top left corner of the first plot to see the difference among the best models:

To create several figures in a single example you should use plt.figure() (check git grep "plt.figure()" to see other examples).
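A sketch of what the suggested second, zoomed figure could look like (the ROC data below is synthetic filler just to keep the snippet self-contained; the actual example in the PR will differ):

# Sketch: open a second figure with plt.figure() and zoom on the top-left corner.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve

X, y = make_classification(n_samples=500, random_state=0)
clf = GradientBoostingClassifier(n_estimators=50).fit(X, y)
fpr, tpr, _ = roc_curve(y, clf.predict_proba(X)[:, 1])

plt.figure()                    # first figure: full ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.title('ROC curve')

plt.figure()                    # second figure: zoomed on the top-left corner
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.title('ROC curve (zoomed in at top left)')
plt.show()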