
Commit cd8a5c9

ENH apply method added to Gradient Boosting
1 parent 77c963d commit cd8a5c9

File tree

doc/whats_new.rst
examples/ensemble/plot_feature_transformation.py
sklearn/ensemble/gradient_boosting.py
sklearn/ensemble/tests/test_gradient_boosting.py

4 files changed: +79 -19 lines


doc/whats_new.rst (+5)

@@ -156,6 +156,11 @@ Enhancements
    visible with extra trees and on datasets with categorical or sparse
    features. By `Arnaud Joly`_.
 
+  - :class:`ensemble.GradientBoostingRegressor` and
+    :class:`ensemble.GradientBoostingClassifier` now expose an ``apply``
+    method for retrieving the leaf indices each sample ends up in under
+    each tree. By `Jacob Schreiber`_.
+
 Bug fixes
 .........
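A minimal usage sketch of the new method (the toy data below is invented for illustration; with 6 samples, 10 estimators and a binary target, apply() returns an array of shape (6, 10, 1), matching the test added in this commit):

    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier

    # Hypothetical toy data: 6 samples, binary labels.
    X = np.array([[0.], [1.], [2.], [3.], [4.], [5.]])
    y = np.array([0, 0, 0, 1, 1, 1])

    clf = GradientBoostingClassifier(n_estimators=10).fit(X, y)

    # One leaf index per (sample, boosting stage, class); the trailing
    # class axis has length 1 for binary classification.
    leaves = clf.apply(X)
    print(leaves.shape)   # (6, 10, 1)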

examples/ensemble/plot_feature_transformation.py (+3, -12)

@@ -70,24 +70,15 @@
 y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
 fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
 
-
-# Supervised transformation based on gradient boosted trees. Demonstrates
-# the use of each tree's apply() method.
-def gradient_apply(clf, X):
-    X_trans = []
-    for tree in clf.estimators_.ravel():
-        X_trans.append(tree.apply(X))
-    return np.array(X_trans).T
-
 grd = GradientBoostingClassifier(n_estimators=n_estimator)
 grd_enc = OneHotEncoder()
 grd_lm = LogisticRegression()
 grd.fit(X_train, y_train)
-grd_enc.fit(gradient_apply(grd, X_train))
-grd_lm.fit(grd_enc.transform(gradient_apply(grd, X_train_lr)), y_train_lr)
+grd_enc.fit(grd.apply(X_train)[:, :, 0])
+grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
 
 y_pred_grd_lm = grd_lm.predict_proba(
-    grd_enc.transform(gradient_apply(grd, X_test)))[:, 1]
+    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
 fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
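Why the example slices with [:, :, 0]: for a binary classifier apply() returns an array of shape (n_samples, n_estimators, 1), while OneHotEncoder expects 2-D input, so the length-1 class axis is dropped before encoding. A short sketch under that assumption, reusing the names from the example above:

    leaves = grd.apply(X_train)   # shape (n_samples, n_estimators, 1)
    leaves_2d = leaves[:, :, 0]   # drop the length-1 class axis
    grd_enc.fit(leaves_2d)        # each tree's leaf index becomes a categorical feature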

sklearn/ensemble/gradient_boosting.py (+61, -7)

@@ -17,7 +17,7 @@
 """
 
 # Authors: Peter Prettenhofer, Scott White, Gilles Louppe, Emanuele Olivetti,
-#          Arnaud Joly
+#          Arnaud Joly, Jacob Schreiber
 # License: BSD 3 clause
 
 from __future__ import print_function
@@ -898,6 +898,13 @@ def _resize_state(self):
     def _is_initialized(self):
         return len(getattr(self, 'estimators_', [])) > 0
 
+    def _check_initialized(self):
+        """Check that the estimator is initialized, raising an error if not."""
+        if self.estimators_ is None or len(self.estimators_) == 0:
+            raise NotFittedError("Estimator not fitted, call `fit`"
+                                 " before making predictions`.")
+
+
     def fit(self, X, y, sample_weight=None, monitor=None):
         """Fit the gradient boosting model.
@@ -1067,9 +1074,7 @@ def _make_estimator(self, append=True):
 
     def _init_decision_function(self, X):
         """Check input and compute prediction of ``init``. """
-        if self.estimators_ is None or len(self.estimators_) == 0:
-            raise NotFittedError("Estimator not fitted, call `fit`"
-                                 " before making predictions`.")
+        self._check_initialized()
         if X.shape[1] != self.n_features:
             raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
                 self.n_features, X.shape[1]))
@@ -1164,9 +1169,7 @@ def feature_importances_(self):
         -------
         feature_importances_ : array, shape = [n_features]
         """
-        if self.estimators_ is None or len(self.estimators_) == 0:
-            raise NotFittedError("Estimator not fitted, call `fit` before"
-                                 " `feature_importances_`.")
+        self._check_initialized()
 
         total_sum = np.zeros((self.n_features, ), dtype=np.float64)
         for stage in self.estimators_:
@@ -1184,6 +1187,36 @@ def _validate_y(self, y):
         # Default implementation
         return y
 
+    def apply(self, X):
+        """Apply trees in the ensemble to X, return leaf indices.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape = [n_samples, n_features]
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        X_leaves : array_like, shape = [n_samples, n_estimators, n_classes]
+            For each datapoint x in X and for each tree in the ensemble,
+            return the index of the leaf x ends up in in each estimator.
+            In the case of binary classification n_classes is 1.
+        """
+
+        self._check_initialized()
+        X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
+
+        n_estimators, n_classes = self.estimators_.shape
+        leaves = np.zeros((X.shape[0], n_estimators, n_classes))
+
+        for i in range(n_estimators):
+            for j in range(n_classes):
+                estimator = self.estimators_[i, j]
+                leaves[:, i, j] = estimator.apply(X, check_input=False)
+
+        return leaves
 
 class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
     """Gradient Boosting for classification.
@@ -1704,3 +1737,24 @@ def staged_predict(self, X):
         """
         for y in self._staged_decision_function(X):
             yield y.ravel()
+
+    def apply(self, X):
+        """Apply trees in the ensemble to X, return leaf indices.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape = [n_samples, n_features]
+            The input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        X_leaves : array_like, shape = [n_samples, n_estimators]
+            For each datapoint x in X and for each tree in the ensemble,
+            return the index of the leaf x ends up in in each estimator.
+        """
+
+        leaves = super(GradientBoostingRegressor, self).apply(X)
+        leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0])
+        return leaves
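A quick sketch contrasting the two return shapes (illustrative synthetic data only; the classifier keeps a trailing class axis because boosting fits one tree per class per stage, while the regressor flattens it since it fits a single tree per stage):

    from sklearn.datasets import make_classification, make_regression
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)

    Xc, yc = make_classification(n_samples=50, n_informative=4, n_classes=3,
                                 random_state=0)
    Xr, yr = make_regression(n_samples=50, random_state=0)

    clf = GradientBoostingClassifier(n_estimators=20).fit(Xc, yc)
    reg = GradientBoostingRegressor(n_estimators=20).fit(Xr, yr)

    print(clf.apply(Xc).shape)   # (50, 20, 3): one tree per class per stage
    print(reg.apply(Xr).shape)   # (50, 20): one tree per stage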

sklearn/ensemble/tests/test_gradient_boosting.py (+10)

@@ -62,6 +62,9 @@ def test_classification_toy():
     assert np.any(deviance_decrease >= 0.0), \
         "Train deviance does not monotonically decrease."
 
+    leaves = clf.apply(X)
+    assert_equal(leaves.shape, (6, 10, 1))
+
 
 def test_parameter_checks():
     # Check input parameter validation.
@@ -182,6 +185,9 @@ def test_boston():
         assert_raises(ValueError, clf.predict, boston.data)
         clf.fit(boston.data, boston.target,
                 sample_weight=sample_weight)
+        leaves = clf.apply(boston.data)
+        assert_equal(leaves.shape, (506, 100))
+
         y_pred = clf.predict(boston.data)
         mse = mean_squared_error(boston.target, y_pred)
         assert mse < 6.0, "Failed with loss %s and " \
@@ -207,6 +213,9 @@ def test_iris():
         assert score > 0.9, "Failed with subsample %.1f " \
             "and score = %f" % (subsample, score)
 
+        leaves = clf.apply(iris.data)
+        assert_equal(leaves.shape, (150, 100, 3))
+
 
 def test_regression_synthetic():
     # Test on synthetic regression datasets used in Leo Breiman,
@@ -1012,3 +1021,4 @@ def test_non_uniform_weights_toy_edge_case_clf():
     gb = GradientBoostingClassifier(n_estimators=5)
     gb.fit(X, y, sample_weight=sample_weight)
     assert_array_equal(gb.predict([[1, 0]]), [1])
+
