diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 315b23486a1b0..374a2db42ad3f 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -306,6 +306,9 @@ the transformation performs an implicit, non-parametric density estimation.
   * :ref:`example_manifold_plot_lle_digits.py` compares non-linear
     dimensionality reduction techniques on handwritten digits.
 
+  * :ref:`example_ensemble_plot_feature_transformation.py` compares
+    supervised and unsupervised tree-based feature transformations.
+
 .. seealso::
 
    :ref:`manifold` techniques can also be useful to derive non-linear
diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py
new file mode 100644
index 0000000000000..2271c51daaf75
--- /dev/null
+++ b/examples/ensemble/plot_feature_transformation.py
@@ -0,0 +1,115 @@
+"""
+===============================================
+Feature transformations with ensembles of trees
+===============================================
+
+Transform your features into a higher-dimensional, sparse space. Then
+train a linear model on these features.
+
+First fit an ensemble of trees (totally random trees, a random
+forest, or gradient boosted trees) on the training set. Then each leaf
+of each tree in the ensemble is assigned a fixed, arbitrary feature
+index in a new feature space. These leaf indices are then encoded in a
+one-hot fashion.
+
+Each sample goes through the decisions of each tree of the ensemble
+and ends up in one leaf per tree. The sample is encoded by setting
+feature values for these leaves to 1 and the other feature values to 0.
+
+The resulting transformer has thus learned a supervised, sparse,
+high-dimensional categorical embedding of the data.
+
+"""
+
+# Author: Tim Head
+#
+# License: BSD 3 clause
+
+import numpy as np
+np.random.seed(10)
+
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
+                              GradientBoostingClassifier)
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_curve
+
+n_estimator = 10
+X, y = make_classification(n_samples=80000)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
+# It is important to train the ensemble of trees on a different subset
+# of the training data than the logistic regression model, to avoid
+# overfitting, in particular if the total number of leaves is
+# similar to the number of training samples.
+X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
+                                                            y_train,
+                                                            test_size=0.5)
+
+# Unsupervised transformation based on totally random trees
+rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
+rt_lm = LogisticRegression()
+rt.fit(X_train)
+rt_lm.fit(rt.transform(X_train_lr), y_train_lr)
+
+y_pred_rt = rt_lm.predict_proba(rt.transform(X_test))[:, 1]
+fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
+
+
+# Supervised transformation based on random forests
+rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
+rf_enc = OneHotEncoder()
+rf_lm = LogisticRegression()
+rf.fit(X_train, y_train)
+rf_enc.fit(rf.apply(X_train))
+rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
+
+y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
+fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
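+
+# What does this transformation look like? A quick sketch (the names
+# ``leaves`` and ``one_hot`` below are illustrative only, not used in
+# the benchmark): rf.apply(X) returns an array of shape
+# (n_samples, n_estimators) holding the index of the leaf each sample
+# reaches in each tree, and the one-hot encoder maps it to a sparse
+# indicator matrix with one column per (tree, leaf) pair:
+#
+#     leaves = rf.apply(X_test[:2])       # shape (2, n_estimator)
+#     one_hot = rf_enc.transform(leaves)  # sparse, exactly one 1 per tree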
+
+
+# Supervised transformation based on gradient boosted trees. Demonstrates
+# the use of each individual tree's apply() method.
+def gradient_apply(clf, X):
+    """Return the leaf indices of X for every tree in the ensemble."""
+    X_trans = []
+    # clf.estimators_ is a 2d array of regression trees (one per boosting
+    # stage and class); ravel() iterates over all of them.
+    for tree in clf.estimators_.ravel():
+        X_trans.append(tree.apply(X))
+    # Transpose to get one row per sample and one column per tree.
+    return np.array(X_trans).T
+
+grd = GradientBoostingClassifier(n_estimators=n_estimator)
+grd_enc = OneHotEncoder()
+grd_lm = LogisticRegression()
+grd.fit(X_train, y_train)
+grd_enc.fit(gradient_apply(grd, X_train))
+grd_lm.fit(grd_enc.transform(gradient_apply(grd, X_train_lr)), y_train_lr)
+
+y_pred_grd_lm = grd_lm.predict_proba(
+    grd_enc.transform(gradient_apply(grd, X_test)))[:, 1]
+fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
+
+
+# The gradient boosted model by itself
+y_pred_grd = grd.predict_proba(X_test)[:, 1]
+fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
+
+
+# The random forest model by itself
+y_pred_rf = rf.predict_proba(X_test)[:, 1]
+fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
+
+
+plt.plot([0, 1], [0, 1], 'k--')
+plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
+plt.plot(fpr_rf, tpr_rf, label='RF')
+plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
+plt.plot(fpr_grd, tpr_grd, label='GBT')
+plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
+plt.xlabel('False positive rate')
+plt.ylabel('True positive rate')
+plt.title('ROC curve')
+plt.legend(loc='best')
+plt.show()
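+
+# Note: each transform-then-classify pair above could also be wrapped in
+# a single estimator. A sketch, assuming the objects defined above (the
+# name ``rt_pipeline`` is illustrative only):
+#
+#     from sklearn.pipeline import Pipeline
+#     rt_pipeline = Pipeline([('embed', rt), ('lm', rt_lm)])
+#
+# The steps are kept separate in this example so that the ensemble and
+# the logistic regression can be fit on disjoint halves of the training
+# data.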