[MRG+1] Add an example and a method to analyse the decision tree structure #5487

Merged 13 commits on Oct 22, 2015

134 changes: 134 additions & 0 deletions examples/tree/unveil_tree_structure.py
@@ -0,0 +1,134 @@
"""
=========================================
Understanding the decision tree structure
=========================================

The decision tree structure can be analysed to gain further insight into the
relation between the features and the target to predict. In this example, we
show how to retrieve:

- the binary tree structure;
- the depth of each node and whether or not it is a leaf;
- the nodes that were reached by a sample using the ``decision_path`` method;
- the leaf that was reached by a sample using the ``apply`` method;
- the rules that were used to predict a sample;
- the decision path shared by a group of samples.

"""
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
estimator.fit(X_train, y_train)

# The decision estimator has an attribute called tree_ which stores the entire
# tree structure and allows access to low-level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of each
# array holds information about the node `i`. Node 0 is the tree's root. NOTE:
# Some of the arrays only apply to either leaves or split nodes. In that case,
# the values for nodes of the other type are arbitrary!
#
# Among these arrays, we have:
# - left_child, id of the left child of the node
# - right_child, id of the right child of the node
# - feature, index of the feature used for splitting the node
# - threshold, threshold value at the node
#

# Using those arrays, we can parse the tree structure:

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold


# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)  # int, so depth * "\t" works below
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True
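
# Added sanity check (a sketch, not part of the original example): the deepest
# node found by this traversal should match the depth reported by the tree.
assert node_depth.max() == estimator.tree_.max_depth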

print("The binary tree structure has %s nodes and has "
"the following tree structure:"
% n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method returns a node indicator matrix: a non-zero element at position
# (i, j) indicates that sample i goes through node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also retrieve the id of the leaf reached by each sample.

leave_id = estimator.apply(X_test)
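
# Added cross-check (a sketch, not part of the original example): each
# sample's decision path must contain the leaf returned by ``apply``.
for sid in range(X_test.shape[0]):
    path = node_indicator.indices[node_indicator.indptr[sid]:
                                  node_indicator.indptr[sid + 1]]
    assert leave_id[sid] in path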

# Now, it's possible to retrieve the tests that were used to predict a sample
# or a group of samples. First, let's do it for a single sample.

Contributor: 2 whitespaces before 'sample'

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]
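
# Added note (an equivalent formulation, not in the original example): with a
# scipy CSR matrix, the column indices of a single-row slice give the same ids:
#   node_index = node_indicator[sample_id].indices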

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    # Skip the leaf node: feature and threshold are arbitrary there.
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we can retrieve the nodes that the samples
# have in common.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
% (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))
35 changes: 35 additions & 0 deletions sklearn/ensemble/forest.py
@@ -48,6 +48,8 @@ class calls the ``fit`` method of each sub-estimator on random samples

import numpy as np
from scipy.sparse import issparse
from scipy.sparse import hstack as sparse_hstack


from ..base import ClassifierMixin, RegressorMixin
from ..externals.joblib import Parallel, delayed
@@ -182,6 +184,39 @@ def apply(self, X):

        return np.array(results).T

    def decision_path(self, X):
        """Return the decision path in the forest

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        indicator : sparse csr array, shape = [n_samples, n_nodes]
            Return a node indicator matrix where non-zero elements
            indicate that the sample goes through the node.

        n_nodes_ptr : array of size (n_estimators + 1, )
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator value for the i-th estimator.
        """
        X = self._validate_X_predict(X)
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(_parallel_helper)(tree, 'decision_path', X,
                                      check_input=False)
            for tree in self.estimators_)

        n_nodes = [0]
        n_nodes.extend([i.shape[1] for i in indicators])
        n_nodes_ptr = np.array(n_nodes).cumsum()

        return sparse_hstack(indicators).tocsr(), n_nodes_ptr
Member Author: @glouppe Should I keep the original coo format or make the conversion to csr? In some cases, we might want a csc matrix.

Contributor: I am happy with csr, as for single trees.

Member: In what cases would we want a csc matrix? Given that you can still convert from csr to csc, and csr is the more common case, I think csr is the preferable option here.

Member: Can you also remind me what the purpose of n_nodes_ptr is? Is this information not stored in the csr matrix?

Member Author: n_nodes_ptr gives the column slice of the csr matrix where you can find the node indicators related to a given estimator. The indptr of the csr matrix tells you the row slice where you have the non-zero elements for each sample.
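
To make the slicing concrete, here is a minimal sketch (assuming a fitted forest ``est`` and input ``X``; not part of the diff):

    indicator, n_nodes_ptr = est.decision_path(X)
    i = 0  # index of the tree of interest
    tree_indicator = indicator[:, n_nodes_ptr[i]:n_nodes_ptr[i + 1]]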


    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).
31 changes: 30 additions & 1 deletion sklearn/ensemble/tests/test_forest.py
@@ -138,7 +138,7 @@ def check_boston_criterion(name, criterion):
    # Check consistency on dataset boston house prices.
    ForestRegressor = FOREST_REGRESSORS[name]

    clf = ForestRegressor(n_estimators=5, criterion=criterion,
                          random_state=1)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
@@ -1098,3 +1098,32 @@ def test_dtype_convert(n_classes=15):
    result = classifier.fit(X, y).predict(X)
    assert_array_equal(classifier.classes_, y)
    assert_array_equal(result, y)


def check_decision_path(name):
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    n_samples = X.shape[0]
    ForestEstimator = FOREST_ESTIMATORS[name]
    est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False,
                          random_state=1)
    est.fit(X, y)
    indicator, n_nodes_ptr = est.decision_path(X)

    assert_equal(indicator.shape[1], n_nodes_ptr[-1])
    assert_equal(indicator.shape[0], n_samples)
    assert_array_equal(np.diff(n_nodes_ptr),
                       [e.tree_.node_count for e in est.estimators_])

    # Assert that the leaf indices are correct
    leaves = est.apply(X)
    for est_id in range(leaves.shape[1]):
        leave_indicator = [indicator[i, n_nodes_ptr[est_id] + j]
                           for i, j in enumerate(leaves[:, est_id])]
        assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))

Contributor: I don't understand the almost_equal


def test_decision_path():
    for name in FOREST_CLASSIFIERS:
        yield check_decision_path, name
    for name in FOREST_REGRESSORS:
        yield check_decision_path, name