[MRG+1] Add an example and a method to analyse the decision tree structure #5487

Merged 13 commits on Oct 22, 2015

134 changes: 134 additions & 0 deletions examples/tree/unveil_tree_structure.py
@@ -0,0 +1,134 @@
"""
=========================================
Understanding the decision tree structure
=========================================

The decision tree structure can be analysed to gain further insight into the
relation between the features and the target to predict. In this example, we
show how to retrieve:

- the binary tree structure;
- the depth of each node and whether or not it is a leaf;
- the nodes that were reached by a sample using the ``decision_path`` method;
- the leaf that was reached by a sample using the ``apply`` method;
- the rules that were used to predict a sample;
- the decision path shared by a group of samples.

"""
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
estimator.fit(X_train, y_train)

# The decision estimator has an attribute called tree_ which stores the entire
# tree structure and allows access to low-level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of each
# array holds information about the node `i`. Node 0 is the tree's root. NOTE:
# Some of the arrays only apply to either leaves or split nodes. In that case,
# the values for nodes of the other type are arbitrary!
#
# Among these arrays, we have:
# - left_child, id of the left child of the node
# - right_child, id of the right child of the node
# - feature, index of the feature used for splitting the node
# - threshold, threshold value at the node
#

# Using those arrays, we can parse the tree structure:

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold


# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)  # int, so depth * "\t" works below
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True
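
# Added sanity check (a sketch, not part of the original example): the deepest
# node found by this traversal should match the depth reported by the tree.
assert node_depth.max() == estimator.tree_.max_depth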

print("The binary tree structure has %s nodes and has "
"the following tree structure:"
% n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method returns a node indicator matrix: a non-zero element at position
# (i, j) indicates that sample i goes through node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also retrieve the id of the leaf reached by each sample.

leave_id = estimator.apply(X_test)
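
# Added cross-check (a sketch, not part of the original example): each
# sample's decision path must contain the leaf returned by ``apply``.
for sid in range(X_test.shape[0]):
    path = node_indicator.indices[node_indicator.indptr[sid]:
                                  node_indicator.indptr[sid + 1]]
    assert leave_id[sid] in path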

# Now, it's possible to retrieve the tests that were used to predict a sample
# or a group of samples. First, let's do it for a single sample.

Contributor: 2 whitespaces before 'sample'

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]
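
# Added note (an equivalent formulation, not in the original example): with a
# scipy CSR matrix, the column indices of a single-row slice give the same ids:
#   node_index = node_indicator[sample_id].indices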

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    # Skip the leaf node: feature and threshold are arbitrary there.
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we can retrieve the nodes that the samples
# have in common.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
% (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))
35 changes: 35 additions & 0 deletions sklearn/ensemble/forest.py
@@ -48,6 +48,8 @@ class calls the ``fit`` method of each sub-estimator on random samples

import numpy as np
from scipy.sparse import issparse
from scipy.sparse import hstack as sparse_hstack


from ..base import ClassifierMixin, RegressorMixin
from ..externals.joblib import Parallel, delayed
@@ -182,6 +184,39 @@ def apply(self, X):

        return np.array(results).T

    def decision_path(self, X):
        """Return the decision path in the forest

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        indicator : sparse csr array, shape = [n_samples, n_nodes]
            Return a node indicator matrix where non-zero elements
            indicate that the sample goes through the node.

        n_nodes_ptr : array of size (n_estimators + 1, )
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator value for the i-th estimator.
        """
        X = self._validate_X_predict(X)
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(_parallel_helper)(tree, 'decision_path', X,
                                      check_input=False)
            for tree in self.estimators_)

        n_nodes = [0]
        n_nodes.extend([i.shape[1] for i in indicators])
        n_nodes_ptr = np.array(n_nodes).cumsum()

        return sparse_hstack(indicators).tocsr(), n_nodes_ptr
Member Author: @glouppe Should I keep the original coo format or make the conversion to csr? In some cases, we might want a csc matrix.

Contributor: I am happy with csr, as for single trees.

Member: In what cases would we want a csc matrix? Given that you can still convert from csr to csc, and csr is the more common case, I think csr is the preferable option here.

Member: Can you also remind me what the purpose of n_nodes_ptr is? Is this information not stored in the csr matrix?

Member Author: n_nodes_ptr gives the column slice of the csr matrix where you can find the node indicators related to a given estimator. The indptr of the csr matrix tells you the row slice where you have the non-zero elements for each sample.
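
To make the slicing concrete, here is a minimal sketch (assuming a fitted forest ``est`` and input ``X``; not part of the diff):

    indicator, n_nodes_ptr = est.decision_path(X)
    i = 0  # index of the tree of interest
    tree_indicator = indicator[:, n_nodes_ptr[i]:n_nodes_ptr[i + 1]]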


    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).
31 changes: 30 additions & 1 deletion sklearn/ensemble/tests/test_forest.py
@@ -138,7 +138,7 @@ def check_boston_criterion(name, criterion):
    # Check consistency on dataset boston house prices.
    ForestRegressor = FOREST_REGRESSORS[name]

    clf = ForestRegressor(n_estimators=5, criterion=criterion,
                          random_state=1)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
@@ -1098,3 +1098,32 @@ def test_dtype_convert(n_classes=15):
    result = classifier.fit(X, y).predict(X)
    assert_array_equal(classifier.classes_, y)
    assert_array_equal(result, y)


def check_decision_path(name):
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    n_samples = X.shape[0]
    ForestEstimator = FOREST_ESTIMATORS[name]
    est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False,
                          random_state=1)
    est.fit(X, y)
    indicator, n_nodes_ptr = est.decision_path(X)

    assert_equal(indicator.shape[1], n_nodes_ptr[-1])
    assert_equal(indicator.shape[0], n_samples)
    assert_array_equal(np.diff(n_nodes_ptr),
                       [e.tree_.node_count for e in est.estimators_])

    # Assert that the leaf indices are correct
    leaves = est.apply(X)
    for est_id in range(leaves.shape[1]):
        leave_indicator = [indicator[i, n_nodes_ptr[est_id] + j]
                           for i, j in enumerate(leaves[:, est_id])]
        assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))

Contributor: I don't understand the almost_equal


def test_decision_path():
    for name in FOREST_CLASSIFIERS:
        yield check_decision_path, name
    for name in FOREST_REGRESSORS:
        yield check_decision_path, name