[MRG+1] Add an example and a method to analyse the decision tree structure #5487
The first file in the diff adds the new example script:

@@ -0,0 +1,134 @@
""" | ||
========================================= | ||
Understanding the decision tree structure | ||
========================================= | ||
|
||
The decision tree structure can be analysed to gain further insight on the | ||
relation between the features and the target to predict. In this example, we | ||
show how to retrieve: | ||
|
||
- the binary tree structure; | ||
- the depth of each node and whether or not it's a leaf; | ||
- the nodes that were reached by a sample using the ``decision_path`` method; | ||
- the leaf that was reached by a sample using the apply method; | ||
- the rules that were used to predict a sample; | ||
- the decision path shared by a group of samples. | ||
|
||
""" | ||
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
estimator.fit(X_train, y_train)

# The decision estimator has an attribute called tree_ which stores the entire
# tree structure and allows access to low-level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of
# each array holds information about the node `i`. Node 0 is the tree's root.
# NOTE: some of the arrays only apply to either leaves or split nodes; in
# that case the values for nodes of the other type are arbitrary!
#
# Among those arrays, we have:
#   - left_child, id of the left child of the node
#   - right_child, id of the right child of the node
#   - feature, feature used for splitting the node
#   - threshold, threshold value at the node

# Using those arrays, we can parse the tree structure:

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold

# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
# node_depth must hold integers: it is used below to repeat "\t" strings,
# which fails with a float dtype.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if children_left[node_id] != children_right[node_id]:
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

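# For readers who prefer recursion, an equivalent traversal can be written as
# a recursive function. This sketch is an illustrative addition (not part of
# the diff); it recomputes the same node_depth and is_leaves arrays, assuming
# the tree depth stays well below Python's recursion limit.

def compute_depth(node_id, depth):
    node_depth[node_id] = depth
    if children_left[node_id] != children_right[node_id]:  # test node
        compute_depth(children_left[node_id], depth + 1)
        compute_depth(children_right[node_id], depth + 1)
    else:  # leaf node
        is_leaves[node_id] = True

compute_depth(0, 0)  # start from the root, which has depth 0
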
print("The binary tree structure has %s nodes and has " | ||
"the following tree structure:" | ||
% n_nodes) | ||
for i in range(n_nodes): | ||
if is_leaves[i]: | ||
print("%snode=%s leaf node." % (node_depth[i] * "\t", i)) | ||
else: | ||
print("%snode=%s test node: go to node %s if X[:, %s] <= %ss else to " | ||
"node %s." | ||
% (node_depth[i] * "\t", | ||
i, | ||
children_left[i], | ||
feature[i], | ||
threshold[i], | ||
children_right[i], | ||
)) | ||
print() | ||
|
||
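# As a side note (an illustrative addition, not part of the diff), the same
# structure can be rendered graphically with sklearn.tree.export_graphviz,
# which writes the tree in Graphviz dot format; "tree.dot" is just an
# illustrative file name.
from sklearn.tree import export_graphviz

# Render the output with the graphviz command line tool, e.g.:
#     dot -Tpng tree.dot -o tree.png
export_graphviz(estimator, out_file="tree.dot")
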
# First let's retrieve the decision path of each sample. The decision_path
# method returns a node indicator matrix: a non-zero element at position
# (i, j) indicates that sample i goes through node j.

node_indicator = estimator.decision_path(X_test)

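# Since node_indicator is a scipy CSR sparse matrix, one quick way to inspect
# it on a small test set (an illustrative addition, not part of the diff) is
# to densify a few rows: each row is one sample, each column one node, and a
# one marks a visited node.
print(node_indicator[:5].toarray())
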
# Similarly, we can also retrieve the ids of the leaves reached by each
# sample.

leave_id = estimator.apply(X_test)

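# The two methods are consistent: the leaf returned by apply is always on the
# sample's decision path, and it is the node with the largest id on that path
# because scikit-learn creates child nodes after their parents. The following
# sanity check is an illustrative addition, not part of the diff.
for i in range(X_test.shape[0]):
    path = node_indicator.indices[node_indicator.indptr[i]:
                                  node_indicator.indptr[i + 1]]
    assert leave_id[i] == path.max()
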
# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's do it for a single sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    # Skip the leaf node: it carries no split test. (The original diff had
    # `!=` here, which skipped every node except the leaf.)
    if leave_id[sample_id] == node_id:
        continue

    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we can retrieve the nodes that the samples have in
# common.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the nodes %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100.0 * len(common_node_id) / n_nodes,))
The next file in the diff adds a ``decision_path`` method to the forest estimators:

@@ -48,6 +48,8 @@ class calls the ``fit`` method of each sub-estimator on random samples
import numpy as np
from scipy.sparse import issparse
from scipy.sparse import hstack as sparse_hstack

from ..base import ClassifierMixin, RegressorMixin
from ..externals.joblib import Parallel, delayed
@@ -182,6 +184,39 @@ def apply(self, X):

    return np.array(results).T
def decision_path(self, X):
    """Return the decision path in the forest

    Parameters
    ----------
    X : array-like or sparse matrix, shape = [n_samples, n_features]
        The input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csr_matrix``.

    Returns
    -------
    indicator : sparse csr array, shape = [n_samples, n_nodes]
        A node indicator matrix where non-zero elements indicate
        that the samples go through the nodes.

    n_nodes_ptr : array of size (n_estimators + 1, )
        The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
        give the indicator value for the i-th estimator.
    """
    X = self._validate_X_predict(X)
    indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                          backend="threading")(
        delayed(_parallel_helper)(tree, 'decision_path', X,
                                  check_input=False)
        for tree in self.estimators_)

    n_nodes = [0]
    n_nodes.extend([i.shape[1] for i in indicators])
    n_nodes_ptr = np.array(n_nodes).cumsum()

    return sparse_hstack(indicators).tocsr(), n_nodes_ptr

Review comments on this hunk:

- @glouppe Should I keep the original format coo or make the conversion to csr? In some cases, we might want a csc matrix.
- I am happy with csr, as for single trees.
- In what cases would we want a csc matrix? Given that you can still convert from csr to csc, and csr is the more used case, I think csr is the preferable option here.
- Can you also remind me what the purpose of n_nodes_ptr is? Is this information not stored in the csr matrix?
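To the last question: the stacked CSR matrix alone does not record where one tree's columns end and the next tree's begin, which is exactly what n_nodes_ptr encodes. A minimal usage sketch (my illustration, not part of the PR; it assumes a fitted RandomForestClassifier):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
forest = RandomForestClassifier(n_estimators=3, random_state=0)
forest.fit(iris.data, iris.target)

indicator, n_nodes_ptr = forest.decision_path(iris.data)

# Columns [n_nodes_ptr[i], n_nodes_ptr[i + 1]) belong to the i-th tree, so
# per-tree decision paths are recovered by column slicing.
for i in range(len(forest.estimators_)):
    tree_indicator = indicator[:, n_nodes_ptr[i]:n_nodes_ptr[i + 1]]
    print("tree %d has %d nodes" % (i, tree_indicator.shape[1]))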

def fit(self, X, y, sample_weight=None):
    """Build a forest of trees from the training set (X, y).
The last file in the diff updates the forest tests:

@@ -138,7 +138,7 @@ def check_boston_criterion(name, criterion):
    # Check consistency on dataset boston house prices.
    ForestRegressor = FOREST_REGRESSORS[name]

    # (old and new line differ only in whitespace)
    clf = ForestRegressor(n_estimators=5, criterion=criterion,
                          random_state=1)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
@@ -1098,3 +1098,32 @@ def test_dtype_convert(n_classes=15):

    result = classifier.fit(X, y).predict(X)
    assert_array_equal(classifier.classes_, y)
    assert_array_equal(result, y)

def check_decision_path(name):
    X, y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
    n_samples = X.shape[0]
    ForestEstimator = FOREST_ESTIMATORS[name]
    est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False,
                          random_state=1)
    est.fit(X, y)
    indicator, n_nodes_ptr = est.decision_path(X)

    assert_equal(indicator.shape[1], n_nodes_ptr[-1])
    assert_equal(indicator.shape[0], n_samples)
    assert_array_equal(np.diff(n_nodes_ptr),
                       [e.tree_.node_count for e in est.estimators_])

    # Assert that the leaf indices are correct
    leaves = est.apply(X)
    for est_id in range(leaves.shape[1]):
        leave_indicator = [indicator[i, n_nodes_ptr[est_id] + j]
                           for i, j in enumerate(leaves[:, est_id])]
        assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))

Review comment on this hunk:

- I don't understand the almost_equal
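A possible resolution (my suggestion, not from the PR thread): the indicator entries are exact 0/1 values, so an exact comparison expresses the intent more directly and sidesteps the question:

    # Exact equality is safe: the indicator stores integer 0/1 values, so no
    # floating-point rounding is involved.
    assert_array_equal(leave_indicator, np.ones(shape=n_samples))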

def test_decision_path():
    for name in FOREST_CLASSIFIERS:
        yield check_decision_path, name
    for name in FOREST_REGRESSORS:
        yield check_decision_path, name

Review comment:

- 2 whitespaces before 'sample'