FIX node values outside of [0, 1] range for monotonically constrained classification trees #27639

Merged
13 changes: 1 addition & 12 deletions sklearn/tree/_classes.py
@@ -1065,23 +1065,12 @@ class in a leaf.
proba = self.tree_.predict(X)

if self.n_outputs_ == 1:
proba = proba[:, : self.n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer

return proba

return proba[:, : self.n_classes_]
else:
all_proba = []

for k in range(self.n_outputs_):
proba_k = proba[:, k, : self.n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
all_proba.append(proba_k)

return all_proba

def predict_log_proba(self, X):
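A minimal sketch (not part of the diff) of why the renormalization in predict_proba could be dropped: this PR makes the tree store classifier node values as class proportions rather than weighted counts, so each row of tree_.value, and hence of predict_proba, already sums to 1. The dataset and parameters below are illustrative only.

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

    # With proportions stored in the tree, no renormalization is needed.
    node_values = clf.tree_.value[:, 0, :]  # shape (n_nodes, n_classes)
    np.testing.assert_allclose(node_values.sum(axis=1), 1.0)
    np.testing.assert_allclose(clf.predict_proba(X).sum(axis=1), 1.0)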
15 changes: 9 additions & 6 deletions sklearn/tree/_criterion.pyx
@@ -568,15 +568,18 @@ cdef class ClassificationCriterion(Criterion):
dest : float64_t pointer
The memory address which we will save the node value into.
"""
cdef intp_t k
cdef intp_t k, c

for k in range(self.n_outputs):
memcpy(dest, &self.sum_total[k, 0], self.n_classes[k] * sizeof(float64_t))
for c in range(self.n_classes[k]):
dest[c] = self.sum_total[k, c] / self.weighted_n_node_samples
dest += self.max_n_classes

cdef void clip_node_value(self, float64_t * dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil:
"""Clip the value in dest between lower_bound and upper_bound for monotonic constraints.

cdef inline void clip_node_value(
self, float64_t * dest, float64_t lower_bound, float64_t upper_bound
) noexcept nogil:
"""Clip the values in dest such that predicted probabilities stay between
`lower_bound` and `upper_bound` when monotonic constraints are enforced.
Note that monotonicity constraints are only supported for:
- single-output trees and
- binary classifications.
Expand All @@ -586,7 +589,7 @@ cdef class ClassificationCriterion(Criterion):
elif dest[0] > upper_bound:
dest[0] = upper_bound

# Class proportions for binary classification must sum to 1.
# Values for binary classification must sum to 1.
dest[1] = 1 - dest[0]

cdef inline float64_t middle_value(self) noexcept nogil:
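A hedged NumPy sketch (plain Python rather than the Cython implementation, with made-up numbers) of what the updated node_value/clip_node_value pair computes for a single-output binary tree: node values become class proportions, and clipping the first entry keeps the pair summing to 1.

    import numpy as np

    def node_value(sum_total, weighted_n_node_samples):
        # sum_total holds the node's weighted class counts; return proportions.
        return sum_total / weighted_n_node_samples

    def clip_node_value(dest, lower_bound, upper_bound):
        # Clip the first class value to honor the monotonic constraint, then
        # keep the binary pair summing to 1 (mirrors `dest[1] = 1 - dest[0]`).
        dest[0] = np.clip(dest[0], lower_bound, upper_bound)
        dest[1] = 1.0 - dest[0]
        return dest

    value = node_value(np.array([3.0, 7.0]), 10.0)  # -> [0.3, 0.7]
    print(clip_node_value(value, 0.4, 0.6))         # -> [0.4, 0.6]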
43 changes: 26 additions & 17 deletions sklearn/tree/_export.py
@@ -271,14 +271,15 @@ def get_fill_color(self, tree, node_id):
# Find max and min values in leaf nodes for regression
self.colors["bounds"] = (np.min(tree.value), np.max(tree.value))
if tree.n_outputs == 1:
node_val = tree.value[node_id][0, :] / tree.weighted_n_node_samples[node_id]
if tree.n_classes[0] == 1:
# Regression or degraded classification with single class
node_val = tree.value[node_id][0, :]
if isinstance(node_val, Iterable) and self.colors["bounds"] is not None:
# Only unpack the float only for the regression tree case.
# Classification tree requires an Iterable in `get_color`.
node_val = node_val.item()
node_val = tree.value[node_id][0, :]
if (
tree.n_classes[0] == 1
and isinstance(node_val, Iterable)
and self.colors["bounds"] is not None
):
Review comment (Member):
This is indeed much cleaner.

# Unpack the float only for the regression tree case.
# Classification tree requires an Iterable in `get_color`.
node_val = node_val.item()
else:
# If multi-output color node by impurity
node_val = -tree.impurity[node_id]
@@ -347,9 +348,9 @@ def node_to_str(self, tree, node_id, criterion):
node_string += str(tree.n_node_samples[node_id]) + characters[4]

# Write node class distribution / regression value
if self.proportion and tree.n_classes[0] != 1:
if not self.proportion and tree.n_classes[0] != 1:
# For classification this will show the proportion of samples
value = value / tree.weighted_n_node_samples[node_id]
value = value * tree.weighted_n_node_samples[node_id]
if labels:
node_string += "value = "
if tree.n_classes[0] == 1:
@@ -1072,14 +1073,20 @@ def export_text(

export_text.report = ""

def _add_leaf(value, class_name, indent):
def _add_leaf(value, weighted_n_node_samples, class_name, indent):
val = ""
is_classification = isinstance(decision_tree, DecisionTreeClassifier)
if show_weights or not is_classification:
if isinstance(decision_tree, DecisionTreeClassifier):
if show_weights:
val = [
"{1:.{0}f}, ".format(decimals, v * weighted_n_node_samples)
for v in value
]
val = "[" + "".join(val)[:-2] + "]"
val += " class: " + str(class_name)
else:
val = ["{1:.{0}f}, ".format(decimals, v) for v in value]
val = "[" + "".join(val)[:-2] + "]"
if is_classification:
val += " class: " + str(class_name)
export_text.report += value_fmt.format(indent, "", val)

def print_tree_recurse(node, depth):
@@ -1096,6 +1103,8 @@ def print_tree_recurse(node, depth):
if tree_.n_classes[0] != 1 and tree_.n_outputs == 1:
class_name = class_names[class_name]

weighted_n_node_samples = tree_.weighted_n_node_samples[node]

if depth <= max_depth + 1:
info_fmt = ""
info_fmt_left = info_fmt
@@ -1113,11 +1122,11 @@
export_text.report += info_fmt_right
print_tree_recurse(tree_.children_right[node], depth + 1)
else: # leaf
_add_leaf(value, class_name, indent)
_add_leaf(value, weighted_n_node_samples, class_name, indent)
else:
subtree_depth = _compute_depth(tree_, node)
if subtree_depth == 1:
_add_leaf(value, class_name, indent)
_add_leaf(value, weighted_n_node_samples, class_name, indent)
else:
trunc_report = "truncated branch of depth %d" % subtree_depth
export_text.report += truncation_fmt.format(indent, trunc_report)
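An illustrative sketch (not from the diff) of the export-side consequence: with node values stored as proportions, the exporters multiply by weighted_n_node_samples to recover the per-class counts that export_text(..., show_weights=True) and the graphviz exporter previously read directly from tree_.value.

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier, export_text

    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)

    # Recover weighted class counts for the root node from stored proportions.
    root_counts = clf.tree_.value[0, 0, :] * clf.tree_.weighted_n_node_samples[0]
    print(root_counts)  # approximately [50. 50. 50.] for iris
    print(export_text(clf, show_weights=True))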
16 changes: 11 additions & 5 deletions sklearn/tree/tests/test_monotonic_tree.py
@@ -14,6 +14,7 @@
ExtraTreeClassifier,
ExtraTreeRegressor,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS

TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier]
@@ -77,15 +78,20 @@ def test_monotonic_constraints_classifications(
if sparse_splitter:
X_train = csc_container(X_train)
est.fit(X_train, y_train)
y = est.predict_proba(X_test)[:, 1]
proba_test = est.predict_proba(X_test)

assert np.logical_and(
proba_test >= 0.0, proba_test <= 1.0
).all(), "Probability should always be in [0, 1] range."
assert_allclose(proba_test.sum(axis=1), 1.0)
Review comment on lines +83 to +86 (Member):
Do those tests fail without this PR? (They should!)

Reply (Contributor Author):
    assert np.logical_and(
        proba_test >= 0.0, proba_test <= 1.0
    ).all(), "Probability should always be in [0, 1] range."

definitely fails.

    assert_allclose(proba_test.sum(axis=1), 1.0)

doesn't though as we made sure the sum is 1..


# Monotonic increase constraint, it applies to the positive class
assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= y)
assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= y)
assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= proba_test[:, 1])
assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= proba_test[:, 1])

# Monotonic decrease constraint, it applies to the positive class
assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= y)
assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= y)
assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= proba_test[:, 1])
assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= proba_test[:, 1])


@pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES)
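An end-to-end sketch of the behavior the updated test asserts, assuming a scikit-learn build whose trees accept monotonic_cst; the synthetic data and the +0.5 bump are illustrative only. Probabilities stay in [0, 1], rows sum to 1, and increasing a positively constrained feature does not decrease the predicted probability of the positive class.

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X = rng.rand(300, 2)
    y = (X[:, 0] - X[:, 1] + 0.1 * rng.randn(300) > 0).astype(int)

    # Feature 0: monotonic increase; feature 1: monotonic decrease (positive class).
    est = DecisionTreeClassifier(
        max_depth=4, monotonic_cst=[1, -1], random_state=0
    ).fit(X, y)

    X_test = rng.rand(100, 2)
    proba = est.predict_proba(X_test)
    assert np.all((proba >= 0.0) & (proba <= 1.0))
    np.testing.assert_allclose(proba.sum(axis=1), 1.0)

    # Bumping the positively constrained feature must not decrease P(class 1).
    X_incr = X_test.copy()
    X_incr[:, 0] += 0.5
    assert np.all(est.predict_proba(X_incr)[:, 1] >= proba[:, 1])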