diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 03ba2f108bbdd..2c636d658fab9 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -1065,23 +1065,12 @@ class in a leaf.
         proba = self.tree_.predict(X)
 
         if self.n_outputs_ == 1:
-            proba = proba[:, : self.n_classes_]
-            normalizer = proba.sum(axis=1)[:, np.newaxis]
-            normalizer[normalizer == 0.0] = 1.0
-            proba /= normalizer
-
-            return proba
-
+            return proba[:, : self.n_classes_]
         else:
             all_proba = []
-
             for k in range(self.n_outputs_):
                 proba_k = proba[:, k, : self.n_classes_[k]]
-                normalizer = proba_k.sum(axis=1)[:, np.newaxis]
-                normalizer[normalizer == 0.0] = 1.0
-                proba_k /= normalizer
                 all_proba.append(proba_k)
-
             return all_proba
 
     def predict_log_proba(self, X):
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 89a7639f9bbcf..cb20db9ddb69c 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -568,15 +568,18 @@ cdef class ClassificationCriterion(Criterion):
         dest : float64_t pointer
             The memory address which we will save the node value into.
         """
-        cdef intp_t k
+        cdef intp_t k, c
 
         for k in range(self.n_outputs):
-            memcpy(dest, &self.sum_total[k, 0], self.n_classes[k] * sizeof(float64_t))
+            for c in range(self.n_classes[k]):
+                dest[c] = self.sum_total[k, c] / self.weighted_n_node_samples
             dest += self.max_n_classes
 
-    cdef void clip_node_value(self, float64_t * dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil:
-        """Clip the value in dest between lower_bound and upper_bound for monotonic constraints.
-
+    cdef inline void clip_node_value(
+        self, float64_t * dest, float64_t lower_bound, float64_t upper_bound
+    ) noexcept nogil:
+        """Clip the values in dest such that predicted probabilities stay between
+        `lower_bound` and `upper_bound` when monotonic constraints are enforced.
         Note that monotonicity constraints are only supported for:
         - single-output trees and
         - binary classifications.
@@ -586,7 +589,7 @@ cdef class ClassificationCriterion(Criterion):
         elif dest[0] > upper_bound:
             dest[0] = upper_bound
 
-        # Class proportions for binary classification must sum to 1.
+        # Values for binary classification must sum to 1.
         dest[1] = 1 - dest[0]
 
     cdef inline float64_t middle_value(self) noexcept nogil:
diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py
index ff0d6db5c25a5..f6492cf6a821f 100644
--- a/sklearn/tree/_export.py
+++ b/sklearn/tree/_export.py
@@ -271,14 +271,15 @@ def get_fill_color(self, tree, node_id):
                 # Find max and min values in leaf nodes for regression
                 self.colors["bounds"] = (np.min(tree.value), np.max(tree.value))
         if tree.n_outputs == 1:
-            node_val = tree.value[node_id][0, :] / tree.weighted_n_node_samples[node_id]
-            if tree.n_classes[0] == 1:
-                # Regression or degraded classification with single class
-                node_val = tree.value[node_id][0, :]
-            if isinstance(node_val, Iterable) and self.colors["bounds"] is not None:
-                # Only unpack the float only for the regression tree case.
-                # Classification tree requires an Iterable in `get_color`.
-                node_val = node_val.item()
+            node_val = tree.value[node_id][0, :]
+            if (
+                tree.n_classes[0] == 1
+                and isinstance(node_val, Iterable)
+                and self.colors["bounds"] is not None
+            ):
+                # Unpack the float only for the regression tree case.
+                # Classification tree requires an Iterable in `get_color`.
+                node_val = node_val.item()
         else:
             # If multi-output color node by impurity
             node_val = -tree.impurity[node_id]
@@ -347,9 +348,9 @@ def node_to_str(self, tree, node_id, criterion):
             node_string += str(tree.n_node_samples[node_id]) + characters[4]
 
         # Write node class distribution / regression value
-        if self.proportion and tree.n_classes[0] != 1:
+        if not self.proportion and tree.n_classes[0] != 1:
             # For classification this will show the proportion of samples
-            value = value / tree.weighted_n_node_samples[node_id]
+            value = value * tree.weighted_n_node_samples[node_id]
         if labels:
             node_string += "value = "
         if tree.n_classes[0] == 1:
@@ -1072,14 +1073,20 @@ def export_text(
 
     export_text.report = ""
 
-    def _add_leaf(value, class_name, indent):
+    def _add_leaf(value, weighted_n_node_samples, class_name, indent):
         val = ""
-        is_classification = isinstance(decision_tree, DecisionTreeClassifier)
-        if show_weights or not is_classification:
+        if isinstance(decision_tree, DecisionTreeClassifier):
+            if show_weights:
+                val = [
+                    "{1:.{0}f}, ".format(decimals, v * weighted_n_node_samples)
+                    for v in value
+                ]
+                val = "[" + "".join(val)[:-2] + "]"
+            val += " class: " + str(class_name)
+        else:
             val = ["{1:.{0}f}, ".format(decimals, v) for v in value]
             val = "[" + "".join(val)[:-2] + "]"
-        if is_classification:
-            val += " class: " + str(class_name)
         export_text.report += value_fmt.format(indent, "", val)
 
     def print_tree_recurse(node, depth):
@@ -1096,6 +1103,8 @@ def print_tree_recurse(node, depth):
         if tree_.n_classes[0] != 1 and tree_.n_outputs == 1:
             class_name = class_names[class_name]
 
+        weighted_n_node_samples = tree_.weighted_n_node_samples[node]
+
         if depth <= max_depth + 1:
             info_fmt = ""
             info_fmt_left = info_fmt
@@ -1113,11 +1122,11 @@ def print_tree_recurse(node, depth):
                 export_text.report += info_fmt_right
                 print_tree_recurse(tree_.children_right[node], depth + 1)
             else:  # leaf
-                _add_leaf(value, class_name, indent)
+                _add_leaf(value, weighted_n_node_samples, class_name, indent)
         else:
             subtree_depth = _compute_depth(tree_, node)
             if subtree_depth == 1:
-                _add_leaf(value, class_name, indent)
+                _add_leaf(value, weighted_n_node_samples, class_name, indent)
             else:
                 trunc_report = "truncated branch of depth %d" % subtree_depth
                 export_text.report += truncation_fmt.format(indent, trunc_report)
diff --git a/sklearn/tree/tests/test_monotonic_tree.py b/sklearn/tree/tests/test_monotonic_tree.py
index fe2f863d314ed..6478c2e2dfd85 100644
--- a/sklearn/tree/tests/test_monotonic_tree.py
+++ b/sklearn/tree/tests/test_monotonic_tree.py
@@ -14,6 +14,7 @@
     ExtraTreeClassifier,
     ExtraTreeRegressor,
 )
+from sklearn.utils._testing import assert_allclose
 from sklearn.utils.fixes import CSC_CONTAINERS
 
 TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier]
@@ -77,15 +78,20 @@ def test_monotonic_constraints_classifications(
     if sparse_splitter:
         X_train = csc_container(X_train)
     est.fit(X_train, y_train)
-    y = est.predict_proba(X_test)[:, 1]
+    proba_test = est.predict_proba(X_test)
+
+    assert np.logical_and(
+        proba_test >= 0.0, proba_test <= 1.0
+    ).all(), "Probability should always be in [0, 1] range."
+    assert_allclose(proba_test.sum(axis=1), 1.0)
 
     # Monotonic increase constraint, it applies to the positive class
-    assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= y)
-    assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= y)
+    assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= proba_test[:, 1])
+    assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= proba_test[:, 1])
 
     # Monotonic decrease constraint, it applies to the positive class
-    assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= y)
-    assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= y)
+    assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= proba_test[:, 1])
+    assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= proba_test[:, 1])
 
 
 @pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES)
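
Context for reviewers (not part of the patch): with this change `ClassificationCriterion.node_value` divides the class sums by `weighted_n_node_samples`, so `tree_.value` stores per-node class fractions instead of weighted counts, `predict_proba` can return the stored values without renormalizing, and the exporters multiply by `weighted_n_node_samples` when they need counts for display. The snippet below is a minimal sketch of the resulting behaviour, assuming a scikit-learn build with this diff applied; it uses only public attributes (`tree_.value`, `tree_.weighted_n_node_samples`, `predict_proba`).

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

# With this patch, each node's value row is already a probability vector
# (class fractions), not a vector of weighted sample counts.
node_values = clf.tree_.value[:, 0, :]  # shape: (n_nodes, n_classes)
assert np.allclose(node_values.sum(axis=1), 1.0)

# predict_proba now just slices the stored values; rows still sum to 1.
proba = clf.predict_proba(X)
assert np.allclose(proba.sum(axis=1), 1.0)

# Exporters (plot_tree, export_text) recover the weighted counts by
# multiplying back, which is what the _export.py changes above do.
counts = node_values * clf.tree_.weighted_n_node_samples[:, np.newaxis]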