ENH Adds missing value support for trees #23595

Merged
109 commits, merged May 4, 2023
Commits
604ad0b
ENH Adds missing value support for trees
thomasjpfan Jun 12, 2022
f7cb581
DOC Adds whats new number
thomasjpfan Jun 12, 2022
db242b0
REV Reduce diff
thomasjpfan Jun 12, 2022
6bffbd0
DOC Adds whats new number
thomasjpfan Jun 12, 2022
fb90bcb
DOC Fixes docstring
thomasjpfan Jun 12, 2022
05799f7
CLN Less diff
thomasjpfan Jun 23, 2022
aaafafb
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Jun 23, 2022
986cf4a
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Sep 28, 2022
a3edade
CLN Refactor code into a shared function
thomasjpfan Sep 28, 2022
824c191
CLN Remove unneeded code
thomasjpfan Sep 28, 2022
0d3ea36
DOC Adds more comments about order
thomasjpfan Sep 28, 2022
f062fc0
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Sep 28, 2022
576fdd3
FIX Fixes bagging tests
thomasjpfan Sep 28, 2022
ae8cce2
DOC Adds more comments to sorting code
thomasjpfan Sep 30, 2022
30084aa
DOC Update comments to be more clear
thomasjpfan Sep 30, 2022
13cd441
DOC Adds more comments explaining the logic
thomasjpfan Sep 30, 2022
ca9e27e
DOC Adds comments for the second time
thomasjpfan Sep 30, 2022
2eac79f
DOC Spelling mistake
thomasjpfan Sep 30, 2022
5a07c2c
DOC Clarify comment
thomasjpfan Sep 30, 2022
a42a2f6
DOC Clarify comment again
thomasjpfan Sep 30, 2022
3a05ce5
CLN Use current_end instead of j
thomasjpfan Oct 9, 2022
0167d32
Update doc/modules/tree.rst
thomasjpfan Oct 25, 2022
7c4cea2
DOC More details about splitter
thomasjpfan Oct 25, 2022
7db3826
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Oct 25, 2022
dc65406
STY Word wrapping
thomasjpfan Oct 25, 2022
a420edc
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Dec 7, 2022
cc1ca9f
FIX Fixes merge with main
thomasjpfan Dec 7, 2022
a9c9ead
DOC Adds more comments
thomasjpfan Dec 7, 2022
2560acc
DOC Use methods
thomasjpfan Dec 7, 2022
74b506d
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Dec 9, 2022
ce9cce4
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Dec 13, 2022
c607c5b
ENH Use memset directly
thomasjpfan Dec 13, 2022
282f4c4
ENH Init sum_missing only when required
thomasjpfan Dec 13, 2022
bccc133
CLN Address comments
thomasjpfan Dec 22, 2022
ffbaab6
CLN Remove mention of mae
thomasjpfan Dec 22, 2022
177ef51
DOC Update comment
thomasjpfan Dec 22, 2022
ed963f0
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Jan 17, 2023
c530e61
ENH Simplify split
thomasjpfan Jan 17, 2023
e31563f
CLN Improves API for n_missing
thomasjpfan Jan 17, 2023
f5adadf
CLN Shift left
thomasjpfan Jan 17, 2023
0a8572f
DOC Adds comments on how splitting works
thomasjpfan Jan 17, 2023
d581246
CLN Simplify move sums
thomasjpfan Jan 17, 2023
94bc4b7
CLN Simplifying
thomasjpfan Jan 17, 2023
cf4bbdb
ENH Uses Cython to accerlation any_isnan
thomasjpfan Jan 17, 2023
865d5b1
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Jan 17, 2023
1f91bf3
DOC Move to 1.3
thomasjpfan Jan 17, 2023
9a287b4
CLN Update docstrings
thomasjpfan Jan 17, 2023
22172a9
TST Use array
thomasjpfan Jan 17, 2023
0f6c09e
DOC Adds docstring regarding bagging estimator
thomasjpfan Jan 17, 2023
1a26458
CLN Improves variable names
thomasjpfan Jan 17, 2023
ad55632
CLN Better variables
thomasjpfan Jan 17, 2023
23164dc
CLN Move shifting missing values to a function call
thomasjpfan Jan 23, 2023
779a625
CLN Use variable for has_missing
thomasjpfan Jan 23, 2023
f822375
CLN Simplify loop
thomasjpfan Jan 23, 2023
93ce545
CLN Remove unused variable
thomasjpfan Jan 24, 2023
df7c2a1
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Jan 30, 2023
2df3d95
CLN Improves logic for final partitioner
thomasjpfan Jan 30, 2023
6007a3a
TST Adds new check with missing values
thomasjpfan Jan 30, 2023
1a8616e
CI Fix tests
thomasjpfan Jan 30, 2023
2161ab5
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Feb 6, 2023
c8cb9c6
TST Adds test on the predictive performance of missing values
thomasjpfan Feb 6, 2023
748d1da
Update doc/whats_new/v1.3.rst
thomasjpfan Feb 9, 2023
d025dcc
CLN Address comments
thomasjpfan Feb 9, 2023
72c9c80
CLN Address comments
thomasjpfan Feb 9, 2023
bbc7c86
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Feb 9, 2023
c3512f3
FIX Fixes build
thomasjpfan Feb 11, 2023
12317f0
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Feb 24, 2023
a0ae427
FIX Fixes merge bugs
thomasjpfan Feb 24, 2023
0ad1837
FIX Fix regression
thomasjpfan Feb 24, 2023
1b121b1
FIX Fix regression
thomasjpfan Feb 24, 2023
941b557
FIX Fixes merge issue
thomasjpfan Feb 25, 2023
ad5e198
FIX Fixes missing value algo
thomasjpfan Feb 25, 2023
0c01aeb
CLN Fix regression with criterion
thomasjpfan Mar 24, 2023
b390e25
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Mar 24, 2023
93ecff0
CLN Place edge case into it's own if statement
thomasjpfan Mar 24, 2023
e2e4169
CLN Compute missing mask and pass around
thomasjpfan Mar 24, 2023
cbe6185
CLN Simplify code
thomasjpfan Mar 25, 2023
1a84a61
CLN Reduce diff
thomasjpfan Mar 25, 2023
2cb8f73
CLN Reduce diff
thomasjpfan Mar 25, 2023
f38fe48
CLN Address comments
thomasjpfan Mar 28, 2023
20c3f51
CLN Adjust tags
thomasjpfan Mar 28, 2023
76a5f5e
CLN Remove isanynan
thomasjpfan Mar 28, 2023
1bc7e46
Apply suggestions from code review
thomasjpfan Mar 28, 2023
7f399ed
DOC Update user guide for ties
thomasjpfan Mar 28, 2023
947aae5
CLN Rename to missing_mask_by_feature
thomasjpfan Mar 28, 2023
3d84774
CLN Remove unneeded comment
thomasjpfan Mar 28, 2023
91c58a2
DOC Update comment
thomasjpfan Mar 28, 2023
a23e2d7
Apply suggestions from code review
thomasjpfan Mar 28, 2023
23dea83
FIX Include missing import
thomasjpfan Mar 28, 2023
4f421a5
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Mar 28, 2023
6aca4d0
CLN Address comments
thomasjpfan Mar 28, 2023
0e393e9
DOC Add doctests in the Missing Values Support section
jjerphan Mar 29, 2023
103cad2
fixup! DOC Add doctests in the Missing Values Support section
jjerphan Mar 29, 2023
3361c25
Update sklearn/tree/_classes.py
thomasjpfan Mar 29, 2023
57a6379
Apply suggestions from code review
thomasjpfan Mar 29, 2023
d94d650
STY Linting
thomasjpfan Mar 29, 2023
61d2fc7
CLN Strict check
thomasjpfan Mar 29, 2023
af4264f
DOC Adds docstring about missing values
thomasjpfan Mar 29, 2023
d12f336
CLN Improve logic for final split
thomasjpfan Mar 29, 2023
f39d63a
DOC Improve docstring
thomasjpfan Mar 29, 2023
fffc16c
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Apr 7, 2023
4f1532e
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Apr 21, 2023
40a39e4
CLN Removes variable name
thomasjpfan Apr 21, 2023
649f8c2
STY Cython lint
thomasjpfan Apr 23, 2023
98a1fe4
STY Consistent comment spacing
thomasjpfan Apr 26, 2023
e047412
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan Apr 26, 2023
042e86a
CLN Address comments
thomasjpfan May 3, 2023
5691677
DOC Adds more comments
thomasjpfan May 3, 2023
8c5755a
Merge remote-tracking branch 'upstream/main' into missing_values_tree…
thomasjpfan May 3, 2023
59 changes: 59 additions & 0 deletions doc/modules/tree.rst
@@ -572,6 +572,65 @@ Mean Absolute Error:

Note that it fits much slower than the MSE criterion.

.. _tree_missing_value_support:

Missing Values Support
======================

:class:`~tree.DecisionTreeClassifier` and :class:`~tree.DecisionTreeRegressor`
have built-in support for missing values when `splitter='best'` and the
criterion is `'gini'`, `'entropy'`, or `'log_loss'` for classification, or
`'squared_error'`, `'friedman_mse'`, or `'poisson'` for regression.

For each potential threshold on the non-missing data, the splitter will evaluate
the split with all the missing values going to the left node or the right node.
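
For instance, here is a pure-Python sketch of this double evaluation (an
editorial illustration only; the actual splitter is implemented in Cython and
operates on presorted feature values)::

    import numpy as np

    def gini(y):
        # Gini impurity of an integer label array; an empty node contributes 0.
        if len(y) == 0:
            return 0.0
        p = np.bincount(y) / len(y)
        return 1.0 - np.sum(p ** 2)

    def evaluate_threshold(x, y, threshold):
        # Score one candidate threshold twice: once with the missing values
        # sent to the left child, once with them sent to the right child.
        y = np.asarray(y)
        missing = np.isnan(x)
        goes_left = ~missing & (x <= threshold)
        scores = {}
        for name, left in (("missing_to_left", goes_left | missing),
                           ("missing_to_right", goes_left)):
            right = ~left
            scores[name] = (left.sum() * gini(y[left])
                            + right.sum() * gini(y[right])) / len(y)
        return scores  # the splitter keeps whichever direction scores lower

With ``x = np.array([0, 1, 6, np.nan])``, ``y = [0, 0, 1, 1]``, and
``threshold = 3.5``, sending the missing value to the right child yields an
impurity of 0 while sending it left does not, which is consistent with the
first example below.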

Decisions are made as follows:

- By default when predicting, the samples with missing values are classified
with the class used in the split found during training::

>>> from sklearn.tree import DecisionTreeClassifier
>>> import numpy as np

>>> X = np.array([0, 1, 6, np.nan]).reshape(-1, 1)
>>> y = [0, 0, 1, 1]

>>> tree = DecisionTreeClassifier(random_state=0).fit(X, y)
>>> tree.predict(X)
array([0, 0, 1, 1])

- If the criterion evaluation is the same for both nodes, the tie for
missing values at predict time is broken by going to the right node. The
splitter also evaluates the split where all the missing values go to one
child and all the non-missing values go to the other::

>>> from sklearn.tree import DecisionTreeClassifier
>>> import numpy as np

>>> X = np.array([np.nan, -1, np.nan, 1]).reshape(-1, 1)
>>> y = [0, 0, 1, 1]

>>> tree = DecisionTreeClassifier(random_state=0).fit(X, y)

>>> X_test = np.array([np.nan]).reshape(-1, 1)
>>> tree.predict(X_test)
array([1])

- If no missing values are seen during training for a given feature, then during
prediction missing values are mapped to the child with the most samples::

>>> from sklearn.tree import DecisionTreeClassifier
>>> import numpy as np

>>> X = np.array([0, 1, 2, 3]).reshape(-1, 1)
>>> y = [0, 1, 1, 1]

>>> tree = DecisionTreeClassifier(random_state=0).fit(X, y)

>>> X_test = np.array([np.nan]).reshape(-1, 1)
>>> tree.predict(X_test)
array([1])

.. _minimal_cost_complexity_pruning:

6 changes: 6 additions & 0 deletions doc/whats_new/v1.3.rst
@@ -486,6 +486,12 @@ Changelog
:mod:`sklearn.tree`
...................

- |MajorFeature| :class:`tree.DecisionTreeRegressor` and
:class:`tree.DecisionTreeClassifier` support missing values when
`splitter='best'` and the criterion is `gini`, `entropy`, or `log_loss`
for classification, or `squared_error`, `friedman_mse`, or `poisson`
for regression. :pr:`23595` by `Thomas Fan`_.

- |Enhancement| Adds a `class_names` parameter to
:func:`tree.export_text`. This allows specifying the parameter `class_names`
for each target class in ascending numerical order.
92 changes: 85 additions & 7 deletions sklearn/tree/_classes.py
@@ -34,6 +34,8 @@
from ..utils import Bunch
from ..utils import check_random_state
from ..utils.validation import _check_sample_weight
from ..utils.validation import assert_all_finite
from ..utils.validation import _assert_all_finite_element_wise
from ..utils import compute_sample_weight
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
@@ -48,6 +50,7 @@
from ._tree import _build_pruned_tree_ccp
from ._tree import ccp_pruning_path
from . import _tree, _splitter, _criterion
from ._utils import _any_isnan_axis0

__all__ = [
"DecisionTreeClassifier",
@@ -174,19 +177,67 @@ def get_n_leaves(self):
check_is_fitted(self)
return self.tree_.n_leaves

def fit(self, X, y, sample_weight=None, check_input=True):
def _support_missing_values(self, X):
return not issparse(X) and self._get_tags()["allow_nan"]
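
# Editorial note: missing-value support is limited to dense inputs; for
# sparse X this returns False even when the estimator's allow_nan tag is set.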

def _compute_feature_has_missing(self, X):
"""Return boolean mask denoting if there are missing values for each feature.

This method also ensures that X is finite.

Parameters
----------
X : array-like of shape (n_samples, n_features), dtype=DOUBLE
Input data.

Returns
-------
feature_has_missing : ndarray of shape (n_features,), or None
Missing value mask. If missing values are not supported or there
are no missing values, return None.
"""
common_kwargs = dict(estimator_name=self.__class__.__name__, input_name="X")

if not self._support_missing_values(X):
assert_all_finite(X, **common_kwargs)
return None

with np.errstate(over="ignore"):
overall_sum = np.sum(X)

if not np.isfinite(overall_sum):
# Raise a ValueError in case of the presence of an infinite element.
_assert_all_finite_element_wise(X, xp=np, allow_nan=True, **common_kwargs)

# If the sum is not nan, then there are no missing values
if not np.isnan(overall_sum):
return None

feature_has_missing = _any_isnan_axis0(X)
return feature_has_missing
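
# Editorial sketch: `_any_isnan_axis0` (imported above from the Cython
# module `._utils`) computes the same per-feature mask as the pure-NumPy
# expression
#
#     np.isnan(X).any(axis=0)
#
# e.g. X = [[0., nan], [1., 2.]] -> feature_has_missing == [False, True]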

def _fit(
self, X, y, sample_weight=None, check_input=True, feature_has_missing=None
):
self._validate_params()
random_state = check_random_state(self.random_state)

if check_input:
# Need to validate separately here.
# We can't pass multi_output=True because that would allow y to be
# csr.
check_X_params = dict(dtype=DTYPE, accept_sparse="csc")

# _compute_feature_has_missing will check for finite values and
# compute the missing mask if the tree supports missing values
check_X_params = dict(
dtype=DTYPE, accept_sparse="csc", force_all_finite=False
)
check_y_params = dict(ensure_2d=False, dtype=None)
X, y = self._validate_data(
X, y, validate_separately=(check_X_params, check_y_params)
)

feature_has_missing = self._compute_feature_has_missing(X)
if issparse(X):
X.sort_indices()

@@ -381,7 +432,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
self.min_impurity_decrease,
)

builder.build(self.tree_, X, y, sample_weight)
builder.build(self.tree_, X, y, sample_weight, feature_has_missing)

if self.n_outputs_ == 1 and is_classifier(self):
self.n_classes_ = self.n_classes_[0]
@@ -394,7 +445,17 @@
def _validate_X_predict(self, X, check_input):
"""Validate the training data on predict (probabilities)."""
if check_input:
X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
if self._support_missing_values(X):
force_all_finite = "allow-nan"
else:
force_all_finite = True
X = self._validate_data(
X,
dtype=DTYPE,
accept_sparse="csr",
reset=False,
force_all_finite=force_all_finite,
)
if issparse(X) and (
X.indices.dtype != np.intc or X.indptr.dtype != np.intc
):
@@ -886,7 +947,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
Fitted estimator.
"""

super().fit(
super()._fit(
X,
y,
sample_weight=sample_weight,
@@ -971,7 +1032,14 @@ def predict_log_proba(self, X):
return proba

def _more_tags(self):
return {"multilabel": True}
# XXX: NaN is only supported for dense arrays, but we set this flag for the
# common tests to pass, specifically: check_estimators_nan_inf
allow_nan = self.splitter == "best" and self.criterion in {
"gini",
"log_loss",
"entropy",
}
return {"multilabel": True, "allow_nan": allow_nan}
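
# Editorial sketch of the consequence (assuming scikit-learn's private
# _get_tags API): only splitter="best" combined with one of the criteria
# listed above advertises NaN support, e.g.
#
#     DecisionTreeClassifier()._get_tags()["allow_nan"]                   # True
#     DecisionTreeClassifier(splitter="random")._get_tags()["allow_nan"]  # False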


class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
@@ -1239,7 +1307,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
Fitted estimator.
"""

super().fit(
super()._fit(
X,
y,
sample_weight=sample_weight,
@@ -1274,6 +1342,16 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
)
return averaged_predictions

def _more_tags(self):
# XXX: NaN is only supported for dense arrays, but we set this flag for the
# common tests to pass, specifically: check_estimators_nan_inf
allow_nan = self.splitter == "best" and self.criterion in {
"squared_error",
"friedman_mse",
"poisson",
}
return {"allow_nan": allow_nan}


class ExtraTreeClassifier(DecisionTreeClassifier):
"""An extremely randomized tree classifier.
19 changes: 13 additions & 6 deletions sklearn/tree/_criterion.pxd
@@ -28,6 +28,8 @@ cdef class Criterion:
cdef SIZE_t start # samples[start:pos] are the samples in the left node
cdef SIZE_t pos # samples[pos:end] are the samples in the right node
cdef SIZE_t end
cdef SIZE_t n_missing # Number of missing values for the feature being evaluated
cdef bint missing_go_to_left # Whether missing values go to the left node

cdef SIZE_t n_outputs # Number of outputs
cdef SIZE_t n_samples # Number of samples
@@ -36,6 +38,7 @@
cdef double weighted_n_node_samples # Weighted number of samples in the node
cdef double weighted_n_left # Weighted number of samples in the left node
cdef double weighted_n_right # Weighted number of samples in the right node
cdef double weighted_n_missing # Weighted number of samples that are missing

# The criterion object is maintained such that left and right collected
# statistics correspond to samples[start:pos] and samples[pos:end].
@@ -50,6 +53,8 @@
SIZE_t start,
SIZE_t end
) except -1 nogil
cdef void init_sum_missing(self)
cdef void init_missing(self, SIZE_t n_missing) noexcept nogil
cdef int reset(self) except -1 nogil
cdef int reverse_reset(self) except -1 nogil
cdef int update(self, SIZE_t new_pos) except -1 nogil
@@ -77,15 +82,17 @@
cdef SIZE_t[::1] n_classes
cdef SIZE_t max_n_classes

cdef double[:, ::1] sum_total # The sum of the weighted count of each label.
cdef double[:, ::1] sum_left # Same as above, but for the left side of the split
cdef double[:, ::1] sum_right # Same as above, but for the right side of the split
cdef double[:, ::1] sum_total # The sum of the weighted count of each label.
cdef double[:, ::1] sum_left # Same as above, but for the left side of the split
cdef double[:, ::1] sum_right # Same as above, but for the right side of the split
cdef double[:, ::1] sum_missing # Same as above, but for missing values in X

cdef class RegressionCriterion(Criterion):
"""Abstract regression criterion."""

cdef double sq_sum_total

cdef double[::1] sum_total # The sum of w*y.
cdef double[::1] sum_left # Same as above, but for the left side of the split
cdef double[::1] sum_right # Same as above, but for the right side of the split
cdef double[::1] sum_total # The sum of w*y.
cdef double[::1] sum_left # Same as above, but for the left side of the split
cdef double[::1] sum_right # Same as above, but for the right side of the split
cdef double[::1] sum_missing # Same as above, but for missing values in X
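
To see how these declarations fit together, here is a pure-Python sketch (an
editorial illustration, not the Cython implementation) of how the accumulated
missing-value statistics are credited to one child once `missing_go_to_left`
has been decided for the chosen split::

    def move_missing_to_child(sum_left, sum_right, sum_missing,
                              weighted_n_left, weighted_n_right,
                              weighted_n_missing, missing_go_to_left):
        # Credit the per-feature missing-value sums and sample weights to the
        # child that the chosen split sends the missing samples to.
        if missing_go_to_left:
            sum_left = sum_left + sum_missing
            weighted_n_left += weighted_n_missing
        else:
            sum_right = sum_right + sum_missing
            weighted_n_right += weighted_n_missing
        return sum_left, sum_right, weighted_n_left, weighted_n_right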