Skip to content

MAINT Pull apart Splitter and Partitioner in the sklearn/tree code #29458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jul 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 178 additions & 0 deletions sklearn/tree/_partitioner.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# See _partitioner.pyx for details.

from ..utils._typedefs cimport (
float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
)
from ._splitter cimport SplitRecord


# Mitigate precision differences between 32 bit and 64 bit
cdef float32_t FEATURE_THRESHOLD = 1e-7


# We provide here the abstract interfact for a Partitioner that would be
# theoretically shared between the Dense and Sparse partitioners. However,
# we leave it commented out for now as it is not used in the current
# implementation due to the performance hit from vtable lookups when using
# inheritance based polymorphism. It is left here for future reference.
#
# Note: Instead, in `_splitter.pyx`, we define a fused type that can be used
# to represent both the dense and sparse partitioners.
#
# cdef class BasePartitioner:
# cdef intp_t[::1] samples
# cdef float32_t[::1] feature_values
# cdef intp_t start
# cdef intp_t end
# cdef intp_t n_missing
# cdef const uint8_t[::1] missing_values_in_feature_mask

# cdef void sort_samples_and_feature_values(
# self, intp_t current_feature
# ) noexcept nogil
# cdef void init_node_split(
# self,
# intp_t start,
# intp_t end
# ) noexcept nogil
# cdef void find_min_max(
# self,
# intp_t current_feature,
# float32_t* min_feature_value_out,
# float32_t* max_feature_value_out,
# ) noexcept nogil
# cdef void next_p(
# self,
# intp_t* p_prev,
# intp_t* p
# ) noexcept nogil
# cdef intp_t partition_samples(
# self,
# float64_t current_threshold
# ) noexcept nogil
# cdef void partition_samples_final(
# self,
# intp_t best_pos,
# float64_t best_threshold,
# intp_t best_feature,
# intp_t n_missing,
# ) noexcept nogil


cdef class DensePartitioner:
"""Partitioner specialized for dense data.

Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
cdef const float32_t[:, :] X
cdef intp_t[::1] samples
cdef float32_t[::1] feature_values
cdef intp_t start
cdef intp_t end
cdef intp_t n_missing
cdef const uint8_t[::1] missing_values_in_feature_mask

cdef void sort_samples_and_feature_values(
self, intp_t current_feature
) noexcept nogil
cdef void init_node_split(
self,
intp_t start,
intp_t end
) noexcept nogil
cdef void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil
cdef void next_p(
self,
intp_t* p_prev,
intp_t* p
) noexcept nogil
cdef intp_t partition_samples(
self,
float64_t current_threshold
) noexcept nogil
cdef void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t n_missing,
) noexcept nogil


cdef class SparsePartitioner:
"""Partitioner specialized for sparse CSC data.

Note that this partitioner is agnostic to the splitting strategy (best vs. random).
"""
cdef const float32_t[::1] X_data
cdef const int32_t[::1] X_indices
cdef const int32_t[::1] X_indptr
cdef intp_t n_total_samples
cdef intp_t[::1] index_to_samples
cdef intp_t[::1] sorted_samples
cdef intp_t start_positive
cdef intp_t end_negative
cdef bint is_samples_sorted

cdef intp_t[::1] samples
cdef float32_t[::1] feature_values
cdef intp_t start
cdef intp_t end
cdef intp_t n_missing
cdef const uint8_t[::1] missing_values_in_feature_mask

cdef void sort_samples_and_feature_values(
self, intp_t current_feature
) noexcept nogil
cdef void init_node_split(
self,
intp_t start,
intp_t end
) noexcept nogil
cdef void find_min_max(
self,
intp_t current_feature,
float32_t* min_feature_value_out,
float32_t* max_feature_value_out,
) noexcept nogil
cdef void next_p(
self,
intp_t* p_prev,
intp_t* p
) noexcept nogil
cdef intp_t partition_samples(
self,
float64_t current_threshold
) noexcept nogil
cdef void partition_samples_final(
self,
intp_t best_pos,
float64_t best_threshold,
intp_t best_feature,
intp_t n_missing,
) noexcept nogil

cdef void extract_nnz(
self,
intp_t feature
) noexcept nogil
cdef intp_t _partition(
self,
float64_t threshold,
intp_t zero_pos
) noexcept nogil


cdef void shift_missing_values_to_left_if_required(
SplitRecord* best,
intp_t[::1] samples,
intp_t end,
) noexcept nogil
Loading