diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index b7d3d1f4d86a6..4e1e53933ccb4 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -232,6 +232,7 @@ cdef class DistanceMetric: # metric mappings # These map from metric id strings to class names METRIC_MAPPING{{name_suffix}} = { + 'precomputed': PrecomputedDistanceMatrix{{name_suffix}}, 'euclidean': EuclideanDistance{{name_suffix}}, 'l2': EuclideanDistance{{name_suffix}}, 'minkowski': MinkowskiDistance{{name_suffix}}, @@ -359,13 +360,17 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): **User-defined distance:** - =========== =============== ======= - identifier class name args - ----------- --------------- ------- - "pyfunc" PyFuncDistance func - =========== =============== ======= + ============= ========================= =========== + identifier class name args + ------------- ------------------------- ----------- + "precomputed" PrecomputedDistanceMatrix precomputed + "pyfunc" PyFuncDistance func + ============= ========================= =========== - Here ``func`` is a function which takes two one-dimensional numpy + "precomputed" indicates that the pairwise distances have already been + computed and that the distance matrix itself is passed as the input. + + ``func`` is a function which takes two one-dimensional numpy arrays, and returns a distance. Note that in order to be used within the BallTree, the distance must be a true metric: i.e. 
it must satisfy the following properties diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp index 51fb745dca784..2f2d8e03c27a0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -13,6 +13,7 @@ from ._classmode cimport WeightingStrategy {{for name_suffix in ["32", "64"]}} from ._argkmin cimport ArgKmin{{name_suffix}} from ._datasets_pair cimport DatasetsPair{{name_suffix}} +from ._datasets_pair cimport PrecomputedDistanceMatrix{{name_suffix}} cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): """ diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 9578129993c37..3b8ae51b970fd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -20,8 +20,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: Implementations inherit from this template and may override the several defined hooks as needed in order to easily extend functionality with minimal redundant code. - """ - + """ cdef: readonly DatasetsPair{{name_suffix}} datasets_pair diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 2bbfd74e2c2c3..56e27b0b6b2e0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -127,6 +127,9 @@ cdef class BaseDistancesReduction{{name_suffix}}: Implementations inherit from this template and may override the several defined hooks as needed in order to easily extend functionality with minimal redundant code. 
+ + If metric is 'precomputed' and the precomputed matrix is provided, + a subclass must be able to access it through the compute method. """ def __init__( @@ -137,7 +140,6 @@ cdef class BaseDistancesReduction{{name_suffix}}: ): cdef: intp_t X_n_full_chunks, Y_n_full_chunks - if chunk_size is None: chunk_size = get_config().get("pairwise_dist_chunk_size", 256) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index 1e57b3291a8f4..fe91bf088a8c5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -29,6 +29,11 @@ cdef class DatasetsPair{{name_suffix}}: cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil +cdef class PrecomputedDistanceMatrix{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] distance_matrix + + cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:, ::1] X diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 2c3ca44047145..3a0e3f27576f6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -53,8 +53,8 @@ cdef class DatasetsPair{{name_suffix}}: @classmethod def get_for( cls, - X, - Y, + X=None, + Y=None, metric="euclidean", dict metric_kwargs=None, ) -> DatasetsPair{{name_suffix}}: @@ -98,6 +98,9 @@ cdef class DatasetsPair{{name_suffix}}: metric_kwargs = copy.copy(metric_kwargs) metric_kwargs.pop("X_norm_squared", None) metric_kwargs.pop("Y_norm_squared", None) + if metric == "precomputed": + return PrecomputedDistanceMatrix{{name_suffix}}(X) + cdef: {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( metric, @@ 
-158,6 +161,43 @@ cdef class DatasetsPair{{name_suffix}}: # TODO: add "with gil: raise" here when supporting Cython 3.0 return -1 + +@final +cdef class PrecomputedDistanceMatrix{{name_suffix}}(DatasetsPair{{name_suffix}}): + """A subclass of DatasetsPair + + Parameters: must receive precomputed_distance: ndarray of shape + (n_samples_X, n_samples_Y), + Must be C-contiguous. + """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] precomputed_distance, + ): + super().__init__( + distance_metric=DistanceMetric{{name_suffix}}(), + n_features=0, + ) + # This array has already been checked. + self.distance_matrix = precomputed_distance + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.distance_matrix.shape[0] + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.distance_matrix.shape[1] + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_matrix[i, j] + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_matrix[i, j] + @final cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): """Compute distances between row vectors of two arrays. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index d8307cbe84eaa..14cee8d50013a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -81,10 +81,12 @@ def valid_metrics(cls) -> List[str]: "hamming", *BOOL_METRICS, } - return sorted(({"sqeuclidean"} | set(METRIC_MAPPING64.keys())) - excluded) + return sorted( + ({"sqeuclidean", "precomputed"} | set(METRIC_MAPPING64.keys())) - excluded + ) @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: + def is_usable_for(cls, X=None, Y=None, metric="euclidean") -> bool: """Return True if the dispatcher can be used for the given parameters. 
@@ -96,6 +98,8 @@ def is_usable_for(cls, X, Y, metric) -> bool: Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) Input data. + Must be None if metric="precomputed" (X is then the distance matrix). + metric : str, default='euclidean' The distance metric to use. For a list of available metrics, see the documentation of @@ -105,7 +109,15 @@ def is_usable_for(cls, X, Y, metric) -> bool: ------- True if the dispatcher can be used, else False. """ - + if metric == "precomputed": + # With a precomputed metric, X carries the distance matrix + # itself, so a second dataset Y must not be given. + is_usable = X is not None and Y is None + + # Bail out early: the checks below assume two feature datasets + # and do not apply to an invalid precomputed input. + if not is_usable: + return False # FIXME: the current Cython implementation is too slow for a large number of # features. We temporarily disable it to fallback on SciPy's implementation. # See: https://github.com/scikit-learn/scikit-learn/issues/28191 @@ -188,9 +200,9 @@ class ArgKmin(BaseDistancesReductionDispatcher): @classmethod def compute( cls, - X, - Y, - k, + X=None, + Y=None, + k=None, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -277,6 +289,25 @@ def compute( for the concrete implementation are therefore freed when this classmethod returns. """ + """ + if X is None and Y is None and precomputed_matrix is None: + raise ValueError("Either X and Y or precomputed_matrix must be provided.") + elif X is not None and Y is not None and precomputed_matrix is not None: + raise ValueError( + "Only one of X and Y or precomputed_matrix must be provided." 
+ ) + elif X is None and Y is not None: + raise ValueError("Y should not be provided without X.") + elif X is not None and Y is None: + raise ValueError("X should not be provided without Y.") + """ + + if metric == "precomputed": + if X is None: + raise ValueError("X should be provided as a precomputed value") + if Y is not None: + raise ValueError("Y should not be provided as a precomputed value") + if X.dtype == Y.dtype == np.float64: return ArgKmin64.compute( X=X, @@ -326,9 +357,9 @@ class RadiusNeighbors(BaseDistancesReductionDispatcher): @classmethod def compute( cls, - X, - Y, - radius, + X=None, + Y=None, + radius=None, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -421,6 +452,24 @@ def compute( for the concrete implementation are therefore freed when this classmethod returns. """ + """ + if X is None and Y is None and precomputed is None: + raise ValueError("Either X and Y or precomputed must be provided.") + elif X is not None and Y is not None and precomputed is not None: + raise ValueError("Only one of X and Y or precomputed must be provided.") + elif X is None and Y is not None: + raise ValueError("Y should not be provided without X.") + elif X is not None and Y is None: + raise ValueError("X should not be provided without Y.") + elif precomputed: + return precomputed + """ + if metric == "precomputed": + if X is None: + raise ValueError("X should be provided as a precomputed value") + if Y is not None: + raise ValueError("Y should not be provided as a precomputed value") + if X.dtype == Y.dtype == np.float64: return RadiusNeighbors64.compute( X=X, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index d0567f2ead804..64c4b7b730833 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -101,7 +101,7 @@ cdef class 
RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Fall back on a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. pda = RadiusNeighbors{{name_suffix}}( - datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric=metric, metric_kwargs=metric_kwargs), radius=radius, chunk_size=chunk_size, strategy=strategy, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp index 0a9b22251843e..7e62b3d77a2fc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp @@ -48,7 +48,7 @@ cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix} # Use a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. pda = RadiusNeighborsClassMode{{name_suffix}}( - datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric=metric, metric_kwargs=metric_kwargs), radius=radius, chunk_size=chunk_size, strategy=strategy, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index af055a2091790..4644e5e78efda 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -39,7 +39,6 @@ "seuclidean", ] - def _get_metric_params_list(metric: str, n_features: int, seed: int = 1): """Return list of dummy DistanceMetric kwargs for tests.""" @@ -64,7 +63,6 @@ def _get_metric_params_list(metric: str, n_features: int, seed: int = 1): # In those cases, no kwargs is needed. 
return [{}] - def assert_same_distances_for_common_neighbors( query_idx, dist_row_a, @@ -102,6 +100,38 @@ def assert_same_distances_for_common_neighbors( f" rtol={rtol})" ) from e +def assert_precomputed(precomputed, n_samples_X, n_samples_Y): + """ + Validates a precomputed matrix for compatibility. + + Parameters: + precomputed (np.ndarray): The precomputed matrix to validate. + n_samples_X (int): The expected number of rows in the matrix. + n_samples_Y (int): The expected number of columns in the matrix. + + Raises: + AssertionError: If the input is not valid. + """ + # Check if the input is a numpy array + if not isinstance(precomputed, np.ndarray): + raise AssertionError("Input must be a numpy array.") + + # Check if the array has the correct data type + if precomputed.dtype not in [np.float32, np.float64]: + raise AssertionError("Precomputed matrix must be of type float (float32 or float64).") + + # Check if the array is empty + if precomputed.size == 0: + raise AssertionError("Precomputed matrix should not be empty.") + + # Check if the dimensions match the expected shape + expected_shape = (n_samples_X, n_samples_Y) + if precomputed.shape != expected_shape: + raise AssertionError( + f"Incorrect dimensions for precomputed matrix. " + f"Expected: {expected_shape}, Got: {precomputed.shape}." + ) + def assert_no_missing_neighbors( query_idx, @@ -109,8 +139,7 @@ def assert_no_missing_neighbors( dist_row_b, indices_row_a, indices_row_b, - threshold, -): + threshold): """Compare the indices of neighbors in two results sets. 
Any neighbor index with a distance below the precision threshold should @@ -241,7 +270,6 @@ def _non_trivial_radius( sampled_dists.sort(axis=1) return sampled_dists[:, expected_n_neighbors].mean() - def assert_compatible_radius_results( neighbors_dists_a, neighbors_dists_b, @@ -347,6 +375,66 @@ def assert_compatible_radius_results( ): partial(assert_compatible_radius_results, **FLOAT32_TOLS), } +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_all_inputs_none(cls): + """Test that ValueError is raised when all inputs are None.""" + with pytest.raises(ValueError, match="Either X and Y or precomputed_matrix must be provided."): + cls.compute(X=None, Y=None, precomputed_matrix=None) + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_all_inputs_provided(cls): + """Test that ValueError is raised when both X/Y and precomputed_matrix are provided.""" + X = np.random.rand(10, 5) + Y = np.random.rand(10, 5) + precomputed_matrix = np.random.rand(10, 10) + with pytest.raises(ValueError, match="Only one of X and Y or precomputed_matrix must be provided."): + cls.compute(X=X, Y=Y, precomputed_matrix=precomputed_matrix) + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_only_y(cls): + """Test that ValueError is raised when only Y is provided.""" + Y = np.random.rand(10, 5) + with pytest.raises(ValueError, match="Y should not be provided without X."): + cls.compute(X=None, Y=Y) + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_only_x(cls): + """Test that ValueError is raised when only X is provided.""" + X = np.random.rand(10, 5) + with pytest.raises(ValueError, match="X should not be provided without Y."): + cls.compute(X=X, Y=None) + +def test_assert_precomputed(): + # Success Case: Valid precomputed matrix + n_samples_X, n_samples_Y = 5, 5 + + # Failure Case: Not a numpy array + with pytest.raises(AssertionError, match="Input must be a numpy 
array"): + assert_precomputed([[1, 2], [3, 4]], n_samples_X, n_samples_Y) + + # Failure Case: Incorrect dtype + invalid_dtype = np.random.randint(0, 10, (n_samples_X, n_samples_Y)) + with pytest.raises(AssertionError, match="Precomputed matrix must be of type float"): + assert_precomputed(invalid_dtype, n_samples_X, n_samples_Y) + + # Failure Case: Empty array + with pytest.raises(AssertionError, match="Precomputed matrix should not be empty"): + assert_precomputed(np.array([]), n_samples_X, n_samples_Y) + + # Failure Case: Incorrect dimensions + incorrect_shape = np.random.rand(n_samples_X, n_samples_X).astype(np.float32) + with pytest.raises(AssertionError, match="Incorrect dimensions for precomputed matrix"): + assert_precomputed(incorrect_shape, n_samples_X, n_samples_Y) + +def test_my_function_precomputed(): + X = np.array([[1, 2], [3, 4], [5, 6]]) # Small sample data + Y = np.array([[7, 8], [9, 10]]) + D = pairwise_distances(X, Y) # Compute distances ONCE + + result_precomputed = pairwise_distances(D, metric='precomputed') + result_computed = pairwise_distances(X, Y) + + np.testing.assert_allclose(result_precomputed, result_computed) def test_assert_compatible_argkmin_results(): atol = 1e-7 @@ -485,8 +573,7 @@ def test_assert_compatible_argkmin_results(): np.array([[2, 1, 4, 5, 3]]), **tols, ) - - + @pytest.mark.parametrize("check_sorted", [True, False]) def test_assert_compatible_radius_results(check_sorted): atol = 1e-7 @@ -1622,7 +1709,7 @@ def test_radius_neighbors_classmode_strategy_consistent(outlier_label): X=X, Y=Y, radius=radius, - metric=metric, + metric=metric, weights=weights, Y_labels=Y_labels, unique_Y_labels=unique_Y_labels,