From d15aa5b4eae1a1c0d08af64ed5ec553e3708766e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 31 May 2023 10:16:03 -0400 Subject: [PATCH 1/5] Updated DistanceMetric API w/ new ABC/interface --- sklearn/cluster/_agglomerative.py | 4 +-- sklearn/cluster/_hierarchical_fast.pyx | 8 ++--- sklearn/metrics/_dist_metrics.pxd.tp | 7 +++-- sklearn/metrics/_dist_metrics.pyx.tp | 29 +++++++++++++++-- .../_datasets_pair.pxd.tp | 4 +-- .../_datasets_pair.pyx.tp | 6 ++-- .../_dispatcher.py | 4 +-- sklearn/metrics/tests/test_dist_metrics.py | 31 +++++-------------- sklearn/neighbors/__init__.py | 2 -- sklearn/neighbors/_ball_tree.pyx | 30 +++++++++--------- sklearn/neighbors/_binary_tree.pxi | 19 ++++++------ sklearn/neighbors/_distance_metric.py | 22 ------------- sklearn/neighbors/_kd_tree.pyx | 4 +-- sklearn/neighbors/tests/test_neighbors.py | 12 ------- sklearn/tests/test_common.py | 1 + 15 files changed, 79 insertions(+), 104 deletions(-) delete mode 100644 sklearn/neighbors/_distance_metric.py diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 77ac981934468..059056275ef3d 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -19,7 +19,7 @@ from ..metrics.pairwise import paired_distances from ..metrics.pairwise import _VALID_METRICS from ..metrics import DistanceMetric -from ..metrics._dist_metrics import METRIC_MAPPING +from ..metrics._dist_metrics import METRIC_MAPPING64 from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.graph import _fix_connected_components @@ -543,7 +543,7 @@ def linkage_tree( linkage == "single" and affinity != "precomputed" and not callable(affinity) - and affinity in METRIC_MAPPING + and affinity in METRIC_MAPPING64 ): # We need the fast cythonized metric from neighbors dist_metric = DistanceMetric.get_metric(affinity) diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 4ba895ddcf352..55e2b67030e13 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -3,7 +3,7 @@ import numpy as np cimport cython -from ..metrics._dist_metrics cimport DistanceMetric +from ..metrics._dist_metrics cimport DistanceMetric64 from ..utils._fast_dict cimport IntFloatDict from ..utils._typedefs cimport float64_t, intp_t, uint8_t @@ -432,7 +432,7 @@ def single_linkage_label(L): # Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378 def mst_linkage_core( const float64_t [:, ::1] raw_data, - DistanceMetric dist_metric): + DistanceMetric64 dist_metric): """ Compute the necessary elements of a minimum spanning tree for computation of single linkage clustering. This @@ -449,8 +449,8 @@ def mst_linkage_core( raw_data: array of shape (n_samples, n_features) The array of feature data to be clustered. Must be C-aligned - dist_metric: DistanceMetric - A DistanceMetric object conforming to the API from + dist_metric: DistanceMetric64 + A DistanceMetric64 object conforming to the API from ``sklearn.metrics._dist_metrics.pxd`` that will be used to compute distances. diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 511e0941a7098..34ca704ab68bf 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -16,7 +16,7 @@ implementation_specific_values = [ # The metric mapping is adapted accordingly to route to the correct # implementations. # - ('', 'float64_t', 'np.float64'), + ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] @@ -25,6 +25,9 @@ from libc.math cimport sqrt, exp from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t +cdef class DistanceMetric: + pass + {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} ###################################################################### @@ -68,7 +71,7 @@ cdef inline float64_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE ###################################################################### # DistanceMetric{{name_suffix}} base class -cdef class DistanceMetric{{name_suffix}}: +cdef class DistanceMetric{{name_suffix}}(DistanceMetric): # The following attributes are required for a few of the subclasses. # we must define them here so that cython's limited polymorphism will work. # Because we don't expect to instantiate a lot of these objects, the diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 7e960c4e97934..a3f004abfd038 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -16,7 +16,7 @@ implementation_specific_values = [ # The metric mapping is adapted accordingly to route to the correct # implementations. # - ('', 'float64_t', 'np.float64'), + ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] @@ -73,9 +73,32 @@ def get_valid_metric_ids(L): >>> sorted(L) ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] """ - return [key for (key, val) in METRIC_MAPPING.items() + return [key for (key, val) in METRIC_MAPPING64.items() if (val.__name__ in L) or (val in L)] +cdef class DistanceMetric: + @classmethod + def get_metric(cls, metric, dtype=np.float64, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. + + Parameters + ---------- + metric : str or class name + The distance metric to use + dtype : {np.float32, np.float64}, default=np.float64 + The dtype of the data on which the metric will be applied + **kwargs + additional arguments will be passed to the requested metric + """ + if dtype == np.float32: + specialized_class = DistanceMetric32 + else: + specialized_class = DistanceMetric64 + + return specialized_class.get_metric(metric, **kwargs) + pass {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -126,7 +149,7 @@ cdef {{INPUT_DTYPE_t}} INF{{name_suffix}} = np.inf ###################################################################### # Distance Metric Classes -cdef class DistanceMetric{{name_suffix}}: +cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """DistanceMetric class This class provides a uniform interface to fast distance metric diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index 23337cb2b59d6..98123235d937c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -7,13 +7,13 @@ implementation_specific_values = [ # # We use DistanceMetric for float64 for backward naming compatibility. # - ('64', 'DistanceMetric', 'float64_t'), + ('64', 'DistanceMetric64', 'float64_t'), ('32', 'DistanceMetric32', 'float32_t') ] }} from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t -from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32 +from ...metrics._dist_metrics cimport DistanceMetric64, DistanceMetric32, DistanceMetric {{for name_suffix, DistanceMetric, INPUT_DTYPE_t in implementation_specific_values}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 5569c1f231d62..befbe48e59996 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -7,7 +7,7 @@ implementation_specific_values = [ # # We use DistanceMetric for float64 for backward naming compatibility. # - ('64', 'DistanceMetric', 'float64_t', 'np.float64'), + ('64', 'DistanceMetric64', 'float64_t', 'np.float64'), ('32', 'DistanceMetric32', 'float32_t', 'np.float32') ] @@ -17,7 +17,6 @@ import numpy as np from cython cimport final from ...utils._typedefs cimport float64_t, float32_t, intp_t -from ...metrics._dist_metrics cimport DistanceMetric from scipy.sparse import issparse, csr_matrix @@ -96,8 +95,9 @@ cdef class DatasetsPair{{name_suffix}}: if metric_kwargs is not None: metric_kwargs.pop("Y_norm_squared", None) cdef: - {{DistanceMetric}} distance_metric = {{DistanceMetric}}.get_metric( + {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( metric, + {{INPUT_DTYPE}}, **(metric_kwargs or {}) ) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index f72861bcbeb8c..5f4325af3a09f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -6,7 +6,7 @@ from scipy.sparse import isspmatrix_csr, issparse -from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING +from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING64 from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64 from ._argkmin import ( @@ -76,7 +76,7 @@ def valid_metrics(cls) -> List[str]: "hamming", *BOOL_METRICS, } - return sorted(({"sqeuclidean"} | set(METRIC_MAPPING.keys())) - excluded) + return sorted(({"sqeuclidean"} | set(METRIC_MAPPING64.keys())) - excluded) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 7cde0397d70be..c6ec517a10be2 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -9,11 +9,7 @@ from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric -from sklearn.metrics._dist_metrics import ( - BOOL_METRICS, - # Unexposed private DistanceMetric for 32 bit - DistanceMetric32, -) +from sklearn.metrics._dist_metrics import BOOL_METRICS from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data @@ -78,9 +74,6 @@ def dist_func(x1, x2, p): ) @pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) def test_cdist(metric_param_grid, X, Y): - DistanceMetricInterface = ( - DistanceMetric if X.dtype == Y.dtype == np.float64 else DistanceMetric32 - ) metric, param_grid = metric_param_grid keys = param_grid.keys() X_csr, Y_csr = sp.csr_matrix(X), sp.csr_matrix(Y) @@ -105,7 +98,7 @@ def test_cdist(metric_param_grid, X, Y): else: D_scipy_cdist = cdist(X, Y, metric, **kwargs) - dm = DistanceMetricInterface.get_metric(metric, **kwargs) + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) # DistanceMetric.pairwise must be consistent for all # combinations of formats in {sparse, dense}. @@ -165,9 +158,6 @@ def test_cdist_bool_metric(metric, X_bool, Y_bool): ) @pytest.mark.parametrize("X", [X64, X32, X_mmap]) def test_pdist(metric_param_grid, X): - DistanceMetricInterface = ( - DistanceMetric if X.dtype == np.float64 else DistanceMetric32 - ) metric, param_grid = metric_param_grid keys = param_grid.keys() X_csr = sp.csr_matrix(X) @@ -195,7 +185,7 @@ def test_pdist(metric_param_grid, X): else: D_scipy_pdist = cdist(X, X, metric, **kwargs) - dm = DistanceMetricInterface.get_metric(metric, **kwargs) + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) D_sklearn = dm.pairwise(X) assert D_sklearn.flags.c_contiguous assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) @@ -226,8 +216,8 @@ def test_distance_metrics_dtype_consistency(metric_param_grid): for vals in itertools.product(*param_grid.values()): kwargs = dict(zip(keys, vals)) - dm64 = DistanceMetric.get_metric(metric, **kwargs) - dm32 = DistanceMetric32.get_metric(metric, **kwargs) + dm64 = DistanceMetric.get_metric(metric, np.float64, **kwargs) + dm32 = DistanceMetric.get_metric(metric, np.float32, **kwargs) D64 = dm64.pairwise(X64) D32 = dm32.pairwise(X32) @@ -269,9 +259,6 @@ def test_pdist_bool_metrics(metric, X_bool): ) @pytest.mark.parametrize("X", [X64, X32]) def test_pickle(writable_kwargs, metric_param_grid, X): - DistanceMetricInterface = ( - DistanceMetric if X.dtype == np.float64 else DistanceMetric32 - ) metric, param_grid = metric_param_grid keys = param_grid.keys() for vals in itertools.product(*param_grid.values()): @@ -281,7 +268,7 @@ def test_pickle(writable_kwargs, metric_param_grid, X): if isinstance(val, np.ndarray): val.setflags(write=writable_kwargs) kwargs = dict(zip(keys, vals)) - dm = DistanceMetricInterface.get_metric(metric, **kwargs) + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) D1 = dm.pairwise(X) dm2 = pickle.loads(pickle.dumps(dm)) D2 = dm2.pairwise(X) @@ -302,10 +289,6 @@ def test_pickle_bool_metrics(metric, X_bool): @pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) def test_haversine_metric(X, Y): - DistanceMetricInterface = ( - DistanceMetric if X.dtype == np.float64 else DistanceMetric32 - ) - # The Haversine DistanceMetric only works on 2 features. X = np.asarray(X[:, :2]) Y = np.asarray(Y[:, :2]) @@ -327,7 +310,7 @@ def haversine_slow(x1, x2): for j, yj in enumerate(Y): D_reference[i, j] = haversine_slow(xi, yj) - haversine = DistanceMetricInterface.get_metric("haversine") + haversine = DistanceMetric.get_metric("haversine", X.dtype) D_sklearn = haversine.pairwise(X, Y) assert_allclose( diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 12824e9cb684e..8223c20991904 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -5,7 +5,6 @@ from ._ball_tree import BallTree from ._kd_tree import KDTree -from ._distance_metric import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors @@ -20,7 +19,6 @@ __all__ = [ "BallTree", - "DistanceMetric", "KDTree", "KNeighborsClassifier", "KNeighborsRegressor", diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index cf02bd37c5829..0bc65fdd36ec8 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -5,15 +5,15 @@ __all__ = ['BallTree'] DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} -VALID_METRICS = ['EuclideanDistance', 'SEuclideanDistance', - 'ManhattanDistance', 'ChebyshevDistance', - 'MinkowskiDistance', 'WMinkowskiDistance', - 'MahalanobisDistance', 'HammingDistance', - 'CanberraDistance', 'BrayCurtisDistance', - 'JaccardDistance', 'DiceDistance', - 'RogersTanimotoDistance', 'RussellRaoDistance', - 'SokalMichenerDistance', 'SokalSneathDistance', - 'PyFuncDistance', 'HaversineDistance'] +VALID_METRICS = ['EuclideanDistance64', 'SEuclideanDistance64', + 'ManhattanDistance64', 'ChebyshevDistance64', + 'MinkowskiDistance64', 'WMinkowskiDistance64', + 'MahalanobisDistance64', 'HammingDistance64', + 'CanberraDistance64', 'BrayCurtisDistance64', + 'JaccardDistance64', 'DiceDistance64', + 'RogersTanimotoDistance64', 'RussellRaoDistance64', + 'SokalMichenerDistance64', 'SokalSneathDistance64', + 'PyFuncDistance64', 'HaversineDistance64'] include "_binary_tree.pxi" @@ -129,7 +129,7 @@ cdef inline float64_t min_rdist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1 nogil: """Compute the minimum reduced-distance between a point and a node""" if tree.euclidean: - return euclidean_dist_to_rdist(min_dist(tree, i_node, pt)) + return euclidean_dist_to_rdist64(min_dist(tree, i_node, pt)) else: return tree.dist_metric._dist_to_rdist(min_dist(tree, i_node, pt)) @@ -138,7 +138,7 @@ cdef inline float64_t max_rdist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: """Compute the maximum reduced-distance between a point and a node""" if tree.euclidean: - return euclidean_dist_to_rdist(max_dist(tree, i_node, pt)) + return euclidean_dist_to_rdist64(max_dist(tree, i_node, pt)) else: return tree.dist_metric._dist_to_rdist(max_dist(tree, i_node, pt)) @@ -167,8 +167,8 @@ cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, BinaryTree tree2, intp_t i_node2) except -1: """compute the minimum reduced distance between two nodes""" if tree1.euclidean: - return euclidean_dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) + return euclidean_dist_to_rdist64(min_dist_dual(tree1, i_node1, + tree2, i_node2)) else: return tree1.dist_metric._dist_to_rdist(min_dist_dual(tree1, i_node1, tree2, i_node2)) @@ -178,8 +178,8 @@ cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, BinaryTree tree2, intp_t i_node2) except -1: """compute the maximum reduced distance between two nodes""" if tree1.euclidean: - return euclidean_dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) + return euclidean_dist_to_rdist64(max_dist_dual(tree1, i_node1, + tree2, i_node2)) else: return tree1.dist_metric._dist_to_rdist(max_dist_dual(tree1, i_node1, tree2, i_node2)) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 36fb03ce2afe5..7d4d08b2703a4 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -153,9 +153,10 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, - euclidean_dist, - euclidean_rdist, - euclidean_dist_to_rdist, + DistanceMetric64, + euclidean_dist64, + euclidean_rdist64, + euclidean_dist_to_rdist64, ) from ._partition_nodes cimport partition_node_indices @@ -231,7 +232,7 @@ leaf_size : positive int, default=40 satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in the case that ``n_samples < leaf_size``. -metric : str or DistanceMetric object, default='minkowski' +metric : str or DistanceMetric64 object, default='minkowski' Metric to use for distance computation. Default is "minkowski", which results in the standard Euclidean distance when p = 2. A list of valid metrics for {BinaryTree} is given by @@ -782,7 +783,7 @@ cdef class BinaryTree: cdef intp_t n_levels cdef intp_t n_nodes - cdef DistanceMetric dist_metric + cdef DistanceMetric64 dist_metric cdef int euclidean # variables to keep track of building & querying stats @@ -832,7 +833,7 @@ cdef class BinaryTree: self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance') + == 'EuclideanDistance64') metric = self.dist_metric.__class__.__name__ if metric not in VALID_METRICS: @@ -922,7 +923,7 @@ cdef class BinaryTree: sample_weight = state[12] self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance') + == 'EuclideanDistance64') n_samples = self.data.shape[0] self._update_sample_weight(n_samples, sample_weight) @@ -996,7 +997,7 @@ cdef class BinaryTree: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: - return euclidean_dist(x1, x2, size) + return euclidean_dist64(x1, x2, size) else: return self.dist_metric.dist(x1, x2, size) @@ -1011,7 +1012,7 @@ cdef class BinaryTree: """ self.n_calls += 1 if self.euclidean: - return euclidean_rdist(x1, x2, size) + return euclidean_rdist64(x1, x2, size) else: return self.dist_metric.rdist(x1, x2, size) diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py deleted file mode 100644 index 9bfd131c482d8..0000000000000 --- a/sklearn/neighbors/_distance_metric.py +++ /dev/null @@ -1,22 +0,0 @@ -# TODO: Remove this file in 1.3 -import warnings - -from ..metrics import DistanceMetric as _DistanceMetric - - -class DistanceMetric(_DistanceMetric): - @classmethod - def _warn(cls): - warnings.warn( - ( - "sklearn.neighbors.DistanceMetric has been moved " - "to sklearn.metrics.DistanceMetric in 1.0. " - "This import path will be removed in 1.3" - ), - category=FutureWarning, - ) - - @classmethod - def get_metric(cls, metric, **kwargs): - DistanceMetric._warn() - return _DistanceMetric.get_metric(metric, **kwargs) diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx index ece293229fa10..f5cd2617be147 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx @@ -6,8 +6,8 @@ __all__ = ['KDTree'] DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} -VALID_METRICS = ['EuclideanDistance', 'ManhattanDistance', - 'ChebyshevDistance', 'MinkowskiDistance'] +VALID_METRICS = ['EuclideanDistance64', 'ManhattanDistance64', + 'ChebyshevDistance64', 'MinkowskiDistance64'] include "_binary_tree.pxi" diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 092b85ad9dcd0..6fd3db9b7b509 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -2158,18 +2158,6 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo): assert model._fit_method == expected_algo -# TODO: Remove in 1.3 -def test_neighbors_distance_metric_deprecation(): - from sklearn.neighbors import DistanceMetric - from sklearn.metrics import DistanceMetric as ActualDistanceMetric - - msg = r"This import path will be removed in 1\.3" - with pytest.warns(FutureWarning, match=msg): - dist_metric = DistanceMetric.get_metric("euclidean") - - assert isinstance(dist_metric, ActualDistanceMetric) - - # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize( diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 176a2d463d162..b3c6391729698 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -221,6 +221,7 @@ def test_import_all_consistency(): or "feature_extraction._hashing_fast" in modname ): continue + print(f"DEBUG *** {modname=}") package = __import__(modname, fromlist="dummy") for name in getattr(package, "__all__", ()): assert hasattr(package, name), "Module '{0}' has no attribute '{1}'".format( From 9d1e26caa91965be6ab888f12e82dc0353a40f27 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 1 Jun 2023 18:18:39 -0400 Subject: [PATCH 2/5] Updated hdbscan linkage for new DistanceMetric64 --- sklearn/cluster/_hdbscan/_linkage.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 0cb51a7b42992..03e91ac8d6833 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -35,7 +35,7 @@ cimport numpy as cnp from libc.float cimport DBL_MAX import numpy as np -from ...metrics._dist_metrics cimport DistanceMetric +from ...metrics._dist_metrics cimport DistanceMetric64 from ...cluster._hierarchical_fast cimport UnionFind from ...cluster._hdbscan._tree cimport HIERARCHY_t from ...cluster._hdbscan._tree import HIERARCHY_dtype @@ -111,7 +111,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( const float64_t[:, ::1] raw_data, const float64_t[::1] core_distances, - DistanceMetric dist_metric, + DistanceMetric64 dist_metric, float64_t alpha=1.0 ): """Compute the Minimum Spanning Tree (MST) representation of the mutual- From d9e49e7551fddffd013a686980bc4df16358af1a Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 1 Jun 2023 18:19:45 -0400 Subject: [PATCH 3/5] Removed old debug print --- sklearn/tests/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b3c6391729698..176a2d463d162 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -221,7 +221,6 @@ def test_import_all_consistency(): or "feature_extraction._hashing_fast" in modname ): continue - print(f"DEBUG *** {modname=}") package = __import__(modname, fromlist="dummy") for name in getattr(package, "__all__", ()): assert hasattr(package, name), "Module '{0}' has no attribute '{1}'".format( From 2bdc44b6f2002316999cd88e2ee1f62d1bc82e87 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 2 Jun 2023 17:41:20 -0400 Subject: [PATCH 4/5] Updated test --- sklearn/metrics/_dist_metrics.pyx.tp | 8 ++++-- sklearn/metrics/tests/test_dist_metrics.py | 29 +++++++++++++++++++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index 8388a125ea9bd..d3e80d98ef3c8 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -94,11 +94,15 @@ cdef class DistanceMetric: """ if dtype == np.float32: specialized_class = DistanceMetric32 - else: + elif dtype == np.float64: specialized_class = DistanceMetric64 + else: + raise ValueError( + f"Unexpected dtype {dtype} provided. Please select a dtype from" + " {np.float32, np.float64}" + ) return specialized_class.get_metric(metric, **kwargs) - pass {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 3c08ad32b8e2a..fc9b006e2fefd 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -9,7 +9,11 @@ from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric -from sklearn.metrics._dist_metrics import BOOL_METRICS +from sklearn.metrics._dist_metrics import ( + BOOL_METRICS, + DistanceMetric32, + DistanceMetric64, +) from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data @@ -372,3 +376,26 @@ def test_minkowski_metric_validate_weights_size(): ) with pytest.raises(ValueError, match=msg): dm.pairwise(X64, Y64) + + +@pytest.mark.parametrize("metric, metric_kwargs", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_get_metric_dtype(metric, metric_kwargs, dtype): + specialized_cls = { + np.float32: DistanceMetric32, + np.float64: DistanceMetric64, + }[dtype] + + # We don't need the entire grid, just one for a sanity check + metric_kwargs = {k: v[0] for k, v in metric_kwargs.items()} + generic_type = type(DistanceMetric.get_metric(metric, dtype, **metric_kwargs)) + specialized_type = type(specialized_cls.get_metric(metric, **metric_kwargs)) + + assert generic_type is specialized_type + + +def test_get_metric_bad_dtype(): + dtype = np.int32 + msg = r"Unexpected dtype .* provided. Please select a dtype from" + with pytest.raises(ValueError, match=msg): + DistanceMetric.get_metric("manhattan", dtype) From 9b2e36c57845708bb129ca7fb745a34a365095bd Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 2 Jun 2023 17:45:15 -0400 Subject: [PATCH 5/5] Removed now extraneous documentation and improved VALID_METRICS list --- sklearn/metrics/_dist_metrics.pxd.tp | 14 +-------- sklearn/metrics/_dist_metrics.pyx.tp | 14 +-------- .../_datasets_pair.pxd.tp | 5 +--- .../_datasets_pair.pyx.tp | 5 +--- sklearn/neighbors/_ball_tree.pyx | 30 ++++++++++++------- 5 files changed, 24 insertions(+), 44 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 34ca704ab68bf..60b8da3ecfa46 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -3,19 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # On the first hand, an empty string is used for `name_suffix` - # for the float64 case as to still be able to expose the original - # float64 implementation under the same API, namely `DistanceMetric`. - # - # On the other hand, '32' is used for `name_suffix` for the float32 - # case to remove ambiguity and use `DistanceMetric32`, which is not - # publicly exposed. - # - # The metric mapping is adapted accordingly to route to the correct - # implementations. - # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index d3e80d98ef3c8..bc54e51a7511a 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -3,19 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE - # - # On the first hand, an empty string is used for `name_suffix` - # for the float64 case as to still be able to expose the original - # float64 implementation under the same API, namely `DistanceMetric`. - # - # On the other hand, '32' bit is used for `name_suffix` for the float32 - # case to remove ambiguity and use `DistanceMetric32`, which is not - # publicly exposed. - # - # The metric mapping is adapted accordingly to route to the correct - # implementations. - # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE ('64', 'float64_t', 'np.float64'), ('32', 'float32_t', 'np.float32') ] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index 98123235d937c..fc56a59cab16f 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -3,10 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE - # - # We use DistanceMetric for float64 for backward naming compatibility. - # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE ('64', 'DistanceMetric64', 'float64_t'), ('32', 'DistanceMetric32', 'float32_t') ] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index befbe48e59996..40a9a45e8b8e1 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -3,10 +3,7 @@ implementation_specific_values = [ # Values are the following ones: # - # name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE - # - # We use DistanceMetric for float64 for backward naming compatibility. - # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE ('64', 'DistanceMetric64', 'float64_t', 'np.float64'), ('32', 'DistanceMetric32', 'float32_t', 'np.float32') ] diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index 0bc65fdd36ec8..d9b933cb43c66 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -5,16 +5,26 @@ __all__ = ['BallTree'] DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} -VALID_METRICS = ['EuclideanDistance64', 'SEuclideanDistance64', - 'ManhattanDistance64', 'ChebyshevDistance64', - 'MinkowskiDistance64', 'WMinkowskiDistance64', - 'MahalanobisDistance64', 'HammingDistance64', - 'CanberraDistance64', 'BrayCurtisDistance64', - 'JaccardDistance64', 'DiceDistance64', - 'RogersTanimotoDistance64', 'RussellRaoDistance64', - 'SokalMichenerDistance64', 'SokalSneathDistance64', - 'PyFuncDistance64', 'HaversineDistance64'] - +VALID_METRICS = [ + 'BrayCurtisDistance64', + 'CanberraDistance64', + 'ChebyshevDistance64', + 'DiceDistance64', + 'EuclideanDistance64', + 'HammingDistance64', + 'HaversineDistance64', + 'JaccardDistance64', + 'MahalanobisDistance64', + 'ManhattanDistance64', + 'MinkowskiDistance64', + 'PyFuncDistance64', + 'RogersTanimotoDistance64', + 'RussellRaoDistance64', + 'SEuclideanDistance64', + 'SokalMichenerDistance64', + 'SokalSneathDistance64', + 'WMinkowskiDistance64', +] include "_binary_tree.pxi"