diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 5c338d614b49d..42f675676a569 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -369,6 +369,11 @@ Changelog A deprecation cycle was introduced. :pr:`21576` by :user:`Paul-Emile Dugnat `. +- |API| The `"wminkowski"` metric of :class:`sklearn.metrics.DistanceMetric` is deprecated + and will be removed in version 1.3. Instead the existing `"minkowski"` metric now takes + in an optional `w` parameter for weights. This deprecation aims at remaining consistent + with SciPy 1.8 convention. :pr:`21873` by :user:`Yar Khine Phyo ` + - |Fix| :func:`metrics.silhouette_score` now supports integer input for precomputed distances. :pr:`22108` by `Thomas Fan`_. diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 9f0d8b86f0873..bf5744a509b9d 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -410,6 +410,8 @@ def test_vector_scikit_single_vs_scipy_single(seed): assess_same_labelling(cut, cut_scipy) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_mst_linkage_core_memory_mapped(metric_param_grid): """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index f7d22c1badfa2..946d8d7735601 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -30,6 +30,7 @@ cdef DTYPE_t INF = np.inf from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE from ..utils._typedefs import DTYPE, ITYPE from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper +from ..utils import check_array ###################################################################### # newObj function @@ -119,9 +120,12 @@ cdef class DistanceMetric: "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` ============== ==================== ======== =============================== - Note that "minkowski" with a non-None `w` parameter actually calls - `WMinkowskiDistance` with `w=w ** (1/p)` in order to be consistent with the - parametrization of scipy 1.8 and later. + .. deprecated:: 1.1 + `WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3. + Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are + applied to the absolute differences already raised to the p power. This is different from + `WMinkowskiDistance` where weights are applied to the absolute differences before raising + to the p power. The deprecation aims to remain consistent with SciPy 1.8 convention. **Metrics intended for two-dimensional vector spaces:** Note that the haversine distance metric requires data in the form of [latitude, longitude] and both @@ -257,25 +261,14 @@ cdef class DistanceMetric: if metric is MinkowskiDistance: p = kwargs.pop('p', 2) w = kwargs.pop('w', None) - if w is not None: - # Be consistent with scipy 1.8 conventions: in scipy 1.8, - # 'wminkowski' was removed in favor of passing a - # weight vector directly to 'minkowski', however - # the new weights apply to the absolute differences raised to - # the p power instead of the absolute difference as in - # previous versions of scipy. - # WMinkowskiDistance in sklearn implements the weighting - # scheme of the old 'wminkowski' in scipy < 1.8, hence the - # following adaptation: - return WMinkowskiDistance(p, w ** (1/p), **kwargs) - if p == 1: + if p == 1 and w is None: return ManhattanDistance(**kwargs) - elif p == 2: + elif p == 2 and w is None: return EuclideanDistance(**kwargs) - elif np.isinf(p): + elif np.isinf(p) and w is None: return ChebyshevDistance(**kwargs) else: - return MinkowskiDistance(p, **kwargs) + return MinkowskiDistance(p, w, **kwargs) else: return metric(**kwargs) @@ -554,27 +547,64 @@ cdef class MinkowskiDistance(DistanceMetric): r"""Minkowski Distance .. math:: - D(x, y) = [\sum_i |x_i - y_i|^p] ^ (1/p) + D(x, y) = {||u-v||}_p + + when w is None. + + Here is the more general expanded expression for the weighted case: + + .. math:: + D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) + + Parameters + ---------- + p : int + The order of the p-norm of the difference (see above). + w : (N,) array-like (optional) + The weight vector. Minkowski Distance requires p >= 1 and finite. For p = infinity, use ChebyshevDistance. Note that for p=1, ManhattanDistance is more efficient, and for p=2, EuclideanDistance is more efficient. """ - def __init__(self, p): + def __init__(self, p, w=None): if p < 1: raise ValueError("p must be greater than 1") elif np.isinf(p): raise ValueError("MinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") + self.p = p + if w is not None: + w_array = check_array( + w, ensure_2d=False, dtype=DTYPE, input_name="w" + ) + if (w_array < 0).any(): + raise ValueError("w cannot contain negative weights") + self.vec = ReadonlyArrayWrapper(w_array) + self.size = self.vec.shape[0] + else: + self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE)) + self.size = 0 + + def _validate_data(self, X): + if self.size > 0 and X.shape[1] != self.size: + raise ValueError("MinkowskiDistance: the size of w must match " + f"the number of features ({X.shape[1]}). " + f"Currently len(w)={self.size}.") cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d=0 cdef np.intp_t j - for j in range(size): - d += pow(fabs(x1[j] - x2[j]), self.p) + cdef bint has_w = self.size > 0 + if has_w: + for j in range(size): + d += self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p) + else: + for j in range(size): + d += pow(fabs(x1[j] - x2[j]), self.p) return d cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, @@ -595,6 +625,7 @@ cdef class MinkowskiDistance(DistanceMetric): #------------------------------------------------------------ +# TODO: Remove in 1.3 - WMinkowskiDistance class # W-Minkowski Distance cdef class WMinkowskiDistance(DistanceMetric): r"""Weighted Minkowski Distance @@ -613,6 +644,16 @@ cdef class WMinkowskiDistance(DistanceMetric): """ def __init__(self, p, w): + from warnings import warn + warn("WMinkowskiDistance is deprecated in version 1.1 and will be " + "removed in version 1.3. Use MinkowskiDistance instead. Note " + "that in MinkowskiDistance, the weights are applied to the " + "absolute differences raised to the p power. This is different " + "from WMinkowskiDistance where weights are applied to the " + "absolute differences before raising to the p power. " + "The deprecation aims to remain consistent with SciPy 1.8 " + "convention.", FutureWarning) + if p < 1: raise ValueError("p must be greater than 1") elif np.isinf(p): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 1de7471c01812..5dd10d83bfe68 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -7,6 +7,7 @@ import pytest +import scipy.sparse as sp from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric from sklearn.utils import check_random_state @@ -89,6 +90,8 @@ def check_cdist(metric, kwargs, X1, X2): assert_array_almost_equal(D_sklearn, D_scipy_cdist) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_cdist(metric_param_grid, X1, X2): @@ -120,6 +123,8 @@ def check_cdist_bool(metric, D_true): assert_array_almost_equal(D12, D_true) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_pdist(metric_param_grid, X1, X2): @@ -170,6 +175,8 @@ def check_pdist_bool(metric, D_true): assert_array_almost_equal(D12, D_true) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("writable_kwargs", [True, False]) @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_pickle(writable_kwargs, metric_param_grid): @@ -185,6 +192,8 @@ def test_pickle(writable_kwargs, metric_param_grid): check_pickle(metric, kwargs) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap]) def test_pickle_bool_metrics(metric, X1_bool): @@ -262,6 +271,8 @@ def custom_metric(x, y): assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") def test_readonly_kwargs(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/21685 @@ -277,3 +288,55 @@ def test_readonly_kwargs(): DistanceMetric.get_metric("seuclidean", V=weights) DistanceMetric.get_metric("wminkowski", p=1, w=weights) DistanceMetric.get_metric("mahalanobis", VI=VI) + + +@pytest.mark.parametrize( + "w, err_type, err_msg", + [ + (np.array([1, 1.5, -13]), ValueError, "w cannot contain negative weights"), + (np.array([1, 1.5, np.nan]), ValueError, "w contains NaN"), + ( + sp.csr_matrix([1, 1.5, 1]), + TypeError, + "A sparse matrix was passed, but dense data is required", + ), + (np.array(["a", "b", "c"]), ValueError, "could not convert string to float"), + (np.array([]), ValueError, "a minimum of 1 is required"), + ], +) +def test_minkowski_metric_validate_weights_values(w, err_type, err_msg): + with pytest.raises(err_type, match=err_msg): + DistanceMetric.get_metric("minkowski", p=3, w=w) + + +def test_minkowski_metric_validate_weights_size(): + w2 = rng.random_sample(d + 1) + dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) + msg = ( + "MinkowskiDistance: the size of w must match " + f"the number of features \\({X1.shape[1]}\\). " + f"Currently len\\(w\\)={w2.shape[0]}." + ) + with pytest.raises(ValueError, match=msg): + dm.pairwise(X1, X2) + + +# TODO: Remove in 1.3 when wminkowski is removed +def test_wminkowski_deprecated(): + w = rng.random_sample(d) + msg = "WMinkowskiDistance is deprecated in version 1.1" + with pytest.warns(FutureWarning, match=msg): + DistanceMetric.get_metric("wminkowski", p=3, w=w) + + +# TODO: Remove in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") +@pytest.mark.parametrize("p", [1, 1.5, 3]) +def test_wminkowski_minkowski_equivalence(p): + w = rng.random_sample(d) + # Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of 'minkowski' + dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p)) + dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w) + D_wmks = dm_wmks.pairwise(X1, X2) + D_mks = dm_mks.pairwise(X1, X2) + assert_array_almost_equal(D_wmks, D_mks) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index b7e90e63f2af1..35f7e19416f0c 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -252,6 +252,8 @@ def callable_rbf_kernel(x, y, **kwds): return K +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize( "func, metric, kwds", [ diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 2a4d500610051..c528c3ab900f8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1275,6 +1275,8 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): # Test computing the neighbors for various metrics # create a symmetric matrix @@ -1381,6 +1383,8 @@ def custom_metric(x1, x2): assert_array_almost_equal(dist1, dist2) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") def test_valid_brute_metric_for_auto_algorithm(): X = rng.rand(12, 12) Xcsr = csr_matrix(X) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index e043ffb730708..85d578c271faa 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -233,6 +233,8 @@ def test_gaussian_kde(Cls, n_samples=1000): assert_array_almost_equal(dens_tree, dens_gkde, decimal=3) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize( "Cls, metric", itertools.chain(