From ce268127caa230a51d32c0ee58364695736bfb4d Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Mon, 31 Mar 2025 23:56:20 +0200 Subject: [PATCH 01/33] Change the init and the eps --- sklearn/manifold/_mds.py | 55 ++++++++++++++---------------- sklearn/manifold/tests/test_mds.py | 39 ++++++++++++++++++--- 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 07d492bdcd34d..fdd7b6d638685 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -27,7 +27,7 @@ def _smacof_single( init=None, max_iter=300, verbose=0, - eps=1e-3, + eps=1e-6, random_state=None, normalized_stress=False, ): @@ -59,10 +59,9 @@ def _smacof_single( verbose : int, default=0 Level of verbosity. - eps : float, default=1e-3 - Relative tolerance with respect to stress at which to declare - convergence. The value of `eps` should be tuned separately depending - on whether or not `normalized_stress` is being used. + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -70,7 +69,7 @@ def _smacof_single( See :term:`Glossary `. normalized_stress : bool, default=False - Whether use and return normalized stress value (Stress-1) instead of raw + Whether to return normalized stress value (Stress-1) instead of raw stress. .. 
versionadded:: 1.2 @@ -168,18 +167,18 @@ def _smacof_single( # Compute stress distances = euclidean_distances(X) stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2 - if normalized_stress: - stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2)) - normalization = np.sqrt((X**2).sum(axis=1)).sum() if verbose >= 2: # pragma: no cover print(f"Iteration {it}, stress {stress:.4f}") if old_stress is not None: - if (old_stress - stress / normalization) < eps: + if ((old_stress - stress) / ((distances.ravel() ** 2).sum() / 2)) < eps: if verbose: # pragma: no cover print("Convergence criterion reached.") break - old_stress = stress / normalization + old_stress = stress + + if normalized_stress: + stress = np.sqrt(stress / ((distances.ravel() ** 2).sum() / 2)) return X, stress, it + 1 @@ -207,11 +206,11 @@ def smacof( metric=True, n_components=2, init=None, - n_init=8, + n_init=1, n_jobs=None, max_iter=300, verbose=0, - eps=1e-3, + eps=1e-6, random_state=None, return_n_iter=False, normalized_stress="auto", @@ -256,7 +255,7 @@ def smacof( Starting configuration of the embedding to initialize the algorithm. By default, the algorithm is initialized with a randomly chosen array. - n_init : int, default=8 + n_init : int, default=1 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. If ``init`` is @@ -277,10 +276,9 @@ def smacof( verbose : int, default=0 Level of verbosity. - eps : float, default=1e-3 - Relative tolerance with respect to stress at which to declare - convergence. The value of `eps` should be tuned separately depending - on whether or not `normalized_stress` is being used. + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. 
random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -423,7 +421,7 @@ class MDS(BaseEstimator): When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as missing values. - n_init : int, default=4 + n_init : int, default=1 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. @@ -434,10 +432,9 @@ class MDS(BaseEstimator): verbose : int, default=0 Level of verbosity. - eps : float, default=1e-3 - Relative tolerance with respect to stress at which to declare - convergence. The value of `eps` should be tuned separately depending - on whether or not `normalized_stress` is being used. + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. n_jobs : int, default=None The number of jobs to use for the computation. If multiple @@ -464,9 +461,9 @@ class MDS(BaseEstimator): ``fit_transform``. normalized_stress : bool or "auto" default="auto" - Whether use and return normalized stress value (Stress-1) instead of raw - stress. By default, metric MDS uses raw stress while non-metric MDS uses - normalized stress. + Whether to return normalized stress value (Stress-1) instead of raw + stress. By default, metric MDS returns raw stress while non-metric MDS + returns normalized stress. .. versionadded:: 1.2 @@ -569,10 +566,10 @@ def __init__( n_components=2, *, metric=True, - n_init=4, + n_init=1, max_iter=300, verbose=0, - eps=1e-3, + eps=1e-6, n_jobs=None, random_state=None, dissimilarity="euclidean", @@ -649,7 +646,7 @@ def fit_transform(self, X, y=None, init=None): X = validate_data(self, X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn( - "The MDS API has changed. 
``fit`` now constructs an" + "The MDS API has changed. ``fit`` now constructs a" " dissimilarity matrix from data. To use a custom " "dissimilarity matrix, set " "``dissimilarity='precomputed'``." diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index b34f030b79895..2f713427c9697 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from numpy.testing import assert_allclose, assert_array_almost_equal +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal from sklearn.datasets import load_digits from sklearn.manifold import _mds as mds @@ -86,7 +86,7 @@ def test_mds_recovers_true_data(metric): random_state=42, ).fit(X) stress = mds_est.stress_ - assert_allclose(stress, 0, atol=1e-10) + assert_allclose(stress, 0, atol=1e-6) def test_smacof_error(): @@ -165,17 +165,48 @@ def test_isotonic_outofbounds(): mds.smacof(dis, init=init, metric=False, n_init=1) -def test_returned_stress(): +@pytest.mark.parametrize("normalized_stress", [True, False]) +def test_returned_stress(normalized_stress): # Test that the final stress corresponds to the final embedding # (non-regression test for issue 16846) X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) D = euclidean_distances(X) - mds_est = mds.MDS(n_components=2, random_state=42).fit(X) + mds_est = mds.MDS( + n_components=2, + random_state=42, + normalized_stress=normalized_stress, + ).fit(X) + Z = mds_est.embedding_ stress = mds_est.stress_ D_mds = euclidean_distances(Z) stress_Z = ((D_mds.ravel() - D.ravel()) ** 2).sum() / 2 + if normalized_stress: + stress_Z = np.sqrt(stress_Z / ((D_mds.ravel() ** 2).sum() / 2)) + assert_allclose(stress, stress_Z) + + +@pytest.mark.parametrize("metric", [True, False]) +def test_convergence_does_not_depend_on_scale(metric): + # Test that the number of iterations until convergence does not depend + # the scale of the input data + X = np.array([[1, 1], 
[1, 4], [1, 5], [3, 3]]) + + mds_est = mds.MDS( + n_components=2, + random_state=42, + eps=1e-6, + metric=metric, + ) + + mds_est.fit(X * 100) + n_iter1 = mds_est.n_iter_ + + mds_est.fit(X / 100) + n_iter2 = mds_est.n_iter_ + + assert_equal(n_iter1, n_iter2) From 2bfe6619e324bdd7df4adab2b86e90aa1a8c21bd Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Mon, 31 Mar 2025 23:58:43 +0200 Subject: [PATCH 02/33] Fix typo --- sklearn/manifold/tests/test_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 2f713427c9697..fab10f0c0ed84 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -192,7 +192,7 @@ def test_returned_stress(normalized_stress): @pytest.mark.parametrize("metric", [True, False]) def test_convergence_does_not_depend_on_scale(metric): - # Test that the number of iterations until convergence does not depend + # Test that the number of iterations until convergence does not depend on # the scale of the input data X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) From f900c294a961aecf72b5c98d847af0ac294fd334 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 1 Apr 2025 00:38:47 +0200 Subject: [PATCH 03/33] Add what's new --- doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst new file mode 100644 index 0000000000000..b0a3b0a261b11 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst @@ -0,0 +1,4 @@ +:class:`manifold.MDS` now runs with `n_init=1` by default, and the convergence +criterion has been changed to ensure good convergence of both metric and +non-metric MDS, following the reference R implementation. 
+By :user:`Dmitry Kobak ` From 9aa9af78f7b877a99ee1738427990f11a0356b2d Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 1 Apr 2025 00:42:32 +0200 Subject: [PATCH 04/33] rename what's new file --- .../sklearn.manifold/{31117.enh.rst => 31117.enhancement.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/whats_new/upcoming_changes/sklearn.manifold/{31117.enh.rst => 31117.enhancement.rst} (100%) diff --git a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst similarity index 100% rename from doc/whats_new/upcoming_changes/sklearn.manifold/31117.enh.rst rename to doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst From dfb5b436b9eaf649d06a9e05a9f9979d584e5faa Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 1 Apr 2025 09:06:40 +0200 Subject: [PATCH 05/33] Fix the example --- sklearn/manifold/_mds.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index fdd7b6d638685..cb3bf5103aa51 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -337,11 +337,11 @@ def smacof( >>> dissimilarities = euclidean_distances(X) >>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42) >>> np.round(mds_result, 5) - array([[ 0.05352, -1.07253], - [ 1.74231, -0.75675], - [-1.79583, 1.82928]]) - >>> np.round(stress, 5).item() - 0.00128 + array([[-0.03166, 1.10996], + [ 1.6829 , 0.84601], + [-1.65124, -1.95597]]) + >>> np.round(stress, 6).item() + 3.2e-05 """ dissimilarities = check_array(dissimilarities) @@ -536,7 +536,7 @@ class MDS(BaseEstimator): >>> X, _ = load_digits(return_X_y=True) >>> X.shape (1797, 64) - >>> embedding = MDS(n_components=2, normalized_stress='auto') + >>> embedding = MDS(n_components=2) >>> X_transformed = embedding.fit_transform(X[:100]) >>> X_transformed.shape (100, 2) From 45dd51c860fb1baa0314aba0861a15ce3ba41bac 
Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 1 Apr 2025 23:53:38 +0200 Subject: [PATCH 06/33] Add deprecation cycle --- sklearn/manifold/_mds.py | 83 ++++++++++++++++++++++++------ sklearn/manifold/tests/test_mds.py | 68 ++++++++++++++++++++---- 2 files changed, 126 insertions(+), 25 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index cb3bf5103aa51..ddd9dd0478efe 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -27,7 +27,7 @@ def _smacof_single( init=None, max_iter=300, verbose=0, - eps=1e-6, + eps="warn", random_state=None, normalized_stress=False, ): @@ -59,10 +59,13 @@ def _smacof_single( verbose : int, default=0 Level of verbosity. - eps : float, default=1e-6 + eps : float, default=1e-3 The tolerance with respect to stress (normalized by the sum of squared embedding distances) at which to declare convergence. + .. versionchanged:: 1.9 + The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. Pass an int for reproducible results across multiple function calls. @@ -103,6 +106,13 @@ def _smacof_single( .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; Groenen P. 
Springer Series in Statistics (1997) """ + if eps == "warn": + warnings.warn( + "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", + FutureWarning, + ) + eps = 1e-3 + dissimilarities = check_symmetric(dissimilarities, raise_exception=True) n_samples = dissimilarities.shape[0] @@ -189,11 +199,11 @@ def _smacof_single( "metric": ["boolean"], "n_components": [Interval(Integral, 1, None, closed="left")], "init": ["array-like", None], - "n_init": [Interval(Integral, 1, None, closed="left")], + "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], "n_jobs": [Integral, None], "max_iter": [Interval(Integral, 1, None, closed="left")], "verbose": ["verbose"], - "eps": [Interval(Real, 0, None, closed="left")], + "eps": [Interval(Real, 0, None, closed="left"), StrOptions({"warn"})], "random_state": ["random_state"], "return_n_iter": ["boolean"], "normalized_stress": ["boolean", StrOptions({"auto"})], @@ -206,11 +216,11 @@ def smacof( metric=True, n_components=2, init=None, - n_init=1, + n_init="warn", n_jobs=None, max_iter=300, verbose=0, - eps=1e-6, + eps="warn", random_state=None, return_n_iter=False, normalized_stress="auto", @@ -255,12 +265,15 @@ def smacof( Starting configuration of the embedding to initialize the algorithm. By default, the algorithm is initialized with a randomly chosen array. - n_init : int, default=1 + n_init : int, default=8 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. If ``init`` is provided, this option is overridden and a single run is performed. + .. versionchanged:: 1.9 + The default value for `n_iter` will change from 8 to 1 in version 1.9. + n_jobs : int, default=None The number of jobs to use for the computation. 
If multiple initializations are used (``n_init``), each run of the algorithm is @@ -276,10 +289,13 @@ def smacof( verbose : int, default=0 Level of verbosity. - eps : float, default=1e-6 + eps : float, default=1e-3 The tolerance with respect to stress (normalized by the sum of squared embedding distances) at which to declare convergence. + .. versionchanged:: 1.9 + The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. Pass an int for reproducible results across multiple function calls. @@ -288,7 +304,7 @@ def smacof( return_n_iter : bool, default=False Whether or not to return the number of iterations. - normalized_stress : bool or "auto" default="auto" + normalized_stress : bool or "auto", default="auto" Whether to return normalized stress value (Stress-1) instead of raw stress. By default, metric MDS returns raw stress while non-metric MDS returns normalized stress. @@ -333,7 +349,7 @@ def smacof( >>> import numpy as np >>> from sklearn.manifold import smacof >>> from sklearn.metrics import euclidean_distances - >>> X = np.array([[0, 1, 2], [1, 0, 3],[2, 3, 0]]) + >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]]) >>> dissimilarities = euclidean_distances(X) >>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42) >>> np.round(mds_result, 5) @@ -344,6 +360,20 @@ def smacof( 3.2e-05 """ + if eps == "warn": + warnings.warn( + "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", + FutureWarning, + ) + eps = 1e-3 + + if n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from 8 to 1 in 1.9.", + FutureWarning, + ) + n_init = 8 + dissimilarities = check_array(dissimilarities) random_state = check_random_state(random_state) @@ -421,21 +451,27 @@ class MDS(BaseEstimator): When ``False`` (i.e. 
non-metric MDS), dissimilarities with 0 are considered as missing values. - n_init : int, default=1 + n_init : int, default=4 Number of times the SMACOF algorithm will be run with different initializations. The final results will be the best output of the runs, determined by the run with the smallest final stress. + .. versionchanged:: 1.9 + The default value for `n_init` will change from 4 to 1 in version 1.9. + max_iter : int, default=300 Maximum number of iterations of the SMACOF algorithm for a single run. verbose : int, default=0 Level of verbosity. - eps : float, default=1e-6 + eps : float, default=1e-3 The tolerance with respect to stress (normalized by the sum of squared embedding distances) at which to declare convergence. + .. versionchanged:: 1.9 + The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + n_jobs : int, default=None The number of jobs to use for the computation. If multiple initializations are used (``n_init``), each run of the algorithm is @@ -551,10 +587,10 @@ class MDS(BaseEstimator): _parameter_constraints: dict = { "n_components": [Interval(Integral, 1, None, closed="left")], "metric": ["boolean"], - "n_init": [Interval(Integral, 1, None, closed="left")], + "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], "max_iter": [Interval(Integral, 1, None, closed="left")], "verbose": ["verbose"], - "eps": [Interval(Real, 0.0, None, closed="left")], + "eps": [Interval(Real, 0.0, None, closed="left"), StrOptions({"warn"})], "n_jobs": [None, Integral], "random_state": ["random_state"], "dissimilarity": [StrOptions({"euclidean", "precomputed"})], @@ -566,10 +602,10 @@ def __init__( n_components=2, *, metric=True, - n_init=1, + n_init="warn", max_iter=300, verbose=0, - eps=1e-6, + eps="warn", n_jobs=None, random_state=None, dissimilarity="euclidean", @@ -643,6 +679,21 @@ def fit_transform(self, X, y=None, init=None): X_new : ndarray of shape (n_samples, n_components) X transformed in the new space. 
""" + print("MDS", self.eps) + if self.eps == "warn": + warnings.warn( + "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", + FutureWarning, + ) + self.eps = 1e-3 + + if self.n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from 4 to 1 in 1.9.", + FutureWarning, + ) + self.n_init = 4 + X = validate_data(self, X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn( diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index fab10f0c0ed84..4b6c348d6466b 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -14,7 +14,7 @@ def test_smacof(): # Borg & Groenen, p 154 sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) - X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1) + X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1, eps=1e-6) X_true = np.array( [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]] ) @@ -28,7 +28,13 @@ def test_nonmetric_lower_normalized_stress(): Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) _, stress1 = mds.smacof( - sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True + sim, + init=Z, + n_components=2, + max_iter=1000, + n_init=1, + normalized_stress=True, + eps=1e-6, ) _, stress2 = mds.smacof( @@ -39,6 +45,7 @@ def test_nonmetric_lower_normalized_stress(): n_init=1, normalized_stress=True, metric=False, + eps=1e-6, ) assert stress1 > stress2 @@ -54,7 +61,7 @@ def test_nonmetric_mds_optimization(): mds_est = mds.MDS( n_components=2, n_init=1, - eps=1e-15, + eps=1e-6, max_iter=2, metric=False, random_state=42, @@ -64,7 +71,7 @@ def test_nonmetric_mds_optimization(): mds_est = mds.MDS( n_components=2, n_init=1, - eps=1e-15, + eps=1e-6, max_iter=3, metric=False, random_state=42, @@ -94,28 
+101,36 @@ def test_smacof_error(): sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) with pytest.raises(ValueError): - mds.smacof(sim) + mds.smacof(sim, eps=1e-6, n_init=1) # Not squared similarity matrix: sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]]) with pytest.raises(ValueError): - mds.smacof(sim) + mds.smacof(sim, eps=1e-6, n_init=1) # init not None and not correct format: sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]]) with pytest.raises(ValueError): - mds.smacof(sim, init=Z, n_init=1) + mds.smacof(sim, init=Z, n_init=1, eps=1e-6) def test_MDS(): sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) - mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed") + mds_clf = mds.MDS( + metric=False, + n_jobs=3, + n_init=3, + eps=1e-6, + dissimilarity="precomputed", + ) mds_clf.fit(sim) +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("k", [0.5, 1.5, 2]) def test_normed_stress(k): """Test that non-metric MDS normalized stress is scale-invariant.""" @@ -128,6 +143,8 @@ def test_normed_stress(k): assert_allclose(X1, X2, rtol=1e-5) +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("metric", [True, False]) def test_normalized_stress_auto(metric, monkeypatch): rng = np.random.RandomState(0) @@ -162,9 +179,11 @@ def test_isotonic_outofbounds(): [0.8766008278401566, 0.4227358815811242], ] ) - mds.smacof(dis, init=init, metric=False, n_init=1) + mds.smacof(dis, init=init, metric=False, n_init=1, eps=1e-6) +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("normalized_stress", [True, False]) def test_returned_stress(normalized_stress): # Test that the final stress corresponds to the final embedding @@ -190,6 +209,8 @@ def 
test_returned_stress(normalized_stress): assert_allclose(stress, stress_Z) +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize("metric", [True, False]) def test_convergence_does_not_depend_on_scale(metric): # Test that the number of iterations until convergence does not depend on @@ -210,3 +231,32 @@ def test_convergence_does_not_depend_on_scale(metric): n_iter2 = mds_est.n_iter_ assert_equal(n_iter1, n_iter2) + + +# TODO(1.9): delete this test +def test_future_warning_eps(): + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + mds.smacof(sim, n_init=1) + + with pytest.warns(FutureWarning): + mds.smacof(sim, n_init=1) + + with pytest.warns(FutureWarning): + mds._smacof_single(sim) + + with pytest.warns(FutureWarning): + mds.MDS(n_init=1).fit(X) + + +# TODO(1.9): delete this test +def test_future_warning_n_init(): + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + with pytest.warns(FutureWarning): + mds.smacof(sim, eps=1e-6) + + with pytest.warns(FutureWarning): + mds.MDS(eps=1e-6).fit(X) From 153febe73c43d807f95ae831971cea8b20310118 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 1 Apr 2025 23:55:42 +0200 Subject: [PATCH 07/33] Fix example --- sklearn/manifold/_mds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ddd9dd0478efe..cb865ee52dac6 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -351,8 +351,8 @@ def smacof( >>> from sklearn.metrics import euclidean_distances >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]]) >>> dissimilarities = euclidean_distances(X) - >>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42) - >>> np.round(mds_result, 5) + >>> Z, stress = smacof(dissimilarities, n_init=1, eps=1e-6, 
random_state=42) + >>> np.round(Z, 5) array([[-0.03166, 1.10996], [ 1.6829 , 0.84601], [-1.65124, -1.95597]]) From 80862ab47e5eeb1d7cb5c408fbf04c33fedcc7d0 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 09:42:57 +0200 Subject: [PATCH 08/33] Remove future warnings from examples --- examples/manifold/plot_compare_methods.py | 3 ++- examples/manifold/plot_lle_digits.py | 2 +- examples/manifold/plot_manifold_sphere.py | 2 +- examples/manifold/plot_mds.py | 1 + sklearn/manifold/_mds.py | 14 +++++++++----- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index 30ce4e5d8d897..cf44fffe460dc 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -166,7 +166,8 @@ def add_2d_scatter(ax, points, points_color, title=None): md_scaling = manifold.MDS( n_components=n_components, max_iter=50, - n_init=4, + n_init=1, + eps=1e-6, random_state=0, normalized_stress=False, ) diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index 34b221ca0cd1d..41ba9ce07957e 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -131,7 +131,7 @@ def plot_embedding(X, title): "LTSA LLE embedding": LocallyLinearEmbedding( n_neighbors=n_neighbors, n_components=2, method="ltsa" ), - "MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, n_jobs=2), + "MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, eps=1e-6), "Random Trees embedding": make_pipeline( RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0), TruncatedSVD(n_components=2), diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py index 7c666c4b7fb7b..e30ded54bd0be 100644 --- a/examples/manifold/plot_manifold_sphere.py +++ b/examples/manifold/plot_manifold_sphere.py @@ -112,7 +112,7 @@ # Perform Multi-dimensional scaling. 
t0 = time() -mds = manifold.MDS(2, max_iter=100, n_init=1, random_state=42) +mds = manifold.MDS(2, max_iter=100, n_init=1, random_state=42, eps=1e-6) trans_data = mds.fit_transform(sphere_data).T t1 = time() print("MDS: %.2g sec" % (t1 - t0)) diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py index afea676b245a8..07efa89cfae04 100644 --- a/examples/manifold/plot_mds.py +++ b/examples/manifold/plot_mds.py @@ -44,6 +44,7 @@ n_components=2, max_iter=3000, eps=1e-9, + n_init=1, random_state=42, dissimilarity="precomputed", n_jobs=1, diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index cb865ee52dac6..2d13f794b7b5a 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -679,20 +679,24 @@ def fit_transform(self, X, y=None, init=None): X_new : ndarray of shape (n_samples, n_components) X transformed in the new space. """ - print("MDS", self.eps) + if self.eps == "warn": warnings.warn( "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", FutureWarning, ) - self.eps = 1e-3 + self._eps = 1e-3 + else: + self._eps = self.eps if self.n_init == "warn": warnings.warn( "The default value of `n_init` will change from 4 to 1 in 1.9.", FutureWarning, ) - self.n_init = 4 + self._n_init = 4 + else: + self._n_init = self.n_init X = validate_data(self, X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": @@ -713,11 +717,11 @@ def fit_transform(self, X, y=None, init=None): metric=self.metric, n_components=self.n_components, init=init, - n_init=self.n_init, + n_init=self._n_init, n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, - eps=self.eps, + eps=self._eps, random_state=self.random_state, return_n_iter=True, normalized_stress=self.normalized_stress, From 95c99bb8aad53afa2a15400649e758e5a6f1e566 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 10:12:36 +0200 Subject: [PATCH 09/33] Fix docstring params --- sklearn/manifold/_mds.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 2d13f794b7b5a..e21fa69359102 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -572,7 +572,7 @@ class MDS(BaseEstimator): >>> X, _ = load_digits(return_X_y=True) >>> X.shape (1797, 64) - >>> embedding = MDS(n_components=2) + >>> embedding = MDS(n_components=2, n_init=1, eps=1e-6) >>> X_transformed = embedding.fit_transform(X[:100]) >>> X_transformed.shape (100, 2) From f56d5892c55b4f2f34430e3a1da4f78820d12bde Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 11:02:03 +0200 Subject: [PATCH 10/33] Fix docstrings --- sklearn/manifold/_mds.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index e21fa69359102..36df272837f77 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -64,7 +64,7 @@ def _smacof_single( embedding distances) at which to declare convergence. .. versionchanged:: 1.9 - The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -272,7 +272,7 @@ def smacof( provided, this option is overridden and a single run is performed. .. versionchanged:: 1.9 - The default value for `n_iter` will change from 8 to 1 in version 1.9. + The default value for `n_init` will change from 8 to 1 in version 1.9. n_jobs : int, default=None The number of jobs to use for the computation. If multiple @@ -294,7 +294,7 @@ def smacof( embedding distances) at which to declare convergence. .. versionchanged:: 1.9 - The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9.
random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -457,7 +457,7 @@ class MDS(BaseEstimator): determined by the run with the smallest final stress. .. versionchanged:: 1.9 - The default value for `n_init` will change from 4 to 1 in version 1.9. + The default value for `n_init` will change from 4 to 1 in version 1.9. max_iter : int, default=300 Maximum number of iterations of the SMACOF algorithm for a single run. @@ -470,7 +470,7 @@ class MDS(BaseEstimator): embedding distances) at which to declare convergence. .. versionchanged:: 1.9 - The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. n_jobs : int, default=None The number of jobs to use for the computation. If multiple From 27bf6cc29ded51eff17c7371c3f993b887dc2e29 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 11:56:40 +0200 Subject: [PATCH 11/33] Fix warning test --- sklearn/manifold/tests/test_mds.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 4b6c348d6466b..d22246aa65770 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -238,8 +238,6 @@ def test_future_warning_eps(): X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) - mds.smacof(sim, n_init=1) - with pytest.warns(FutureWarning): mds.smacof(sim, n_init=1) From bd51858fda64b4b708b6d9e0a7a9af70e3858f82 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 12:53:19 +0200 Subject: [PATCH 12/33] Avoid future warning in docsting params test --- sklearn/tests/test_docstring_parameters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 6f165f483c66e..4f4e048453eee 
100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -226,6 +226,10 @@ def test_fit_docstring_attributes(name, Estimator): elif Estimator.__name__ == "KBinsDiscretizer": # default raises an FutureWarning if quantile method is at default "warn" est.set_params(quantile_method="averaged_inverted_cdf") + # TODO(1.9) remove + elif Estimator.__name__ == "MDS": + # default raises a FutureWarning + est.set_params(n_init=1, eps=1e-6) # Low max iter to speed up tests: we are only interested in checking the existence # of fitted attributes. This should be invariant to whether it has converged or not. From f394ab2336d3b0cf1c11174127ca050bbe7538b8 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 13:52:39 +0200 Subject: [PATCH 13/33] Adjust what's new description --- .../sklearn.manifold/31117.enhancement.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst index b0a3b0a261b11..b7f782e80216b 100644 --- a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst +++ b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst @@ -1,4 +1,6 @@ -:class:`manifold.MDS` now runs with `n_init=1` by default, and the convergence -criterion has been changed to ensure good convergence of both metric and -non-metric MDS, following the reference R implementation. +:class:`manifold.MDS` will switch to use `n_init=1` and `eps=1e-6` by default, +starting from version 1.9. The convergence criterion has been adjusted to +make sense for both metric and non-metric MDS and to follow the reference +R implementation. The formula for normalized stress was adjusted to follow +the original definition by Kruskal. 
By :user:`Dmitry Kobak ` From d5346bc18ff8fd33400549ec6f5272313d093e2c Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Wed, 2 Apr 2025 14:09:58 +0200 Subject: [PATCH 14/33] Format the MDS example as a notebook --- examples/manifold/plot_mds.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py index 07efa89cfae04..a00a08947261b 100644 --- a/examples/manifold/plot_mds.py +++ b/examples/manifold/plot_mds.py @@ -13,6 +13,12 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause +# %% +# Dataset preparation +# ------------------- +# +# We start by uniformly generating 20 points in a 2D space. + import numpy as np from matplotlib import pyplot as plt from matplotlib.collections import LineCollection @@ -31,6 +37,11 @@ # Center the data X_true -= X_true.mean() +# %% +# Now we compute pairwise distances between all points and add +# a small amount of noise to the distance matrix. We make sure +# to keep the noisy distance matrix symmetric. + # Compute pairwise Euclidean distances distances = euclidean_distances(X_true) @@ -40,6 +51,9 @@ np.fill_diagonal(noise, 0) distances += noise +# %% +# Here we compute metric and non-metric MDS of the noisy distance matrix. + mds = manifold.MDS( n_components=2, max_iter=3000, @@ -63,10 +77,16 @@ ) X_nmds = nmds.fit_transform(distances) -# Rescale the data -X_mds *= np.sqrt((X_true**2).sum()) / np.sqrt((X_mds**2).sum()) +# %% +# Rescaling the non-metric MDS solution to match the spread of the original data. + X_nmds *= np.sqrt((X_true**2).sum()) / np.sqrt((X_nmds**2).sum()) +# %% +# To make the visual comparisons easier, we rotate the original data and both MDS +# solutions to their PCA axes. And flip horizontal and vertical MDS axes, if needed, +# to match the original data orientation. 
+ # Rotate the data pca = PCA(n_components=2) X_true = pca.fit_transform(X_true) @@ -80,6 +100,9 @@ if np.corrcoef(X_nmds[:, i], X_true[:, i])[0, 1] < 0: X_nmds[:, i] *= -1 +# %% +# Finally, we plot the original data and both MDS reconstructions. + fig = plt.figure(1) ax = plt.axes([0.0, 0.0, 1.0, 1.0]) From cae3bc01e9f80868f2397f55ba17e11ed3bcd329 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 22 Apr 2025 11:06:23 +0200 Subject: [PATCH 15/33] Add some TODO comments --- examples/manifold/plot_mds.py | 3 --- sklearn/manifold/_mds.py | 9 +++++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py index a00a08947261b..83f6ce31bfb9e 100644 --- a/examples/manifold/plot_mds.py +++ b/examples/manifold/plot_mds.py @@ -5,9 +5,6 @@ An illustration of the metric and non-metric MDS on generated noisy data. -The reconstructed points using the metric MDS and non metric MDS are slightly -shifted to avoid overlapping. - """ # Authors: The scikit-learn developers diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 36df272837f77..64a9c58c263a7 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -20,6 +20,7 @@ from ..utils.validation import validate_data +# TODO(1.9): change default `eps` to 1e-6, see PR #31117 def _smacof_single( dissimilarities, metric=True, @@ -181,18 +182,21 @@ def _smacof_single( if verbose >= 2: # pragma: no cover print(f"Iteration {it}, stress {stress:.4f}") if old_stress is not None: - if ((old_stress - stress) / ((distances.ravel() ** 2).sum() / 2)) < eps: + sum_squared_distances = (distances.ravel() ** 2).sum() + if ((old_stress - stress) / (sum_squared_distances / 2)) < eps: if verbose: # pragma: no cover print("Convergence criterion reached.") break old_stress = stress if normalized_stress: - stress = np.sqrt(stress / ((distances.ravel() ** 2).sum() / 2)) + sum_squared_distances = (distances.ravel() ** 2).sum() + stress = np.sqrt(stress 
/ (sum_squared_distances / 2)) return X, stress, it + 1 +# TODO(1.9): change default `eps` to 1e-6 and `n_init` to 1, see PR #31117 @validate_params( { "dissimilarities": ["array-like"], @@ -436,6 +440,7 @@ def smacof( return best_pos, best_stress +# TODO(1.9): change default `eps` to 1e-6 and `n_init` to 1, see PR #31117 class MDS(BaseEstimator): """Multidimensional scaling. From 53a96980c4948df2941d1f00cf79a2ca74a06a1b Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:33:23 +0200 Subject: [PATCH 16/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel --- sklearn/manifold/_mds.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 64a9c58c263a7..bc744560cca19 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -107,13 +107,6 @@ def _smacof_single( .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; Groenen P. Springer Series in Statistics (1997) """ - if eps == "warn": - warnings.warn( - "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", - FutureWarning, - ) - eps = 1e-3 - dissimilarities = check_symmetric(dissimilarities, raise_exception=True) n_samples = dissimilarities.shape[0] From 8484a995ccdf05754095ec645648ef422589d736 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:33:35 +0200 Subject: [PATCH 17/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel --- sklearn/manifold/_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index bc744560cca19..7def72f1960cd 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -28,7 +28,7 @@ def _smacof_single( init=None, max_iter=300, verbose=0, - eps="warn", + eps=1e-6, random_state=None, normalized_stress=False, ): From 9a5b287b7f26e2fa54f4ff6c0a3cd3898e64228c Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:33:44 +0200 
Subject: [PATCH 18/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel --- sklearn/manifold/_mds.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 7def72f1960cd..a19ad8cdf4050 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -678,15 +678,6 @@ def fit_transform(self, X, y=None, init=None): X transformed in the new space. """ - if self.eps == "warn": - warnings.warn( - "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", - FutureWarning, - ) - self._eps = 1e-3 - else: - self._eps = self.eps - if self.n_init == "warn": warnings.warn( "The default value of `n_init` will change from 4 to 1 in 1.9.", From 82535319713db5a2e35213836d6003e4b4552bcb Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:33:51 +0200 Subject: [PATCH 19/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel --- sklearn/manifold/_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index a19ad8cdf4050..b1bc36c44923f 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -710,7 +710,7 @@ def fit_transform(self, X, y=None, init=None): n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, - eps=self._eps, + eps=self.eps, random_state=self.random_state, return_n_iter=True, normalized_stress=self.normalized_stress, From 5af02c3b722c226697c5a40ad7905d7ffa95f137 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:34:00 +0200 Subject: [PATCH 20/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel --- sklearn/manifold/_mds.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index b1bc36c44923f..0189cbfbe621d 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -20,7 +20,6 @@ from ..utils.validation import validate_data -# TODO(1.9): change default `eps` to 
1e-6, see PR #31117 def _smacof_single( dissimilarities, metric=True, From 3dae4140b88f1c2084d05b3adf713808d4d79d17 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:43:55 +0200 Subject: [PATCH 21/33] Change eps default without future warning --- .../sklearn.manifold/31117.enhancement.rst | 7 +--- .../sklearn.manifold/31117.fix.rst | 5 +++ sklearn/manifold/_mds.py | 38 +++++++++---------- sklearn/manifold/tests/test_mds.py | 15 -------- 4 files changed, 24 insertions(+), 41 deletions(-) create mode 100644 doc/whats_new/upcoming_changes/sklearn.manifold/31117.fix.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst index b7f782e80216b..2ce6feaee387d 100644 --- a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst +++ b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst @@ -1,6 +1,3 @@ -:class:`manifold.MDS` will switch to use `n_init=1` and `eps=1e-6` by default, -starting from version 1.9. The convergence criterion has been adjusted to -make sense for both metric and non-metric MDS and to follow the reference -R implementation. The formula for normalized stress was adjusted to follow -the original definition by Kruskal. +:class:`manifold.MDS` will switch to use `eps=1e-6` by default, +starting from version 1.9. By :user:`Dmitry Kobak ` diff --git a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.fix.rst b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.fix.rst new file mode 100644 index 0000000000000..5ade720cfa570 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.fix.rst @@ -0,0 +1,5 @@ +:class:`manifold.MDS` now uses `eps=1e-6` by default and the convergence +criterion was adjusted to make sense for both metric and non-metric MDS +and to follow the reference R implementation. The formula for normalized +stress was adjusted to follow the original definition by Kruskal. 
+By :user:`Dmitry Kobak ` diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 0189cbfbe621d..d7c34a1058940 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -59,12 +59,13 @@ def _smacof_single( verbose : int, default=0 Level of verbosity. - eps : float, default=1e-3 + eps : float, default=1e-6 The tolerance with respect to stress (normalized by the sum of squared embedding distances) at which to declare convergence. - .. versionchanged:: 1.9 - The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -188,7 +189,7 @@ def _smacof_single( return X, stress, it + 1 -# TODO(1.9): change default `eps` to 1e-6 and `n_init` to 1, see PR #31117 +# TODO(1.9): change default `n_init` to 1, see PR #31117 @validate_params( { "dissimilarities": ["array-like"], @@ -199,7 +200,7 @@ def _smacof_single( "n_jobs": [Integral, None], "max_iter": [Interval(Integral, 1, None, closed="left")], "verbose": ["verbose"], - "eps": [Interval(Real, 0, None, closed="left"), StrOptions({"warn"})], + "eps": [Interval(Real, 0, None, closed="left")], "random_state": ["random_state"], "return_n_iter": ["boolean"], "normalized_stress": ["boolean", StrOptions({"auto"})], @@ -216,7 +217,7 @@ def smacof( n_jobs=None, max_iter=300, verbose=0, - eps="warn", + eps=1e-6, random_state=None, return_n_iter=False, normalized_stress="auto", @@ -285,12 +286,13 @@ def smacof( verbose : int, default=0 Level of verbosity. - eps : float, default=1e-3 + eps : float, default=1e-6 The tolerance with respect to stress (normalized by the sum of squared embedding distances) at which to declare convergence. - .. 
versionchanged:: 1.9 - The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. random_state : int, RandomState instance or None, default=None Determines the random number generator used to initialize the centers. @@ -356,13 +358,6 @@ def smacof( 3.2e-05 """ - if eps == "warn": - warnings.warn( - "The default value of `eps` will change from 1e-3 to 1e-6 in 1.9.", - FutureWarning, - ) - eps = 1e-3 - if n_init == "warn": warnings.warn( "The default value of `n_init` will change from 8 to 1 in 1.9.", @@ -462,12 +457,13 @@ class MDS(BaseEstimator): verbose : int, default=0 Level of verbosity. - eps : float, default=1e-3 + eps : float, default=1e-6 The tolerance with respect to stress (normalized by the sum of squared embedding distances) at which to declare convergence. - .. versionchanged:: 1.9 - The default value for `eps` will change from 1e-3 to 1e-6 in version 1.9. + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. n_jobs : int, default=None The number of jobs to use for the computation. 
If multiple @@ -587,7 +583,7 @@ class MDS(BaseEstimator): "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], "max_iter": [Interval(Integral, 1, None, closed="left")], "verbose": ["verbose"], - "eps": [Interval(Real, 0.0, None, closed="left"), StrOptions({"warn"})], + "eps": [Interval(Real, 0.0, None, closed="left")], "n_jobs": [None, Integral], "random_state": ["random_state"], "dissimilarity": [StrOptions({"euclidean", "precomputed"})], @@ -602,7 +598,7 @@ def __init__( n_init="warn", max_iter=300, verbose=0, - eps="warn", + eps=1e-6, n_jobs=None, random_state=None, dissimilarity="euclidean", diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index af94fb652935f..fff84104cfac8 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -233,21 +233,6 @@ def test_convergence_does_not_depend_on_scale(metric): assert_equal(n_iter1, n_iter2) -# TODO(1.9): delete this test -def test_future_warning_eps(): - X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) - sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) - - with pytest.warns(FutureWarning): - mds.smacof(sim, n_init=1) - - with pytest.warns(FutureWarning): - mds._smacof_single(sim) - - with pytest.warns(FutureWarning): - mds.MDS(n_init=1).fit(X) - - # TODO(1.9): delete this test def test_future_warning_n_init(): X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) From fe23faa066151ae81ad209c062e1aa24d723a0d9 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Fri, 25 Apr 2025 15:46:53 +0200 Subject: [PATCH 22/33] Fix changelog --- .../upcoming_changes/sklearn.manifold/31117.enhancement.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst index 2ce6feaee387d..51b9222c91e08 100644 --- a/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst +++ 
b/doc/whats_new/upcoming_changes/sklearn.manifold/31117.enhancement.rst @@ -1,3 +1,3 @@ -:class:`manifold.MDS` will switch to use `eps=1e-6` by default, +:class:`manifold.MDS` will switch to use `n_init=1` by default, starting from version 1.9. By :user:`Dmitry Kobak ` From 20b7c10189a927e2a9be86bd9d2d02c71ce99988 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Mon, 28 Apr 2025 15:52:20 +0200 Subject: [PATCH 23/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel --- sklearn/manifold/_mds.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d7c34a1058940..3550a688ddc56 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -349,11 +349,11 @@ def smacof( >>> from sklearn.metrics import euclidean_distances >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]]) >>> dissimilarities = euclidean_distances(X) - >>> Z, stress = smacof(dissimilarities, n_init=1, eps=1e-6, random_state=42) - >>> np.round(Z, 5) - array([[-0.03166, 1.10996], - [ 1.6829 , 0.84601], - [-1.65124, -1.95597]]) + >>> Z, stress = smacof( + ... dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42 + ... 
) + >>> Z.shape + (3, 2) >>> np.round(stress, 6).item() 3.2e-05 """ From 522800e5384e707a758071788907c495ca756981 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Mon, 28 Apr 2025 15:54:29 +0200 Subject: [PATCH 24/33] Update sklearn/manifold/_mds.py Co-authored-by: Olivier Grisel From 11975bbfb0a805e6a22c4939cad50d2ed98217e9 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:40:57 +0200 Subject: [PATCH 25/33] Update sklearn/manifold/_mds.py Co-authored-by: antoinebaker --- sklearn/manifold/_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 3550a688ddc56..cb986f799fc2d 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -427,7 +427,7 @@ def smacof( return best_pos, best_stress -# TODO(1.9): change default `eps` to 1e-6 and `n_init` to 1, see PR #31117 +# TODO(1.9): change default `n_init` to 1, see PR #31117 class MDS(BaseEstimator): """Multidimensional scaling. From 6e8e4ec3b7c85ae2f88973c601cc4c7f3b466056 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:41:22 +0200 Subject: [PATCH 26/33] Update examples/manifold/plot_compare_methods.py Co-authored-by: antoinebaker --- examples/manifold/plot_compare_methods.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index cf44fffe460dc..6203a4afc436d 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -167,7 +167,6 @@ def add_2d_scatter(ax, points, points_color, title=None): n_components=n_components, max_iter=50, n_init=1, - eps=1e-6, random_state=0, normalized_stress=False, ) From ad5e516beafec0f0ab1397aeb779a327c2901d2f Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:41:44 +0200 Subject: [PATCH 27/33] Update examples/manifold/plot_manifold_sphere.py Co-authored-by: antoinebaker --- 
examples/manifold/plot_manifold_sphere.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py index 274ba69b385f7..d52d99be4d087 100644 --- a/examples/manifold/plot_manifold_sphere.py +++ b/examples/manifold/plot_manifold_sphere.py @@ -112,7 +112,7 @@ # Perform Multi-dimensional scaling. t0 = time() -mds = manifold.MDS(2, max_iter=100, n_init=1, random_state=42, eps=1e-6) +mds = manifold.MDS(2, max_iter=100, n_init=1, random_state=42) trans_data = mds.fit_transform(sphere_data).T t1 = time() print("MDS: %.2g sec" % (t1 - t0)) From 0a9720de3c17250142047bf551371906a2f62f87 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:43:13 +0200 Subject: [PATCH 28/33] Update examples/manifold/plot_manifold_sphere.py Co-authored-by: antoinebaker From 97a091c158a5ec45a8a87a6f8124af971e4eef35 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:46:40 +0200 Subject: [PATCH 29/33] Apply suggestions from code review Co-authored-by: antoinebaker --- sklearn/manifold/_mds.py | 3 +-- sklearn/manifold/tests/test_mds.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index cb986f799fc2d..438be684c5f65 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -27,7 +27,6 @@ def _smacof_single( init=None, max_iter=300, verbose=0, - eps=1e-6, random_state=None, normalized_stress=False, ): @@ -565,7 +564,7 @@ class MDS(BaseEstimator): >>> X, _ = load_digits(return_X_y=True) >>> X.shape (1797, 64) - >>> embedding = MDS(n_components=2, n_init=1, eps=1e-6) + >>> embedding = MDS(n_components=2, n_init=1) >>> X_transformed = embedding.fit_transform(X[:100]) >>> X_transformed.shape (100, 2) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index fff84104cfac8..e1ff25c111eaa 100644 --- a/sklearn/manifold/tests/test_mds.py +++ 
b/sklearn/manifold/tests/test_mds.py @@ -14,7 +14,7 @@ def test_smacof(): # Borg & Groenen, p 154 sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) - X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1, eps=1e-6) + X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1) X_true = np.array( [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]] ) From b9d6ee3c35df5e9a3afc2fcdecb02fbcb3b6dca4 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:47:01 +0200 Subject: [PATCH 30/33] Apply suggestions from code review Co-authored-by: antoinebaker --- sklearn/manifold/tests/test_mds.py | 2 +- sklearn/tests/test_docstring_parameters.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index e1ff25c111eaa..2191d50e190cf 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -239,7 +239,7 @@ def test_future_warning_n_init(): sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) with pytest.warns(FutureWarning): - mds.smacof(sim, eps=1e-6) + mds.smacof(sim) with pytest.warns(FutureWarning): mds.MDS(eps=1e-6).fit(X) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 66d4453482b17..4d179df69ddf7 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -227,7 +227,7 @@ def test_fit_docstring_attributes(name, Estimator): # TODO(1.9) remove elif Estimator.__name__ == "MDS": # default raises a FutureWarning - est.set_params(n_init=1, eps=1e-6) + est.set_params(n_init=1) # Low max iter to speed up tests: we are only interested in checking the existence # of fitted attributes. This should be invariant to whether it has converged or not. 
From 7fbdeec89cfbb3af8c12ff817415ecaae96e3d61 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:47:28 +0200 Subject: [PATCH 31/33] Apply suggestions from code review Co-authored-by: antoinebaker --- sklearn/manifold/tests/test_mds.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 2191d50e190cf..6b7562ff0438f 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -114,7 +114,7 @@ def test_smacof_error(): Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]]) with pytest.raises(ValueError): - mds.smacof(sim, init=Z, n_init=1, eps=1e-6) + mds.smacof(sim, init=Z, n_init=1) def test_MDS(): @@ -123,7 +123,6 @@ def test_MDS(): metric=False, n_jobs=3, n_init=3, - eps=1e-6, dissimilarity="precomputed", ) mds_clf.fit(sim) @@ -179,7 +178,7 @@ def test_isotonic_outofbounds(): [0.8766008278401566, 0.4227358815811242], ] ) - mds.smacof(dis, init=init, metric=False, n_init=1, eps=1e-6) + mds.smacof(dis, init=init, metric=False, n_init=1) # TODO(1.9): remove warning filter @@ -220,7 +219,6 @@ def test_convergence_does_not_depend_on_scale(metric): mds_est = mds.MDS( n_components=2, random_state=42, - eps=1e-6, metric=metric, ) @@ -242,4 +240,4 @@ def test_future_warning_n_init(): mds.smacof(sim) with pytest.warns(FutureWarning): - mds.MDS(eps=1e-6).fit(X) + mds.MDS().fit(X) From 6c4943902a3af44bac4bf79c4944ea784441c28f Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:48:02 +0200 Subject: [PATCH 32/33] Apply suggestions from code review Co-authored-by: antoinebaker --- sklearn/manifold/tests/test_mds.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 6b7562ff0438f..88dc842a1d5fc 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -28,13 
+28,7 @@ def test_nonmetric_lower_normalized_stress(): Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) _, stress1 = mds.smacof( - sim, - init=Z, - n_components=2, - max_iter=1000, - n_init=1, - normalized_stress=True, - eps=1e-6, + sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True ) _, stress2 = mds.smacof( @@ -45,7 +39,6 @@ def test_nonmetric_lower_normalized_stress(): n_init=1, normalized_stress=True, metric=False, - eps=1e-6, ) assert stress1 > stress2 @@ -61,7 +54,6 @@ def test_nonmetric_mds_optimization(): mds_est = mds.MDS( n_components=2, n_init=1, - eps=1e-6, max_iter=2, metric=False, random_state=42, @@ -71,7 +63,6 @@ def test_nonmetric_mds_optimization(): mds_est = mds.MDS( n_components=2, n_init=1, - eps=1e-6, max_iter=3, metric=False, random_state=42, @@ -101,13 +92,13 @@ def test_smacof_error(): sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) with pytest.raises(ValueError): - mds.smacof(sim, eps=1e-6, n_init=1) + mds.smacof(sim, n_init=1) # Not squared similarity matrix: sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]]) with pytest.raises(ValueError): - mds.smacof(sim, eps=1e-6, n_init=1) + mds.smacof(sim, n_init=1) # init not None and not correct format: sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) From 753913c434b3df2fce7fb615de9b9d363747b960 Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Tue, 29 Apr 2025 10:53:14 +0200 Subject: [PATCH 33/33] Fix a bug introduced in code review --- sklearn/manifold/_mds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 438be684c5f65..6c31c72f7ef59 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -27,6 +27,7 @@ def _smacof_single( init=None, max_iter=300, verbose=0, + eps=1e-6, random_state=None, normalized_stress=False, ):