Skip to content

ENH Change the default n_init and eps for MDS #31117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 37 commits into from
Apr 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
ce26812
Change the init and the eps
dkobak Mar 31, 2025
2bfe661
Fix typo
dkobak Mar 31, 2025
f900c29
Add what's new
dkobak Mar 31, 2025
9aa9af7
rename what's new file
dkobak Mar 31, 2025
dfb5b43
Fix the example
dkobak Apr 1, 2025
45dd51c
Add deprecation cycle
dkobak Apr 1, 2025
153febe
Fix example
dkobak Apr 1, 2025
80862ab
Remove future warnings from examples
dkobak Apr 2, 2025
95c99bb
Fix docstring params
dkobak Apr 2, 2025
f56d589
Fix docstrings
dkobak Apr 2, 2025
27bf6cc
Fix warning test
dkobak Apr 2, 2025
bd51858
Avoid future warning in docstring params test
dkobak Apr 2, 2025
f394ab2
Adjust what's new description
dkobak Apr 2, 2025
d5346bc
Format the MDS example as a notebook
dkobak Apr 2, 2025
cae3bc0
Add some TODO comments
dkobak Apr 22, 2025
a4f4d81
Merge branch 'main' into mds-default-params
dkobak Apr 24, 2025
53a9698
Update sklearn/manifold/_mds.py
dkobak Apr 25, 2025
8484a99
Update sklearn/manifold/_mds.py
dkobak Apr 25, 2025
9a5b287
Update sklearn/manifold/_mds.py
dkobak Apr 25, 2025
8253531
Update sklearn/manifold/_mds.py
dkobak Apr 25, 2025
5af02c3
Update sklearn/manifold/_mds.py
dkobak Apr 25, 2025
3dae414
Change eps default without future warning
dkobak Apr 25, 2025
fe23faa
Fix changelog
dkobak Apr 25, 2025
9f2ef99
Merge branch 'main' into mds-default-params
dkobak Apr 25, 2025
20b7c10
Update sklearn/manifold/_mds.py
dkobak Apr 28, 2025
522800e
Update sklearn/manifold/_mds.py
dkobak Apr 28, 2025
186314d
Merge branch 'main' into mds-default-params
dkobak Apr 28, 2025
11975bb
Update sklearn/manifold/_mds.py
dkobak Apr 29, 2025
6e8e4ec
Update examples/manifold/plot_compare_methods.py
dkobak Apr 29, 2025
ad5e516
Update examples/manifold/plot_manifold_sphere.py
dkobak Apr 29, 2025
0a9720d
Update examples/manifold/plot_manifold_sphere.py
dkobak Apr 29, 2025
97a091c
Apply suggestions from code review
dkobak Apr 29, 2025
b9d6ee3
Apply suggestions from code review
dkobak Apr 29, 2025
7fbdeec
Apply suggestions from code review
dkobak Apr 29, 2025
6c49439
Apply suggestions from code review
dkobak Apr 29, 2025
878c6e7
Merge branch 'main' into mds-default-params
dkobak Apr 29, 2025
753913c
Fix a bug introduced in code review
dkobak Apr 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:class:`manifold.MDS` will switch to use `n_init=1` by default,
starting from version 1.9.
By :user:`Dmitry Kobak <dkobak>`
5 changes: 5 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.manifold/31117.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
:class:`manifold.MDS` now uses `eps=1e-6` by default and the convergence
criterion was adjusted to make sense for both metric and non-metric MDS
and to follow the reference R implementation. The formula for normalized
stress was adjusted to follow the original definition by Kruskal.
By :user:`Dmitry Kobak <dkobak>`
2 changes: 1 addition & 1 deletion examples/manifold/plot_compare_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def add_2d_scatter(ax, points, points_color, title=None):
md_scaling = manifold.MDS(
n_components=n_components,
max_iter=50,
n_init=4,
n_init=1,
random_state=0,
normalized_stress=False,
)
Expand Down
2 changes: 1 addition & 1 deletion examples/manifold/plot_lle_digits.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def plot_embedding(X, title):
"LTSA LLE embedding": LocallyLinearEmbedding(
n_neighbors=n_neighbors, n_components=2, method="ltsa"
),
"MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, n_jobs=2),
"MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, eps=1e-6),
"Random Trees embedding": make_pipeline(
RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
TruncatedSVD(n_components=2),
Expand Down
31 changes: 26 additions & 5 deletions examples/manifold/plot_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@

An illustration of the metric and non-metric MDS on generated noisy data.

The reconstructed points using the metric MDS and non metric MDS are slightly
shifted to avoid overlapping.

"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# Dataset preparation
# -------------------
#
# We start by uniformly generating 20 points in a 2D space.

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
Expand All @@ -31,6 +34,11 @@
# Center the data
X_true -= X_true.mean()

# %%
# Now we compute pairwise distances between all points and add
# a small amount of noise to the distance matrix. We make sure
# to keep the noisy distance matrix symmetric.

# Compute pairwise Euclidean distances
distances = euclidean_distances(X_true)

Expand All @@ -40,10 +48,14 @@
np.fill_diagonal(noise, 0)
distances += noise

# %%
# Here we compute metric and non-metric MDS of the noisy distance matrix.

mds = manifold.MDS(
n_components=2,
max_iter=3000,
eps=1e-9,
n_init=1,
random_state=42,
dissimilarity="precomputed",
n_jobs=1,
Expand All @@ -62,10 +74,16 @@
)
X_nmds = nmds.fit_transform(distances)

# Rescale the data
X_mds *= np.sqrt((X_true**2).sum()) / np.sqrt((X_mds**2).sum())
# %%
# Rescaling the non-metric MDS solution to match the spread of the original data.

X_nmds *= np.sqrt((X_true**2).sum()) / np.sqrt((X_nmds**2).sum())

# %%
# To make the visual comparisons easier, we rotate the original data and both MDS
# solutions to their PCA axes. We also flip the horizontal and vertical MDS axes,
# if needed, to match the original data orientation.

# Rotate the data
pca = PCA(n_components=2)
X_true = pca.fit_transform(X_true)
Expand All @@ -79,6 +97,9 @@
if np.corrcoef(X_nmds[:, i], X_true[:, i])[0, 1] < 0:
X_nmds[:, i] *= -1

# %%
# Finally, we plot the original data and both MDS reconstructions.

fig = plt.figure(1)
ax = plt.axes([0.0, 0.0, 1.0, 1.0])

Expand Down
116 changes: 76 additions & 40 deletions sklearn/manifold/_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _smacof_single(
init=None,
max_iter=300,
verbose=0,
eps=1e-3,
eps=1e-6,
random_state=None,
normalized_stress=False,
):
Expand Down Expand Up @@ -59,18 +59,21 @@ def _smacof_single(
verbose : int, default=0
Level of verbosity.

eps : float, default=1e-3
Relative tolerance with respect to stress at which to declare
convergence. The value of `eps` should be tuned separately depending
on whether or not `normalized_stress` is being used.
eps : float, default=1e-6
The tolerance with respect to stress (normalized by the sum of squared
embedding distances) at which to declare convergence.

.. versionchanged:: 1.7
The default value for `eps` has changed from 1e-3 to 1e-6, as a result
of a bugfix in the computation of the convergence criterion.

random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.

normalized_stress : bool, default=False
Whether use and return normalized stress value (Stress-1) instead of raw
Whether to return normalized stress value (Stress-1) instead of raw
stress.

.. versionadded:: 1.2
Expand Down Expand Up @@ -168,29 +171,32 @@ def _smacof_single(
# Compute stress
distances = euclidean_distances(X)
stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2
if normalized_stress:
stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2))

normalization = np.sqrt((X**2).sum(axis=1)).sum()
if verbose >= 2: # pragma: no cover
print(f"Iteration {it}, stress {stress:.4f}")
if old_stress is not None:
if (old_stress - stress / normalization) < eps:
sum_squared_distances = (distances.ravel() ** 2).sum()
if ((old_stress - stress) / (sum_squared_distances / 2)) < eps:
if verbose: # pragma: no cover
print("Convergence criterion reached.")
break
old_stress = stress / normalization
old_stress = stress

if normalized_stress:
sum_squared_distances = (distances.ravel() ** 2).sum()
stress = np.sqrt(stress / (sum_squared_distances / 2))

return X, stress, it + 1


# TODO(1.9): change default `n_init` to 1, see PR #31117
@validate_params(
{
"dissimilarities": ["array-like"],
"metric": ["boolean"],
"n_components": [Interval(Integral, 1, None, closed="left")],
"init": ["array-like", None],
"n_init": [Interval(Integral, 1, None, closed="left")],
"n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})],
"n_jobs": [Integral, None],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"verbose": ["verbose"],
Expand All @@ -207,11 +213,11 @@ def smacof(
metric=True,
n_components=2,
init=None,
n_init=8,
n_init="warn",
n_jobs=None,
max_iter=300,
verbose=0,
eps=1e-3,
eps=1e-6,
random_state=None,
return_n_iter=False,
normalized_stress="auto",
Expand Down Expand Up @@ -262,6 +268,9 @@ def smacof(
determined by the run with the smallest final stress. If ``init`` is
provided, this option is overridden and a single run is performed.

.. versionchanged:: 1.9
The default value for `n_init` will change from 8 to 1 in version 1.9.

n_jobs : int, default=None
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
Expand All @@ -277,10 +286,13 @@ def smacof(
verbose : int, default=0
Level of verbosity.

eps : float, default=1e-3
Relative tolerance with respect to stress at which to declare
convergence. The value of `eps` should be tuned separately depending
on whether or not `normalized_stress` is being used.
eps : float, default=1e-6
The tolerance with respect to stress (normalized by the sum of squared
embedding distances) at which to declare convergence.

.. versionchanged:: 1.7
The default value for `eps` has changed from 1e-3 to 1e-6, as a result
of a bugfix in the computation of the convergence criterion.

random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Expand All @@ -290,7 +302,7 @@ def smacof(
return_n_iter : bool, default=False
Whether or not to return the number of iterations.

normalized_stress : bool or "auto" default="auto"
normalized_stress : bool or "auto", default="auto"
Whether to return normalized stress value (Stress-1) instead of raw
stress. By default, metric MDS returns raw stress while non-metric MDS
returns normalized stress.
Expand Down Expand Up @@ -335,17 +347,24 @@ def smacof(
>>> import numpy as np
>>> from sklearn.manifold import smacof
>>> from sklearn.metrics import euclidean_distances
>>> X = np.array([[0, 1, 2], [1, 0, 3],[2, 3, 0]])
>>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
>>> dissimilarities = euclidean_distances(X)
>>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42)
>>> np.round(mds_result, 5)
array([[ 0.05352, -1.07253],
[ 1.74231, -0.75675],
[-1.79583, 1.82928]])
>>> np.round(stress, 5).item()
0.00128
>>> Z, stress = smacof(
... dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42
... )
>>> Z.shape
(3, 2)
>>> np.round(stress, 6).item()
3.2e-05
"""

if n_init == "warn":
warnings.warn(
"The default value of `n_init` will change from 8 to 1 in 1.9.",
FutureWarning,
)
n_init = 8

dissimilarities = check_array(dissimilarities)
random_state = check_random_state(random_state)

Expand Down Expand Up @@ -408,6 +427,7 @@ def smacof(
return best_pos, best_stress


# TODO(1.9): change default `n_init` to 1, see PR #31117
class MDS(BaseEstimator):
"""Multidimensional scaling.

Expand All @@ -428,16 +448,22 @@ class MDS(BaseEstimator):
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress.

.. versionchanged:: 1.9
The default value for `n_init` will change from 4 to 1 in version 1.9.

max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.

verbose : int, default=0
Level of verbosity.

eps : float, default=1e-3
Relative tolerance with respect to stress at which to declare
convergence. The value of `eps` should be tuned separately depending
on whether or not `normalized_stress` is being used.
eps : float, default=1e-6
The tolerance with respect to stress (normalized by the sum of squared
embedding distances) at which to declare convergence.

.. versionchanged:: 1.7
The default value for `eps` has changed from 1e-3 to 1e-6, as a result
of a bugfix in the computation of the convergence criterion.

n_jobs : int, default=None
The number of jobs to use for the computation. If multiple
Expand All @@ -464,9 +490,9 @@ class MDS(BaseEstimator):
``fit_transform``.

normalized_stress : bool or "auto", default="auto"
Whether use and return normalized stress value (Stress-1) instead of raw
stress. By default, metric MDS uses raw stress while non-metric MDS uses
normalized stress.
Whether to return normalized stress value (Stress-1) instead of raw
stress. By default, metric MDS returns raw stress while non-metric MDS
returns normalized stress.

.. versionadded:: 1.2

Expand Down Expand Up @@ -539,7 +565,7 @@ class MDS(BaseEstimator):
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = MDS(n_components=2, normalized_stress='auto')
>>> embedding = MDS(n_components=2, n_init=1)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
Expand All @@ -554,7 +580,7 @@ class MDS(BaseEstimator):
_parameter_constraints: dict = {
"n_components": [Interval(Integral, 1, None, closed="left")],
"metric": ["boolean"],
"n_init": [Interval(Integral, 1, None, closed="left")],
"n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"verbose": ["verbose"],
"eps": [Interval(Real, 0.0, None, closed="left")],
Expand All @@ -569,10 +595,10 @@ def __init__(
n_components=2,
*,
metric=True,
n_init=4,
n_init="warn",
max_iter=300,
verbose=0,
eps=1e-3,
eps=1e-6,
n_jobs=None,
random_state=None,
dissimilarity="euclidean",
Expand Down Expand Up @@ -646,10 +672,20 @@ def fit_transform(self, X, y=None, init=None):
X_new : ndarray of shape (n_samples, n_components)
X transformed in the new space.
"""

if self.n_init == "warn":
warnings.warn(
"The default value of `n_init` will change from 4 to 1 in 1.9.",
FutureWarning,
)
self._n_init = 4
else:
self._n_init = self.n_init

X = validate_data(self, X)
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
warnings.warn(
"The MDS API has changed. ``fit`` now constructs an"
"The MDS API has changed. ``fit`` now constructs a"
" dissimilarity matrix from data. To use a custom "
"dissimilarity matrix, set "
"``dissimilarity='precomputed'``."
Expand All @@ -665,7 +701,7 @@ def fit_transform(self, X, y=None, init=None):
metric=self.metric,
n_components=self.n_components,
init=init,
n_init=self.n_init,
n_init=self._n_init,
n_jobs=self.n_jobs,
max_iter=self.max_iter,
verbose=self.verbose,
Expand Down
Loading