FIX array_api support for non-integer n_components in PCA #27431

Merged · 12 commits · Nov 15, 2023
2 changes: 1 addition & 1 deletion doc/whats_new/v1.4.rst
@@ -276,7 +276,7 @@ Changelog
- |Enhancement| :class:`decomposition.PCA` now supports the Array API for the
  `full` and `randomized` solvers (with QR power iterations). See
  :ref:`array_api` for more details.
-  :pr:`26315` and :pr:`27098` by :user:`Mateusz Sokół <mtsokol>`,
+  :pr:`26315`, :pr:`27098` and :pr:`27431` by :user:`Mateusz Sokół <mtsokol>`,
  :user:`Olivier Grisel <ogrisel>` and :user:`Edoardo Abati <EdAbati>`.

- |Feature| :class:`decomposition.PCA` now supports :class:`scipy.sparse.sparray`
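As context for the changelog entry above, a minimal sketch of what the Array API support enables (assuming PyTorch and array-api-compat are installed; the data values are illustrative):

import sklearn
import torch
from sklearn.decomposition import PCA

# With Array API dispatch enabled, PCA accepts and returns torch tensors
# instead of converting everything to NumPy.
sklearn.set_config(array_api_dispatch=True)

X = torch.rand(100, 10)
pca = PCA(n_components=2, svd_solver="full").fit(X)
print(type(pca.components_))  # <class 'torch.Tensor'>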
41 changes: 29 additions & 12 deletions sklearn/decomposition/_pca.py
@@ -22,7 +22,7 @@
from ..base import _fit_context
from ..utils import check_random_state
from ..utils._arpack import _init_arpack_v0
-from ..utils._array_api import get_namespace
+from ..utils._array_api import _convert_to_numpy, get_namespace
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils.deprecation import deprecated
from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip
@@ -60,6 +60,7 @@ def _assess_dimension(spectrum, rank, n_samples):
    Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604
    <https://proceedings.neurips.cc/paper/2000/file/7503cfacd12053d309b6bed5c89de212-Paper.pdf>`_
    """
+    xp, _ = get_namespace(spectrum)

    n_features = spectrum.shape[0]
    if not 1 <= rank < n_features:
@@ -73,29 +74,29 @@ def _assess_dimension(spectrum, rank, n_samples):
        # small and won't be the max anyway. Also, it can lead to numerical
        # issues below when computing pa, in particular in log(spectrum[i] -
        # spectrum[j]) because this will take the log of something very small.
-        return -np.inf
+        return -xp.inf

    pu = -rank * log(2.0)
    for i in range(1, rank + 1):
        pu += (
            gammaln((n_features - i + 1) / 2.0)
-            - log(np.pi) * (n_features - i + 1) / 2.0
+            - log(xp.pi) * (n_features - i + 1) / 2.0
        )

-    pl = np.sum(np.log(spectrum[:rank]))
+    pl = xp.sum(xp.log(spectrum[:rank]))
    pl = -pl * n_samples / 2.0

-    v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank))
-    pv = -np.log(v) * n_samples * (n_features - rank) / 2.0
+    v = max(eps, xp.sum(spectrum[rank:]) / (n_features - rank))
+    pv = -log(v) * n_samples * (n_features - rank) / 2.0

    m = n_features * rank - rank * (rank + 1.0) / 2.0
-    pp = log(2.0 * np.pi) * (m + rank) / 2.0
+    pp = log(2.0 * xp.pi) * (m + rank) / 2.0

    pa = 0.0
-    spectrum_ = spectrum.copy()
+    spectrum_ = xp.asarray(spectrum, copy=True)
    spectrum_[rank:n_features] = v
    for i in range(rank):
-        for j in range(i + 1, len(spectrum)):
+        for j in range(i + 1, spectrum.shape[0]):
            pa += log(
                (spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i])
            ) + log(n_samples)
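The hunk above replaces NumPy-specific calls (np.sum, np.log, .copy(), len(...)) with their xp namespace equivalents so that _assess_dimension also works on Array API inputs. A minimal sketch of the pattern, using a NumPy input for illustration:

import numpy as np
from sklearn.utils._array_api import get_namespace

spectrum = np.array([3.0, 1.5, 0.2])
# For a NumPy input (without dispatch enabled), get_namespace returns a
# NumPy-compatible wrapper, so the xp.* calls behave exactly like np.*.
xp, is_array_api_compliant = get_namespace(spectrum)
pl = xp.sum(xp.log(spectrum[:2]))  # same result as np.sum(np.log(...))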
@@ -116,7 +117,7 @@ def _infer_dimension(spectrum, n_samples):
    ll[0] = -xp.inf  # we don't want to return n_components = 0
    for rank in range(1, spectrum.shape[0]):
        ll[rank] = _assess_dimension(spectrum, rank, n_samples)
-    return ll.argmax()
+    return xp.argmax(ll)


class PCA(_BasePCA):
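The functional xp.argmax(ll) replaces the ll.argmax() method call, since Array API arrays are not guaranteed to expose an argmax method. A small NumPy illustration with made-up log-likelihoods:

import numpy as np

ll = np.array([-np.inf, -10.3, -9.8, -12.1])
# The functional form and the NumPy method agree: rank 2 has the
# highest log-likelihood.
assert np.argmax(ll) == ll.argmax() == 2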
@@ -578,8 +579,24 @@ def _fit_full(self, X, n_components):
            # side='right' ensures that the number of components selected
            # always explains a cumulative variance strictly greater than the
            # n_components float passed. More discussion in issue: #15669
-            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
-            n_components = xp.searchsorted(ratio_cumsum, n_components, side="right") + 1
+            if is_array_api_compliant:
+                # Convert to numpy as xp.cumsum and xp.searchsorted are not
+                # part of the Array API standard yet:
+                #
+                # https://github.com/data-apis/array-api/issues/597
+                # https://github.com/data-apis/array-api/issues/688
+                #
+                # Furthermore, it's not always safe to call them for
+                # namespaces that already implement them: for instance,
+                # cupy.searchsorted does not accept a float as its second
+                # argument.
+                explained_variance_ratio_np = _convert_to_numpy(
+                    explained_variance_ratio_, xp=xp
+                )
+            else:
+                explained_variance_ratio_np = explained_variance_ratio_
+            ratio_cumsum = stable_cumsum(explained_variance_ratio_np)
+            n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
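The searchsorted call above is what makes a float n_components work: it picks the smallest number of leading components whose cumulative explained variance ratio strictly exceeds the requested fraction. A worked example with made-up ratios:

import numpy as np

explained_variance_ratio = np.array([0.6, 0.25, 0.1, 0.05])
ratio_cumsum = np.cumsum(explained_variance_ratio)  # [0.6, 0.85, 0.95, 1.0]

# side="right" means that if the requested fraction equals a cumulative
# value exactly, one extra component is taken, so the selected components
# always explain strictly more variance than requested.
n_components = int(np.searchsorted(ratio_cumsum, 0.8, side="right")) + 1
print(n_components)  # 2: the first two components explain 0.85 > 0.8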
25 changes: 24 additions & 1 deletion sklearn/decomposition/tests/test_pca.py
@@ -19,6 +19,7 @@
from sklearn.utils._testing import _array_api_for_tests, assert_allclose
from sklearn.utils.estimator_checks import (
    _get_check_estimator_ids,
+    check_array_api_input,
    check_array_api_input_and_values,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
@@ -859,7 +860,7 @@ def check_array_api_get_precision(name, estimator, array_namespace, device, dtype):
"estimator",
[
PCA(n_components=2, svd_solver="full"),
PCA(n_components=2, svd_solver="full", whiten=True),
PCA(n_components=0.1, svd_solver="full", whiten=True),
PCA(
n_components=2,
svd_solver="randomized",
@@ -874,6 +875,28 @@ def test_pca_array_api_compliance(estimator, check, array_namespace, device, dtype):
    check(name, estimator, array_namespace, device=device, dtype=dtype)


+@pytest.mark.parametrize(
+    "array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
+)
+@pytest.mark.parametrize(
+    "check",
+    [check_array_api_input, check_array_api_get_precision],
+    ids=_get_check_estimator_ids,
+)
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        # PCA with mle cannot use check_array_api_input_and_values because of
+        # rounding errors in the noisy (low variance) components.
+        PCA(n_components="mle", svd_solver="full"),
+    ],
+    ids=_get_check_estimator_ids,
+)
+def test_pca_mle_array_api_compliance(estimator, check, array_namespace, device, dtype):
+    name = estimator.__class__.__name__
+    check(name, estimator, array_namespace, device=device, dtype=dtype)


def test_array_api_error_and_warnings_on_unsupported_params():
    pytest.importorskip("array_api_compat")
    xp = pytest.importorskip("numpy.array_api")
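In plain usage terms, the new test cases cover the two non-integer n_components modes (random data for illustration; the exact component counts depend on the data):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).normal(size=(100, 10))

# A float in (0, 1) requests the smallest number of components that
# explains more than that fraction of the total variance.
pca_frac = PCA(n_components=0.9, svd_solver="full").fit(X)

# "mle" infers the dimensionality with Minka's maximum likelihood method.
pca_mle = PCA(n_components="mle", svd_solver="full").fit(X)

print(pca_frac.n_components_, pca_mle.n_components_)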
12 changes: 4 additions & 8 deletions sklearn/utils/extmath.py
@@ -1211,14 +1211,10 @@ def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    out : ndarray
        Array with the cumulative sums along the chosen axis.
    """
-    xp, _ = get_namespace(arr)
-
-    out = xp.cumsum(arr, axis=axis, dtype=np.float64)
-    expected = xp.sum(arr, axis=axis, dtype=np.float64)
-    if not xp.all(
-        xp.isclose(
-            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
-        )
+    out = np.cumsum(arr, axis=axis, dtype=np.float64)
+    expected = np.sum(arr, axis=axis, dtype=np.float64)
+    if not np.allclose(
+        out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
    ):
        warnings.warn(
            (
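Since its only Array API caller now converts to NumPy first, stable_cumsum goes back to plain NumPy. For reference, a sketch of the consistency check it performs, on a contrived float32 input where accumulation error would matter:

import numpy as np

arr = np.ones(1_000_000, dtype=np.float32) * np.float32(1e-4)
# Accumulate in float64 and compare the last cumulative value against an
# independently computed total; a large mismatch would trigger a warning.
out = np.cumsum(arr, axis=None, dtype=np.float64)
expected = np.sum(arr, axis=None, dtype=np.float64)
print(np.allclose(out.take(-1, axis=None), expected))  # True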