diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 4266e028d7a60..9898452469278 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -67,23 +67,23 @@ jobs:
     SKLEARN_SKIP_NETWORK_TESTS: '0'
 
 # Will run all the time regardless of linting outcome.
-- template: build_tools/azure/posix.yml
-  parameters:
-    name: Linux_Runs
-    vmImage: ubuntu-18.04
-    matrix:
-      pylatest_conda_mkl:
-        DISTRIB: 'conda'
-        PYTHON_VERSION: '*'
-        BLAS: 'mkl'
-        NUMPY_VERSION: '*'
-        SCIPY_VERSION: '*'
-        CYTHON_VERSION: '*'
-        PILLOW_VERSION: '*'
-        PYTEST_VERSION: '*'
-        JOBLIB_VERSION: '*'
-        THREADPOOLCTL_VERSION: '2.0.0'
-        COVERAGE: 'true'
+# - template: build_tools/azure/posix.yml
+#   parameters:
+#     name: Linux_Runs
+#     vmImage: ubuntu-18.04
+#     matrix:
+#       pylatest_conda_mkl:
+#         DISTRIB: 'conda'
+#         PYTHON_VERSION: '*'
+#         BLAS: 'mkl'
+#         NUMPY_VERSION: '*'
+#         SCIPY_VERSION: '*'
+#         CYTHON_VERSION: '*'
+#         PILLOW_VERSION: '*'
+#         PYTEST_VERSION: '*'
+#         JOBLIB_VERSION: '*'
+#         THREADPOOLCTL_VERSION: '2.0.0'
+#         COVERAGE: 'true'
 
 - template: build_tools/azure/posix.yml
   parameters:
@@ -95,31 +95,31 @@ jobs:
       # Linux environment to test that scikit-learn can be built against
       # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04
       # i.e. numpy 1.13.3 and scipy 0.19
-      py36_ubuntu_atlas:
-        DISTRIB: 'ubuntu'
-        PYTHON_VERSION: '3.6'
-        JOBLIB_VERSION: '0.11'
-        PYTEST_XDIST: 'false'
-        THREADPOOLCTL_VERSION: '2.0.0'
-      # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB
-      py36_conda_openblas:
-        DISTRIB: 'conda'
-        PYTHON_VERSION: '3.6'
-        BLAS: 'openblas'
-        NUMPY_VERSION: '1.13.3'
-        SCIPY_VERSION: '0.19.1'
-        PANDAS_VERSION: '*'
-        CYTHON_VERSION: '*'
-        # temporary pin pytest due to unknown failure with pytest 5.3
-        PYTEST_VERSION: '5.2'
-        PILLOW_VERSION: '4.2.1'
-        MATPLOTLIB_VERSION: '2.1.1'
-        SCIKIT_IMAGE_VERSION: '*'
-        # latest version of joblib available in conda for Python 3.6
-        JOBLIB_VERSION: '0.13.2'
-        THREADPOOLCTL_VERSION: '2.0.0'
-        PYTEST_XDIST: 'false'
-        COVERAGE: 'true'
+      # py36_ubuntu_atlas:
+      #   DISTRIB: 'ubuntu'
+      #   PYTHON_VERSION: '3.6'
+      #   JOBLIB_VERSION: '0.11'
+      #   PYTEST_XDIST: 'false'
+      #   THREADPOOLCTL_VERSION: '2.0.0'
+      # # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB
+      # py36_conda_openblas:
+      #   DISTRIB: 'conda'
+      #   PYTHON_VERSION: '3.6'
+      #   BLAS: 'openblas'
+      #   NUMPY_VERSION: '1.13.3'
+      #   SCIPY_VERSION: '0.19.1'
+      #   PANDAS_VERSION: '*'
+      #   CYTHON_VERSION: '*'
+      #   # temporary pin pytest due to unknown failure with pytest 5.3
+      #   PYTEST_VERSION: '5.2'
+      #   PILLOW_VERSION: '4.2.1'
+      #   MATPLOTLIB_VERSION: '2.1.1'
+      #   SCIKIT_IMAGE_VERSION: '*'
+      #   # latest version of joblib available in conda for Python 3.6
+      #   JOBLIB_VERSION: '0.13.2'
+      #   THREADPOOLCTL_VERSION: '2.0.0'
+      #   PYTEST_XDIST: 'false'
+      #   COVERAGE: 'true'
       # Linux environment to test the latest available dependencies and MKL.
       # It runs tests requiring lightgbm, pandas and PyAMG.
       pylatest_pip_openblas_pandas:
@@ -131,66 +131,66 @@ jobs:
         TEST_DOCSTRINGS: 'true'
         CHECK_WARNINGS: 'true'
 
-- template: build_tools/azure/posix-32.yml
-  parameters:
-    name: Linux32
-    vmImage: ubuntu-18.04
-    dependsOn: [linting]
-    condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting'))
-    matrix:
-      py36_ubuntu_atlas_32bit:
-        DISTRIB: 'ubuntu-32'
-        PYTHON_VERSION: '3.6'
-        JOBLIB_VERSION: '0.13'
-        THREADPOOLCTL_VERSION: '2.0.0'
+# - template: build_tools/azure/posix-32.yml
+#   parameters:
+#     name: Linux32
+#     vmImage: ubuntu-18.04
+#     dependsOn: [linting]
+#     condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting'))
+#     matrix:
+#       py36_ubuntu_atlas_32bit:
+#         DISTRIB: 'ubuntu-32'
+#         PYTHON_VERSION: '3.6'
+#         JOBLIB_VERSION: '0.13'
+#         THREADPOOLCTL_VERSION: '2.0.0'
 
-- template: build_tools/azure/posix.yml
-  parameters:
-    name: macOS
-    vmImage: macOS-10.14
-    dependsOn: [linting]
-    condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting'))
-    matrix:
-      pylatest_conda_mkl:
-        DISTRIB: 'conda'
-        PYTHON_VERSION: '*'
-        BLAS: 'mkl'
-        NUMPY_VERSION: '*'
-        SCIPY_VERSION: '*'
-        CYTHON_VERSION: '*'
-        PILLOW_VERSION: '*'
-        PYTEST_VERSION: '*'
-        JOBLIB_VERSION: '*'
-        THREADPOOLCTL_VERSION: '2.0.0'
-        COVERAGE: 'true'
-      pylatest_conda_mkl_no_openmp:
-        DISTRIB: 'conda'
-        PYTHON_VERSION: '*'
-        BLAS: 'mkl'
-        NUMPY_VERSION: '*'
-        SCIPY_VERSION: '*'
-        CYTHON_VERSION: '*'
-        PILLOW_VERSION: '*'
-        PYTEST_VERSION: '*'
-        JOBLIB_VERSION: '*'
-        THREADPOOLCTL_VERSION: '2.0.0'
-        COVERAGE: 'true'
-        SKLEARN_TEST_NO_OPENMP: 'true'
-        SKLEARN_SKIP_OPENMP_TEST: 'true'
+# - template: build_tools/azure/posix.yml
+#   parameters:
+#     name: macOS
+#     vmImage: macOS-10.14
+#     dependsOn: [linting]
+#     condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting'))
+#     matrix:
+#       pylatest_conda_mkl:
+#         DISTRIB: 'conda'
+#         PYTHON_VERSION: '*'
+#         BLAS: 'mkl'
+#         NUMPY_VERSION: '*'
+#         SCIPY_VERSION: '*'
+#         CYTHON_VERSION: '*'
+#         PILLOW_VERSION: '*'
+#         PYTEST_VERSION: '*'
+#         JOBLIB_VERSION: '*'
+#         THREADPOOLCTL_VERSION: '2.0.0'
+#         COVERAGE: 'true'
+#       pylatest_conda_mkl_no_openmp:
+#         DISTRIB: 'conda'
+#         PYTHON_VERSION: '*'
+#         BLAS: 'mkl'
+#         NUMPY_VERSION: '*'
+#         SCIPY_VERSION: '*'
+#         CYTHON_VERSION: '*'
+#         PILLOW_VERSION: '*'
+#         PYTEST_VERSION: '*'
+#         JOBLIB_VERSION: '*'
+#         THREADPOOLCTL_VERSION: '2.0.0'
+#         COVERAGE: 'true'
+#         SKLEARN_TEST_NO_OPENMP: 'true'
+#         SKLEARN_SKIP_OPENMP_TEST: 'true'
 
-- template: build_tools/azure/windows.yml
-  parameters:
-    name: Windows
-    vmImage: vs2017-win2016
-    dependsOn: [linting]
-    condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting'))
-    matrix:
-      py37_conda_mkl:
-        PYTHON_VERSION: '3.7'
-        CHECK_WARNINGS: 'true'
-        PYTHON_ARCH: '64'
-        PYTEST_VERSION: '*'
-        COVERAGE: 'true'
-      py36_pip_openblas_32bit:
-        PYTHON_VERSION: '3.6'
-        PYTHON_ARCH: '32'
+# - template: build_tools/azure/windows.yml
+#   parameters:
+#     name: Windows
+#     vmImage: vs2017-win2016
+#     dependsOn: [linting]
+#     condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting'))
+#     matrix:
+#       py37_conda_mkl:
+#         PYTHON_VERSION: '3.7'
+#         CHECK_WARNINGS: 'true'
+#         PYTHON_ARCH: '64'
+#         PYTEST_VERSION: '*'
+#         COVERAGE: 'true'
+#       py36_pip_openblas_32bit:
+#         PYTHON_VERSION: '3.6'
+#         PYTHON_ARCH: '32'
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index faddf6f360926..f0472d8586045 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -94,7 +94,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     python -m pip install -U pip
    python -m pip install pytest==$PYTEST_VERSION pytest-cov
-    python -m pip install pandas matplotlib pyamg scikit-image
+    python -m pip install pandas matplotlib pyamg scikit-image jax jaxlib
     # do not install dependencies for lightgbm since it requires scikit-learn
     python -m pip install lightgbm --no-deps
 elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then
diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh
index c44b0e2d34196..4758fb605382f 100755
--- a/build_tools/azure/test_script.sh
+++ b/build_tools/azure/test_script.sh
@@ -42,5 +42,5 @@ cp setup.cfg $TEST_DIR
 cd $TEST_DIR
 
 set -x
-$TEST_CMD --pyargs sklearn
+$TEST_CMD --pyargs sklearn.decomposition
 set +x
diff --git a/sklearn/_config.py b/sklearn/_config.py
index e333095e817d1..ebe56692f6608 100644
--- a/sklearn/_config.py
+++ b/sklearn/_config.py
@@ -8,6 +8,7 @@
     'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
     'print_changed_only': True,
     'display': 'text',
+    'enable_duck_array': False,
 }
 
 
@@ -28,7 +29,8 @@ def get_config():
 
 
 def set_config(assume_finite=None, working_memory=None,
-               print_changed_only=None, display=None):
+               print_changed_only=None, display=None,
+               enable_duck_array=None):
     """Set global scikit-learn configuration
 
     .. versionadded:: 0.19
@@ -80,6 +82,8 @@ def set_config(assume_finite=None, working_memory=None,
         _global_config['print_changed_only'] = print_changed_only
     if display is not None:
         _global_config['display'] = display
+    if enable_duck_array is not None:
+        _global_config['enable_duck_array'] = enable_duck_array
 
 
 @contextmanager
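The new `enable_duck_array` option uses the same plumbing as the existing
global options, so it can be toggled globally via `set_config` or locally
via `config_context` (which forwards its keyword arguments to `set_config`
and restores the previous value on exit). A minimal sketch of both styles,
assuming this branch is installed; `enable_duck_array` is experimental and
not part of any scikit-learn release:

    import sklearn

    # Global toggle: stays in effect until reset.
    sklearn.set_config(enable_duck_array=True)
    assert sklearn.get_config()['enable_duck_array'] is True
    sklearn.set_config(enable_duck_array=False)

    # Scoped toggle: the previous value is restored on exit.
    with sklearn.config_context(enable_duck_array=True):
        assert sklearn.get_config()['enable_duck_array'] is True
    assert sklearn.get_config()['enable_duck_array'] is False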
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index fa305d273e857..2aaa89627ef7d 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -14,7 +14,6 @@
 import numbers
 
 import numpy as np
-from scipy import linalg
 from scipy.special import gammaln
 from scipy.sparse import issparse
 from scipy.sparse.linalg import svds
@@ -26,6 +25,7 @@
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _deprecate_positional_args
+from ..utils import _get_array_module
 
 
 def _assess_dimension(spectrum, rank, n_samples):
@@ -396,6 +396,7 @@ def _fit(self, X):
         X = self._validate_data(X, dtype=[np.float64, np.float32],
                                 ensure_2d=True, copy=self.copy)
+        npx = _get_array_module(X)
 
         # Handle n_components==None
         if self.n_components is None:
@@ -420,14 +421,14 @@ def _fit(self, X):
 
         # Call different fits for either full or truncated SVD
         if self._fit_svd_solver == 'full':
-            return self._fit_full(X, n_components)
+            return self._fit_full(X, n_components, npx=npx)
         elif self._fit_svd_solver in ['arpack', 'randomized']:
             return self._fit_truncated(X, n_components, self._fit_svd_solver)
         else:
             raise ValueError("Unrecognized svd_solver='{0}'"
                              "".format(self._fit_svd_solver))
 
-    def _fit_full(self, X, n_components):
+    def _fit_full(self, X, n_components, npx=np):
         """Fit the model by computing full SVD on X"""
         n_samples, n_features = X.shape
 
@@ -448,12 +449,12 @@ def _fit_full(self, X, n_components):
             % (n_components, type(n_components)))
 
         # Center data
-        self.mean_ = np.mean(X, axis=0)
+        self.mean_ = npx.mean(X, axis=0)
         X -= self.mean_
 
-        U, S, Vt = linalg.svd(X, full_matrices=False)
+        U, S, Vt = npx.linalg.svd(X, full_matrices=False)
         # flip eigenvectors' sign to enforce deterministic output
-        U, Vt = svd_flip(U, Vt)
+        U, Vt = svd_flip(U, Vt, npx=npx)
 
         components_ = Vt
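The changes to `_fit` and `_fit_full` above follow the NEP 37 idiom: resolve
a numpy-compatible module (`npx`) once from the input array, then route every
array operation through it. The same pattern in isolation, as a sketch under
the assumptions of this branch (`center_and_svd` is a hypothetical helper,
not part of this patch):

    import numpy as np

    from sklearn.utils import _get_array_module

    def center_and_svd(X):
        # npx resolves to numpy for np.ndarray and sparse inputs, and to
        # jax.numpy or cupy for their respective array types.
        npx = _get_array_module(X)
        # Out-of-place subtraction: duck arrays (e.g. jax) may be immutable.
        X = X - npx.mean(X, axis=0)
        # numpy.linalg.svd and jax.numpy.linalg.svd share this signature.
        return npx.linalg.svd(X, full_matrices=False)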
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 0123e169ce9c0..04ab7d04da690 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+import sklearn
 from sklearn.utils._testing import assert_allclose
 from sklearn import datasets
 
@@ -638,3 +639,30 @@ def test_assess_dimesion_rank_one():
     assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples))
     for rank in range(2, n_features):
         assert _assess_dimension(s, rank, n_samples) == -np.inf
+
+
+# XXX: it should be possible to support 'randomized' by adding npx=np
+# in appropriate locations. The 'arpack' svd_solver, on the other hand,
+# cannot easily be adapted to work on non-numpy allocated arrays.
+# @pytest.mark.parametrize('svd_solver', ["full", "randomized", "auto"])
+@pytest.mark.parametrize('svd_solver', ["full"])
+@pytest.mark.parametrize('copy', [True, False])
+def test_pca_jax_data(svd_solver, copy):
+    jnp = pytest.importorskip("jax.numpy")
+    X_np = np.random.RandomState(42).randn(1000, 100)
+    X_np = X_np.astype(np.float32)
+    X_jnp = jnp.asarray(X_np)
+
+    pca_np = PCA(n_components=3, svd_solver=svd_solver, copy=copy,
+                 random_state=0)
+    X_pca_np = pca_np.fit_transform(X_np)
+
+    with sklearn.config_context(enable_duck_array=True):
+        pca_jnp = PCA(**pca_np.get_params())
+        X_pca_jnp = pca_jnp.fit_transform(X_jnp)
+
+    assert isinstance(X_pca_jnp, type(X_jnp))
+    assert isinstance(pca_jnp.components_, type(X_jnp))
+
+    assert_allclose(X_pca_np, X_pca_jnp, atol=1e-3)
+    assert_allclose(pca_np.components_, pca_jnp.components_, atol=1e-3)
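To iterate on this test locally without going through the CI scripts, a
pytest invocation along these lines should work (jax and jaxlib must be
importable, otherwise `importorskip` skips the test):

    pytest --pyargs sklearn.decomposition.tests.test_pca -k jax_data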
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 2b92bd3d09467..836fe92489057 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -30,6 +30,7 @@
                      min_max_axis)
 from ..utils.validation import (check_is_fitted, check_random_state,
                                 FLOAT_DTYPES, _deprecate_positional_args)
+from ..utils import _get_array_module
 from ._csr_polynomial_expansion import _csr_polynomial_expansion
 
 
@@ -60,17 +61,17 @@
 ]
 
 
-def _handle_zeros_in_scale(scale, copy=True):
+def _handle_zeros_in_scale(scale, copy=True, npx=np):
     ''' Makes sure that whenever scale is zero, we handle it correctly.
 
    This happens in most scalers when we have constant features.'''
 
     # if we are fitting on 1D arrays, scale might be a scalar
-    if np.isscalar(scale):
+    if npx.isscalar(scale):
         if scale == .0:
             scale = 1.
         return scale
-    elif isinstance(scale, np.ndarray):
+    elif isinstance(scale, npx.ndarray):
         if copy:
             # New array to avoid side-effects
             scale = scale.copy()
@@ -387,20 +388,21 @@ def partial_fit(self, X, y=None):
         X = self._validate_data(X, reset=first_pass,
                                 estimator=self, dtype=FLOAT_DTYPES,
                                 force_all_finite="allow-nan")
+        npx = _get_array_module(X)
 
-        data_min = np.nanmin(X, axis=0)
-        data_max = np.nanmax(X, axis=0)
+        data_min = npx.nanmin(X, axis=0)
+        data_max = npx.nanmax(X, axis=0)
 
         if first_pass:
             self.n_samples_seen_ = X.shape[0]
         else:
-            data_min = np.minimum(self.data_min_, data_min)
-            data_max = np.maximum(self.data_max_, data_max)
+            data_min = npx.minimum(self.data_min_, data_min)
+            data_max = npx.maximum(self.data_max_, data_max)
             self.n_samples_seen_ += X.shape[0]
 
         data_range = data_max - data_min
         self.scale_ = ((feature_range[1] - feature_range[0]) /
-                       _handle_zeros_in_scale(data_range))
+                       _handle_zeros_in_scale(data_range, npx=npx))
         self.min_ = feature_range[0] - data_min * self.scale_
         self.data_min_ = data_min
         self.data_max_ = data_max
diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py
index eec349861258c..db72e4972efb6 100644
--- a/sklearn/tests/test_config.py
+++ b/sklearn/tests/test_config.py
@@ -5,7 +5,8 @@
 def test_config_context():
     assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                             'print_changed_only': True,
-                            'display': 'text'}
+                            'display': 'text',
+                            'enable_duck_array': False}
 
     # Not using as a context manager affects nothing
     config_context(assume_finite=True)
@@ -14,7 +15,8 @@ def test_config_context():
     with config_context(assume_finite=True):
         assert get_config() == {'assume_finite': True, 'working_memory': 1024,
                                 'print_changed_only': True,
-                                'display': 'text'}
+                                'display': 'text',
+                                'enable_duck_array': False}
     assert get_config()['assume_finite'] is False
 
     with config_context(assume_finite=True):
@@ -40,7 +42,8 @@ def test_config_context():
     assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                             'print_changed_only': True,
-                            'display': 'text'}
+                            'display': 'text',
+                            'enable_duck_array': False}
 
     # No positional arguments
     assert_raises(TypeError, config_context, True)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 4149765e7c9aa..46a98ed5be396 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -1185,3 +1185,20 @@ def is_abstract(c):
     # itemgetter is used to ensure the sort does not extend to the 2nd item of
     # the tuple
     return sorted(set(estimators), key=itemgetter(0))
+
+
+def _get_array_module(array):
+    if isinstance(array, np.ndarray):
+        return np
+    elif issparse(array):
+        return np
+
+    npx_str = array.__class__.__module__.split(".")[0]
+    if npx_str == "cupy":
+        import cupy
+        return cupy
+    elif npx_str == "jax":
+        import jax.numpy as npx
+        return npx
+    else:
+        raise ValueError(f"Unexpected array object from module: {npx_str}")
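A quick check of the dispatch in `_get_array_module`: numpy and scipy sparse
inputs deliberately map to plain numpy, while jax arrays resolve to
`jax.numpy`. The last case relies on the array class living under the
top-level `jax` package, which holds for the jax versions this branch was
developed against:

    import numpy as np
    import scipy.sparse as sp

    from sklearn.utils import _get_array_module

    assert _get_array_module(np.zeros(3)) is np
    assert _get_array_module(sp.csr_matrix((3, 3))) is np

    import jax.numpy as jnp
    assert _get_array_module(jnp.zeros(3)) is jnp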
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 6cf57479bc4a4..3fe5d64ddcffa 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -497,7 +497,7 @@ def cartesian(arrays, out=None):
     return out
 
 
-def svd_flip(u, v, u_based_decision=True):
+def svd_flip(u, v, u_based_decision=True, npx=np):
     """Sign correction to ensure deterministic output from SVD.
 
     Adjusts the columns of u and the rows of v such that the loadings in the
@@ -523,6 +523,11 @@ def svd_flip(u, v, u_based_decision=True):
         decision on is generally algorithm dependent.
 
+    npx : module
+        Module compatible with the numpy API to make it possible to use this
+        utility function with alternative array libraries by following NEP 37
+        idioms.
+
     Returns
     -------
     u_adjusted, v_adjusted : arrays with the same dimensions as the input.
 
@@ -530,14 +535,14 @@ def svd_flip(u, v, u_based_decision=True):
     """
     if u_based_decision:
         # columns of u, rows of v
-        max_abs_cols = np.argmax(np.abs(u), axis=0)
-        signs = np.sign(u[max_abs_cols, range(u.shape[1])])
+        max_abs_cols = npx.argmax(npx.abs(u), axis=0)
+        signs = npx.sign(u[max_abs_cols, list(range(u.shape[1]))])
         u *= signs
         v *= signs[:, np.newaxis]
     else:
         # rows of v, columns of u
-        max_abs_rows = np.argmax(np.abs(v), axis=1)
-        signs = np.sign(v[range(v.shape[0]), max_abs_rows])
+        max_abs_rows = npx.argmax(npx.abs(v), axis=1)
+        signs = npx.sign(v[list(range(v.shape[0])), max_abs_rows])
         u *= signs
         v *= signs[:, np.newaxis]
     return u, v
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 9a8c66fd2df13..95e683d30d412 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -484,6 +484,9 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
     array_converted : object
         The converted and validated array.
     """
+    if _get_config()['enable_duck_array']:
+        return array.copy() if copy else array
+
     # store reference to original array to check if copy is needed when
     # function returns
     array_orig = array
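End to end, the intent is that a duck-array input keeps its type through
`fit_transform`. A usage sketch under the assumptions of this patch (jax
installed, only `svd_solver='full'` supported; note that
`enable_duck_array=True` makes `check_array` skip validation entirely, so
dtype and finiteness checks become the caller's responsibility):

    import numpy as np
    import jax.numpy as jnp

    import sklearn
    from sklearn.decomposition import PCA

    X = jnp.asarray(
        np.random.RandomState(0).randn(100, 20).astype(np.float32))

    with sklearn.config_context(enable_duck_array=True):
        X_pca = PCA(n_components=2, svd_solver='full').fit_transform(X)

    print(type(X_pca))  # a jax array, not numpy.ndarray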