diff --git a/asv_benchmarks/benchmarks/datasets.py b/asv_benchmarks/benchmarks/datasets.py index bbf5029062448..bbc5a5d78dd41 100644 --- a/asv_benchmarks/benchmarks/datasets.py +++ b/asv_benchmarks/benchmarks/datasets.py @@ -1,7 +1,6 @@ from pathlib import Path import numpy as np -import scipy.sparse as sp from joblib import Memory from sklearn.datasets import ( @@ -17,6 +16,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split from sklearn.preprocessing import MaxAbsScaler, StandardScaler +from sklearn.utils._sparse import _sparse_random # memory location for caching datasets M = Memory(location=str(Path(__file__).resolve().parent / "cache")) @@ -100,12 +100,12 @@ def _synth_regression_dataset(n_samples=100000, n_features=100, dtype=np.float32 def _synth_regression_sparse_dataset( n_samples=10000, n_features=10000, density=0.01, dtype=np.float32 ): - X = sp.random( - m=n_samples, n=n_features, density=density, format="csr", random_state=0 + X = _sparse_random( + (n_samples, n_features), density=density, format="csr", random_state=0 ) X.data = np.random.RandomState(0).randn(X.getnnz()) X = X.astype(dtype, copy=False) - coefs = sp.random(m=n_features, n=1, density=0.5, random_state=0) + coefs = _sparse_random((n_features, 1), density=0.5, random_state=0) coefs.data = np.random.RandomState(0).randn(coefs.getnnz()) y = X.dot(coefs.toarray()).reshape(-1) y += 0.2 * y.std() * np.random.randn(n_samples) @@ -155,9 +155,8 @@ def _random_dataset( X = np.random.RandomState(0).random_sample((n_samples, n_features)) X = X.astype(dtype, copy=False) else: - X = sp.random( - n_samples, - n_features, + X = _sparse_random( + (n_samples, n_features), density=0.05, format="csr", dtype=dtype, diff --git a/benchmarks/bench_feature_expansions.py b/benchmarks/bench_feature_expansions.py index b9d9efbdea4f1..598be5e265d54 100644 --- a/benchmarks/bench_feature_expansions.py +++ b/benchmarks/bench_feature_expansions.py @@ -2,9 +2,9 @@ import matplotlib.pyplot as plt import numpy as np -import scipy.sparse as sparse from sklearn.preprocessing import PolynomialFeatures +from sklearn.utils._sparse import _sparse_random degree = 2 trials = 3 @@ -21,7 +21,7 @@ for density in densities: for dim_index, dim in enumerate(dimensionalities): print(trial, density, dim) - X_csr = sparse.random(num_rows, dim, density).tocsr() + X_csr = _sparse_random((num_rows, dim), density=density, format="csr") X_dense = X_csr.toarray() # CSR t0 = time() diff --git a/sklearn/_config.py b/sklearn/_config.py index 05549c88a9ddc..a8096c112a317 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -20,6 +20,7 @@ "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } _threadlocal = threading.local() @@ -68,6 +69,7 @@ def set_config( transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None, + sparse_interface=None, ): """Set global scikit-learn configuration. @@ -174,6 +176,16 @@ def set_config( .. versionadded:: 1.3 + sparse_interface : str, default="spmatrix" + + The sparse interface used for every sparse object that scikit-learn produces, + e.g., function returns, estimator attributes, estimator properties, etc. + + - `"sparray"`: Return sparse as SciPy sparse array + - `"spmatrix"`: Return sparse as SciPy sparse matrix + + .. versionadded:: 1.7 + See Also -------- config_context : Context manager for global scikit-learn configuration. 
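The docstring above describes the new knob; end to end it behaves as in the following minimal sketch (assuming a SciPy version that ships the sparse-array classes, and mirroring the `CountVectorizer` round-trip exercised by the new tests at the end of this diff):

    import sklearn
    from scipy.sparse import csr_array, csr_matrix
    from sklearn.feature_extraction.text import CountVectorizer

    corpus = ["the quick brown fox", "jumps over the lazy dog"]

    # Default config ("spmatrix"): sparse outputs keep the legacy interface.
    X = CountVectorizer().fit_transform(corpus)
    assert isinstance(X, csr_matrix)

    # Opting in to "sparray" makes the same call return a sparse array.
    with sklearn.config_context(sparse_interface="sparray"):
        X = CountVectorizer().fit_transform(corpus)
    assert isinstance(X, csr_array)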
@@ -209,6 +221,8 @@ def set_config( local_config["enable_metadata_routing"] = enable_metadata_routing if skip_parameter_validation is not None: local_config["skip_parameter_validation"] = skip_parameter_validation + if sparse_interface is not None: + local_config["sparse_interface"] = sparse_interface @contextmanager @@ -224,6 +238,7 @@ def config_context( transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None, + sparse_interface=None, ): """Context manager for global scikit-learn configuration. @@ -329,6 +344,16 @@ def config_context( .. versionadded:: 1.3 + sparse_interface : str, default="spmatrix" + + The sparse interface used for every sparse object that scikit-learn produces, + e.g., function returns, estimator attributes, estimator properties, etc. + + - `"sparray"`: Return sparse as SciPy sparse array + - `"spmatrix"`: Return sparse as SciPy sparse matrix + + .. versionadded:: 1.7 + Yields ------ None. @@ -368,6 +393,7 @@ def config_context( transform_output=transform_output, enable_metadata_routing=enable_metadata_routing, skip_parameter_validation=skip_parameter_validation, + sparse_interface=sparse_interface, ) try: diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 0ab602d32d133..3e14344119a4f 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from scipy import sparse as sp from sklearn.base import clone from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus @@ -25,6 +24,7 @@ from sklearn.metrics import pairwise_distances, pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( assert_allclose, assert_array_equal, @@ -1030,9 +1030,7 @@ def test_euclidean_distance(dtype, squared, global_random_seed): # Check that the _euclidean_(dense/sparse)_dense helpers produce correct # results rng = np.random.RandomState(global_random_seed) - a_sparse = sp.random( - 1, 100, density=0.5, format="csr", random_state=rng, dtype=dtype - ) + a_sparse = _sparse_random((1, 100), density=0.5, format="csr", rng=rng, dtype=dtype) a_dense = a_sparse.toarray().reshape(-1) b = rng.randn(100).astype(dtype, copy=False) b_squared_norm = (b**2).sum() @@ -1055,8 +1053,8 @@ def test_euclidean_distance(dtype, squared, global_random_seed): def test_inertia(dtype, global_random_seed): # Check that the _inertia_(dense/sparse) helpers produce correct results. 
rng = np.random.RandomState(global_random_seed) - X_sparse = sp.random( - 100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype + X_sparse = _sparse_random( + (100, 10), density=0.5, format="csr", rng=rng, dtype=dtype ) X_dense = X_sparse.toarray() sample_weight = rng.randn(100).astype(dtype, copy=False) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index daa4111c9393d..c8ee4da8a8b16 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -33,6 +33,7 @@ check_recorded_metadata, ) from sklearn.utils._indexing import _safe_indexing +from sklearn.utils._sparse import _sparse_eye from sklearn.utils._testing import ( _convert_container, assert_allclose_dense_sparse, @@ -73,7 +74,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): n_samples = len(X) - return self.csr_container(sparse.eye(n_samples, n_samples)) + return self.csr_container(_sparse_eye(n_samples)) class TransNo2D(BaseEstimator): @@ -469,7 +470,7 @@ def test_column_transformer_output_indices_df(): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_column_transformer_sparse_array(csr_container): - X_sparse = csr_container(sparse.eye(3, 2)) + X_sparse = csr_container(_sparse_eye(3, 2)) # no distinction between 1D and 2D X_res_first = X_sparse[:, [0]] diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index e2d80422e7df7..9f1eb01daf103 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -20,6 +20,12 @@ from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils._sparse import ( + _align_api_if_sparse, + _sparse_diags, + _sparse_eye, + _sparse_random, +) from ..utils.random import sample_without_replacement @@ -1819,13 +1825,12 @@ def make_sparse_spd_matrix( """ random_state = check_random_state(random_state) - chol = -sp.eye(n_dim) - aux = sp.random( - m=n_dim, - n=n_dim, + chol = -_sparse_eye(n_dim) + aux = _sparse_random( + shape=(n_dim, n_dim), density=1 - alpha, - data_rvs=lambda x: random_state.uniform( - low=smallest_coef, high=largest_coef, size=x + data_sampler=lambda size: random_state.uniform( + low=smallest_coef, high=largest_coef, size=size ), random_state=random_state, ) @@ -1841,13 +1846,13 @@ def make_sparse_spd_matrix( if norm_diag: # Form the diagonal vector into a row matrix - d = sp.diags(1.0 / np.sqrt(prec.diagonal())) + d = _sparse_diags(1.0 / np.sqrt(prec.diagonal())) prec = d @ prec @ d if sparse_format is None: return prec.toarray() else: - return prec.asformat(sparse_format) + return _align_api_if_sparse(prec.asformat(sparse_format)) @validate_params( diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 2b97138c4dea3..2891a93527e1c 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -4,7 +4,6 @@ import numpy as np import pytest -import scipy as sp from numpy.testing import assert_array_equal from sklearn import config_context, datasets @@ -19,6 +18,7 @@ yield_namespace_device_dtype_combinations, ) from sklearn.utils._array_api import device as array_device +from sklearn.utils._sparse import _sparse_random from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import 
_array_api_for_tests, assert_allclose from sklearn.utils.estimator_checks import ( @@ -87,17 +87,10 @@ def test_pca_sparse( atol = 1e-12 transform_atol = 1e-10 - random_state = np.random.default_rng(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=density, - ) - ) + rng = np.random.default_rng(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=density)) # Scale the data + vary the column means - scale_vector = random_state.random(X.shape[1]) * scale + scale_vector = rng.random(X.shape[1]) * scale X = X.multiply(scale_vector) pca = PCA( @@ -120,12 +113,7 @@ def test_pca_sparse( # Test transform X2 = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=density, - ) + _sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=density) ) X2d = X2.toarray() @@ -135,23 +123,9 @@ def test_pca_sparse( @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_pca_sparse_fit_transform(global_random_seed, sparse_container): - random_state = np.random.default_rng(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=0.01, - ) - ) - X2 = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=0.01, - ) - ) + rng = np.random.default_rng(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=0.01)) + X2 = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=0.01)) pca_fit = PCA(n_components=10, svd_solver="arpack", random_state=global_random_seed) pca_fit_transform = PCA( @@ -170,14 +144,8 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container): @pytest.mark.parametrize("svd_solver", ["randomized", "full"]) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container): - random_state = np.random.RandomState(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - ) - ) + rng = np.random.RandomState(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng)) pca = PCA(n_components=30, svd_solver=svd_solver) error_msg_pattern = ( 'PCA only support sparse inputs with the "arpack" and "covariance_eigh"' @@ -192,14 +160,8 @@ def test_sparse_pca_auto_arpack_singluar_values_consistency( global_random_seed, sparse_container ): """Check that "auto" and "arpack" solvers are equivalent for sparse inputs.""" - random_state = np.random.RandomState(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - ) - ) + rng = np.random.RandomState(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng)) pca_arpack = PCA(n_components=10, svd_solver="arpack").fit(X) pca_auto = PCA(n_components=10, svd_solver="auto").fit(X) assert_allclose(pca_arpack.singular_values_, pca_auto.singular_values_, rtol=5e-3) diff --git a/sklearn/decomposition/tests/test_truncated_svd.py b/sklearn/decomposition/tests/test_truncated_svd.py index 07b35c873ee3e..8eea4fc1f3e6e 100644 --- a/sklearn/decomposition/tests/test_truncated_svd.py +++ b/sklearn/decomposition/tests/test_truncated_svd.py @@ -2,10 +2,10 @@ import numpy as np import pytest -import scipy.sparse as sp from 
sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.utils import check_random_state
+from sklearn.utils._sparse import _sparse_random
 from sklearn.utils._testing import assert_allclose, assert_array_less
 
 SVD_SOLVERS = ["arpack", "randomized"]
@@ -15,7 +15,7 @@
 def X_sparse():
     # Make an X that looks somewhat like a small tf-idf matrix.
     rng = check_random_state(42)
-    X = sp.random(60, 55, density=0.2, format="csr", random_state=rng)
+    X = _sparse_random((60, 55), density=0.2, format="csr", rng=rng)
     X.data[:] = 1 + np.log(X.data)
     return X
diff --git a/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py b/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py
index 34c816628ee73..ade61f3f10d43 100644
--- a/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py
+++ b/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py
@@ -283,19 +283,19 @@ def laplacian(
     Our final example illustrates the latter
     for a noisy directed linear graph.
 
-    >>> from scipy.sparse import diags, random
+    >>> from scipy.sparse import diags_array, random_array
    >>> from scipy.sparse.linalg import lobpcg
 
     Create a directed linear graph with ``N=35`` vertices
     using a sparse adjacency matrix ``G``:
 
     >>> N = 35
-    >>> G = diags(np.ones(N-1), 1, format="csr")
+    >>> G = diags_array(np.ones(N-1), offsets=1, format="csr")
 
     Fix a random seed ``rng`` and add a random sparse noise to the graph ``G``:
 
     >>> rng = np.random.default_rng()
-    >>> G += 1e-2 * random(N, N, density=0.1, random_state=rng)
+    >>> G += 1e-2 * random_array((N, N), density=0.1, random_state=rng)
 
     Set initial approximations for eigenvectors:
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index ab3f84668fd2d..e361b8331edf0 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -27,6 +27,7 @@
 from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.svm import LinearSVC
+from sklearn.utils import _align_api_if_sparse
 from sklearn.utils._testing import (
     assert_allclose_dense_sparse,
     assert_almost_equal,
@@ -1611,7 +1612,14 @@ def test_tfidf_transformer_copy(csr_container):
     assert X_transform is not X_csr
 
     X_transform = transformer.transform(X_csr, copy=False)
-    assert X_transform is X_csr
+    # allow for config["sparse_interface"] to change output type
+    # there should be no data copied, but the `id` will change.
+    if _align_api_if_sparse(X_csr) is X_csr:
+        assert X_transform is X_csr
+    else:
+        assert X_transform is not X_csr
+        assert X_transform.indptr is X_csr.indptr
+
     with pytest.raises(AssertionError):
         assert_allclose_dense_sparse(X_csr, X_csr_original)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index eb3226b01c79e..84a89db10cc17 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -16,7 +16,7 @@
 import numpy as np
 import scipy.sparse as sp
 
-from sklearn.utils import metadata_routing
+from sklearn.utils import _align_api_if_sparse, metadata_routing
 
 from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
 from ..exceptions import NotFittedError
@@ -880,7 +880,7 @@ def transform(self, X):
             X.data.fill(1)
         if self.norm is not None:
             X = normalize(X, norm=self.norm, copy=False)
-        return X
+        return _align_api_if_sparse(X)
 
     def fit_transform(self, X, y=None):
         """Transform a sequence of documents to a document-term matrix.
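The `_align_api_if_sparse` helper used above (defined in `sklearn/utils/_sparse.py` further down in this diff) only swaps the container class around the existing CSR buffers, which is exactly what the adjusted `test_tfidf_transformer_copy` asserts via `indptr` identity. A minimal sketch of that contract, assuming sparse-array support in the installed SciPy:

    import numpy as np
    import sklearn
    from scipy.sparse import csr_matrix
    from sklearn.utils._sparse import _align_api_if_sparse

    X = csr_matrix(np.eye(3))
    with sklearn.config_context(sparse_interface="sparray"):
        X_arr = _align_api_if_sparse(X)

    # Same format, same underlying index buffer: a new wrapper, not a copy.
    assert X_arr.format == "csr"
    assert X_arr.indptr is X.indptr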
@@ -1393,7 +1393,7 @@ def fit_transform(self, raw_documents, y=None): X = self._sort_features(X, vocabulary) self.vocabulary_ = vocabulary - return X + return _align_api_if_sparse(X) def transform(self, raw_documents): """Transform documents to document-term matrix. @@ -1421,7 +1421,7 @@ def transform(self, raw_documents): _, X = self._count_vocab(raw_documents, fixed_vocab=True) if self.binary: X.data.fill(1) - return X + return _align_api_if_sparse(X) def inverse_transform(self, X): """Return terms per document with nonzero entries in X. @@ -1720,7 +1720,7 @@ def transform(self, X, copy=True): if self.norm is not None: X = normalize(X, norm=self.norm, copy=False) - return X + return _align_api_if_sparse(X) def __sklearn_tags__(self): tags = super().__sklearn_tags__() diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 62096133ada2f..3724a1397e7b5 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -12,7 +12,7 @@ from joblib import effective_n_jobs from scipy import sparse -from sklearn.utils import metadata_routing +from sklearn.utils import _align_api_if_sparse, metadata_routing from ..base import MultiOutputMixin, RegressorMixin, _fit_context from ..model_selection import check_cv @@ -1129,7 +1129,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): @property def sparse_coef_(self): """Sparse representation of the fitted `coef_`.""" - return sparse.csr_matrix(self.coef_) + return _align_api_if_sparse(sparse.csr_array(np.atleast_2d(self.coef_))) def _decision_function(self, X): """Decision function of the linear model. diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 446d232958e8d..7686a2a120201 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -12,6 +12,7 @@ from ..exceptions import ConvergenceWarning from ..utils import _safe_indexing from ..utils._param_validation import Interval, StrOptions +from ..utils._sparse import _sparse_eye from ..utils.fixes import parse_version, sp_version from ..utils.validation import _check_sample_weight, validate_data from ._base import LinearModel @@ -240,7 +241,7 @@ def fit(self, X, y, sample_weight=None): # even for optimization problems parametrized using dense numpy arrays. # Therefore, we work with CSC matrices as early as possible to limit # unnecessary repeated memory copies. - eye = sparse.eye(n_indices, dtype=X.dtype, format="csc") + eye = _sparse_eye(n_indices, dtype=X.dtype, format="csc") if self.fit_intercept: ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype)) A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc") diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index cf8dfdf4e4712..bd604760fecfe 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -15,6 +15,7 @@ make_dataset, ) from sklearn.preprocessing import add_dummy_feature +from sklearn.utils._sparse import _sparse_eye from sklearn.utils._testing import ( assert_allclose, assert_array_almost_equal, @@ -98,7 +99,7 @@ def test_linear_regression_sample_weights( def test_raises_value_error_if_positive_and_sparse(): error_msg = "Sparse data was passed for X, but dense data is required." 
# X must not be sparse if positive == True - X = sparse.eye(10) + X = _sparse_eye(10) y = np.ones(10) reg = LinearRegression(positive=True) @@ -148,7 +149,7 @@ def test_linear_regression_sparse(global_random_seed): # Test that linear regression also works with sparse data rng = np.random.RandomState(global_random_seed) n = 100 - X = sparse.eye(n, n) + X = _sparse_eye(n, n) beta = rng.rand(n) y = X @ beta diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 26d138ae3649b..50181a9799f55 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -21,6 +21,7 @@ from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, scale from sklearn.svm import OneClassSVM from sklearn.utils import get_tags +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( assert_allclose, assert_almost_equal, @@ -2037,7 +2038,7 @@ def test_SGDClassifier_fit_for_all_backends(backend): # Create a classification problem with 50000 features and 20 classes. Using # loky or multiprocessing this make the clf.coef_ exceed the threshold # above which memmaping is used in joblib and loky (1MB as of 2018/11/1). - X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state) + X = _sparse_random((500, 2000), density=0.02, format="csr", rng=random_state) y = random_state.choice(20, 500) # Begin by fitting a SGD classifier sequentially diff --git a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py index 1aab9babeeb40..21cd0a8bc1508 100644 --- a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py @@ -6,6 +6,7 @@ from sklearn.datasets import make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( assert_almost_equal, assert_array_almost_equal, @@ -375,7 +376,7 @@ def test_sparse_read_only_buffer(copy_X): rng = np.random.RandomState(0) clf = ElasticNet(alpha=0.1, copy_X=copy_X, random_state=rng) - X = sp.random(100, 20, format="csc", random_state=rng) + X = _sparse_random((100, 20), format="csc", rng=rng) # Make X.data read-only X.data = create_memmap_backed_data(X.data) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index e6967446274ad..88fb239935ee7 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -7,7 +7,7 @@ import numpy as np from scipy.linalg import eigh, qr, solve, svd -from scipy.sparse import csr_matrix, eye, lil_matrix +from scipy.sparse import csr_array, lil_array from scipy.sparse.linalg import eigsh from ..base import ( @@ -21,6 +21,7 @@ from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils._sparse import _align_api_if_sparse, _sparse_eye from ..utils.extmath import stable_cumsum from ..utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data @@ -118,7 +119,8 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): ind = knn.kneighbors(X, return_distance=False)[:, 1:] data = barycenter_weights(X, X, ind, reg=reg) indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) - return csr_matrix((data.ravel(), 
ind.ravel(), indptr), shape=(n_samples, n_samples)) + csr = csr_array((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples)) + return _align_api_if_sparse(csr) def null_space( @@ -229,7 +231,7 @@ def _locally_linear_embedding( ) M_sparse = eigen_solver != "dense" - M_container_constructor = lil_matrix if M_sparse else np.zeros + M_container_constructor = lil_array if M_sparse else np.zeros if method == "standard": W = barycenter_kneighbors_graph( @@ -239,8 +241,10 @@ def _locally_linear_embedding( # we'll compute M = (I-W)'(I-W) # depending on the solver, we'll do this differently if M_sparse: - M = eye(*W.shape, format=W.format) - W - M = M.T @ M + # change when SciPy 1.12+ is minimal supported version + # M = eye_array(W.shape, format=W.format, dtype=W.dtype) - W + M = _sparse_eye(*W.shape, format=W.format, dtype=W.dtype) - W + M = M.T @ M # M = (I - W)' (I - W) = W' W - W' - W + I else: M = (W.T @ W - W.T - W).toarray() M.flat[:: M.shape[0] + 1] += 1 # M = W' W - W' - W + I @@ -395,7 +399,7 @@ def _locally_linear_embedding( nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T) Wi_sum1 = Wi.sum(1) - M[i, neighbors[i]] -= Wi_sum1 + M[[i], neighbors[i]] -= Wi_sum1 M[neighbors[i], [i]] -= Wi_sum1 M[i, i] += s_i @@ -432,7 +436,7 @@ def _locally_linear_embedding( M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors) if M_sparse: - M = M.tocsr() + M = _align_api_if_sparse(M.tocsr()) return null_space( M, diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 1a3b95e023897..f73dfebef389d 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -22,6 +22,7 @@ ) from ..utils._arpack import _init_arpack_v0 from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils._sparse import _sparse_eye from ..utils.extmath import _deterministic_vector_sign_flip from ..utils.fixes import laplacian as csgraph_laplacian from ..utils.fixes import parse_version, sp_version @@ -400,7 +401,7 @@ def _spectral_embedding( # Shift the Laplacian so its diagononal is not all ones. The shift # does change the eigenpairs however, so we'll feed the shifted # matrix to the solver and afterward set it back to the original. 
- diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) + diag_shift = 1e-5 * _sparse_eye(laplacian.shape[0]) laplacian += diag_shift if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array): # `pyamg` does not work with `csr_array` and we need to convert it to a diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 4c4115734a404..0743571ed3a1d 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -17,6 +17,7 @@ from sklearn.metrics import normalized_mutual_info_score, pairwise_distances from sklearn.metrics.pairwise import rbf_kernel from sklearn.neighbors import NearestNeighbors +from sklearn.utils._sparse import _sparse_diags, _sparse_random from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal from sklearn.utils.extmath import _deterministic_vector_sign_flip from sklearn.utils.fixes import ( @@ -311,9 +312,9 @@ def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36): def test_spectral_embedding_amg_solver_failure(dtype, seed=36): # Non-regression test for amg solver failure (issue #13393 on github) num_nodes = 100 - X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed) + X = _sparse_random((num_nodes, num_nodes), density=0.1, random_state=seed) X = X.astype(dtype) - upper = sparse.triu(X) - sparse.diags(X.diagonal()) + upper = sparse.triu(X) - _sparse_diags(X.diagonal()) sym_matrix = upper + upper.T embedding = spectral_embedding( sym_matrix, n_components=10, eigen_solver="amg", random_state=0 diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a618d426a7dcb..59b5186c069a1 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -43,6 +43,7 @@ _get_namespace_device_dtype_ids, yield_namespace_device_dtype_combinations, ) +from sklearn.utils._sparse import _sparse_random from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import ( _array_api_for_tests, @@ -2497,7 +2498,7 @@ def test_power_transformer_box_cox_raise_all_nans_col(): @pytest.mark.parametrize( "X_2", - [sparse.random(10, 1, density=0.8, random_state=0)] + [_sparse_random((10, 1), density=0.8, rng=0)] + [ csr_container(np.full((10, 1), fill_value=np.nan)) for csr_container in CSR_CONTAINERS @@ -2506,7 +2507,7 @@ def test_power_transformer_box_cox_raise_all_nans_col(): def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/16448 - X_1 = sparse.random(5, 1, density=0.8) + X_1 = _sparse_random((5, 1), density=0.8) scaler = StandardScaler(with_mean=False) scaler.fit(X_1).partial_fit(X_2) assert np.isfinite(scaler.var_[0]) diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index bf35eee623c18..b7beda31c2fd5 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -21,6 +21,7 @@ def test_config_context(): "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } # Not using as a context manager affects nothing @@ -39,6 +40,7 @@ def test_config_context(): "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } assert get_config()["assume_finite"] is False @@ -74,6 +76,7 @@ def 
test_config_context():
         "transform_output": "default",
         "enable_metadata_routing": False,
         "skip_parameter_validation": False,
+        "sparse_interface": "spmatrix",
     }
 
     # No positional arguments
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 941126c6b083f..440f582a8668b 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -20,6 +20,7 @@
     shuffle,
 )
 from ._mask import safe_mask
+from ._sparse import _align_api_if_sparse
 from ._tags import (
     ClassifierTags,
     InputTags,
@@ -56,6 +57,7 @@
     "Tags",
     "TargetTags",
     "TransformerTags",
+    "_align_api_if_sparse",
     "all_estimators",
     "as_float_array",
     "assert_all_finite",
diff --git a/sklearn/utils/_sparse.py b/sklearn/utils/_sparse.py
new file mode 100644
index 0000000000000..55a4f5edae7f4
--- /dev/null
+++ b/sklearn/utils/_sparse.py
@@ -0,0 +1,77 @@
+"""Control the sparse interface based on the global config."""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import scipy as sp
+
+from .._config import get_config
+from .fixes import parse_version, sp_base_version
+
+
+def _align_api_if_sparse(X):
+    """Convert sparse input to the sparse interface set in the config.
+
+    If `X` is sparse, convert it to the interface named by
+    ``get_config()["sparse_interface"]``; dense input is returned
+    unchanged.
+    """
+    if not sp.sparse.issparse(X):
+        return X
+
+    config_sparse_interface = get_config()["sparse_interface"]
+
+    # there are only two sparse interfaces: sparray and spmatrix
+    if config_sparse_interface == "sparray":
+        if sp.sparse.isspmatrix(X):
+            # construct the sparray class matching X's format
+            return getattr(sp.sparse, X.format + "_array")(X)
+        return X
+    else:  # config is spmatrix
+        if sp.sparse.isspmatrix(X):
+            return X
+        # construct the spmatrix class matching X's format
+        return getattr(sp.sparse, X.format + "_matrix")(X)
+
+
+# Fixes for transitioning SciPy sparse construction function names
+
+# TODO: Replace when SciPy 1.12 is the minimum supported version
+if sp_base_version >= parse_version("1.12.0"):
+    _sparse_eye = sp.sparse.eye_array
+    _sparse_block = sp.sparse.block_array
+    _sparse_diags = sp.sparse.diags_array
+    _sparse_random = sp.sparse.random_array
+else:
+
+    def _sparse_eye(m, n=None, *, k=0, dtype=float, format=None):
+        return sp.sparse.eye(m, n, k=k, dtype=dtype, format=format)
+
+    def _sparse_block(blocks, *, format=None, dtype=None):
+        return sp.sparse.bmat(blocks, format=format, dtype=dtype)
+
+    def _sparse_diags(diagonals, /, *, offsets=0, shape=None, format=None, dtype=None):
+        return sp.sparse.diags(
+            diagonals, offsets=offsets, shape=shape, format=format, dtype=dtype
+        )
+
+    def _sparse_random(
+        shape,
+        *,
+        density=0.01,
+        format="coo",
+        dtype=None,
+        random_state=None,
+        rng=None,
+        data_sampler=None,
+    ):
+        # `rng` may legitimately be 0, so test against None instead of
+        # relying on truthiness when falling back to `random_state`.
+        return sp.sparse.random(
+            *shape,
+            density=density,
+            format=format,
+            dtype=dtype,
+            random_state=rng if rng is not None else random_state,
+            data_rvs=data_sampler,
+        )
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index b98a7747c28aa..d4a99dfa72317 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -139,7 +139,7 @@ def density(w):
     --------
     >>> from scipy import sparse
     >>> from sklearn.utils.extmath import density
-    >>> X = sparse.random(10, 10, density=0.25, random_state=0)
+    >>> X = sparse.random_array((10, 10), density=0.25, rng=0)
     >>> density(X)
     0.25
     """
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 907de11702af2..0ef1dd2abd1ae 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ 
b/sklearn/utils/tests/test_extmath.py @@ -19,6 +19,7 @@ get_namespace, yield_namespace_device_dtype_combinations, ) +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( _array_api_for_tests, assert_allclose, @@ -1041,8 +1042,8 @@ def test_safe_sparse_dot_2d_1d(container): def test_safe_sparse_dot_dense_output(dense_output): rng = np.random.RandomState(0) - A = sparse.random(30, 10, density=0.1, random_state=rng) - B = sparse.random(10, 20, density=0.1, random_state=rng) + A = _sparse_random((30, 10), density=0.1, rng=rng) + B = _sparse_random((10, 20), density=0.1, rng=rng) expected = A.dot(B) actual = safe_sparse_dot(A, B, dense_output=dense_output) diff --git a/sklearn/utils/tests/test_sparse.py b/sklearn/utils/tests/test_sparse.py new file mode 100644 index 0000000000000..32d6a1666d493 --- /dev/null +++ b/sklearn/utils/tests/test_sparse.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest +from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix + +import sklearn + + +@pytest.mark.parametrize( + ["sparse_interface", "x", "result_type"], + [ + ("sparray", csr_array([[1, 2, 3]]), csr_array), + ("sparray", csr_matrix([[1, 2, 3]]), csr_array), + ("sparray", csc_array([[1, 2, 3]]), csc_array), + ("sparray", csc_matrix([[1, 2, 3]]), csc_array), + ("spmatrix", csr_array([[1, 2, 3]]), csr_matrix), + ("spmatrix", csr_matrix([[1, 2, 3]]), csr_matrix), + ("spmatrix", csc_array([[1, 2, 3]]), csc_matrix), + ("spmatrix", csc_matrix([[1, 2, 3]]), csc_matrix), + ], +) +def test_align_api_if_sparse(sparse_interface, x, result_type): + with sklearn.config_context(sparse_interface=sparse_interface): + result = sklearn.utils._align_api_if_sparse(x) + assert isinstance(result, result_type) + + +@pytest.mark.parametrize( + ["sparse_interface", "x", "result_type"], + [ + ("sparray", np.array([[1, 2, 3]]), np.ndarray), + ("spmatrix", np.array([[1, 2, 3]]), np.ndarray), + ], +) +def test_ndarray_align_api_if_sparse(sparse_interface, x, result_type): + with sklearn.config_context(sparse_interface=sparse_interface): + result = sklearn.utils._align_api_if_sparse(x) + assert isinstance(result, result_type) + + +@pytest.mark.parametrize( + ["sparse_interface", "result_type"], + [("sparray", csr_array), ("spmatrix", csr_matrix)], +) +def test_transform_returns_sparse(sparse_interface, result_type): + corpus = [ + "This is the first document.", + "This document is the second document.", + "And this is the third one.", + "Is this the first document?", + ] + with sklearn.config_context(sparse_interface=sparse_interface): + vectorizer = sklearn.feature_extraction.text.CountVectorizer() + X = vectorizer.fit_transform(corpus) + assert isinstance(X, result_type) + + +@pytest.mark.parametrize( + ["sparse_interface", "result_type"], + [("sparray", csr_array), ("spmatrix", csr_matrix)], +) +def test_function_returns_sparse(sparse_interface, result_type): + with sklearn.config_context(sparse_interface=sparse_interface): + X, y = sklearn.datasets.make_regression(n_features=2, random_state=0) + X = sklearn.manifold._locally_linear.barycenter_kneighbors_graph(X, 1) + assert isinstance(X, result_type) + + +@pytest.mark.parametrize( + ["sparse_interface", "result_type"], + [("sparray", csr_array), ("spmatrix", csr_matrix)], +) +def test_estimator_property_sparse(sparse_interface, result_type): + with sklearn.config_context(sparse_interface=sparse_interface): + X, y = sklearn.datasets.make_regression(n_features=2, random_state=0) + regr = sklearn.linear_model.ElasticNet(random_state=0) 
+        regr.fit(X, y)
+        # check the sparse_coef_ property
+        assert isinstance(regr.sparse_coef_, result_type)
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index f80b75c02d515..ed07d72788f5b 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -6,6 +6,7 @@
 from scipy import linalg
 
 from sklearn.datasets import make_classification
+from sklearn.utils._sparse import _sparse_random
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
 from sklearn.utils.sparsefuncs import (
@@ -436,15 +437,15 @@ def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
     "X1, X2",
     [
         (
-            sp.random(5, 2, density=0.8, format="csr", random_state=0),
-            sp.random(13, 2, density=0.8, format="csr", random_state=0),
+            _sparse_random((5, 2), density=0.8, format="csr", rng=0),
+            _sparse_random((13, 2), density=0.8, format="csr", rng=0),
         ),
         (
-            sp.random(5, 2, density=0.8, format="csr", random_state=0),
+            _sparse_random((5, 2), density=0.8, format="csr", rng=0),
             sp.hstack(
                 [
                     np.full((13, 1), fill_value=np.nan),
-                    sp.random(13, 1, density=0.8, random_state=42),
+                    _sparse_random((13, 1), density=0.8, rng=42),
                 ],
                 format="csr",
             ),
@@ -477,8 +478,8 @@ def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2, csr_container
 def test_incr_mean_variance_no_new_n():
     # check the behaviour when we update the variance with an empty matrix
     axis = 0
-    X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
-    X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
+    X1 = _sparse_random((5, 1), density=0.8, format="csr", rng=0)
+    X2 = _sparse_random((0, 1), density=0.8, format="csr", rng=0)
     last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
     last_n = np.zeros(X1.shape[1], dtype=np.int64)
     last_mean, last_var, last_n = incr_mean_variance_axis(
@@ -496,7 +497,7 @@ def test_incr_mean_variance_no_new_n():
 def test_incr_mean_variance_n_float():
     # check the behaviour when last_n is just a number
     axis = 0
-    X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
+    X = _sparse_random((5, 2), density=0.8, format="csr", rng=0)
     last_mean, last_var = np.zeros(X.shape[1]), np.zeros(X.shape[1])
     last_n = 0
     _, _, new_n = incr_mean_variance_axis(
@@ -604,7 +605,7 @@ def test_densify_rows(csr_container):
 def test_inplace_column_scale():
     rng = np.random.RandomState(0)
-    X = sp.random(100, 200, density=0.05)
+    X = _sparse_random((100, 200), density=0.05)
     Xr = X.tocsr()
     Xc = X.tocsc()
     XA = X.toarray()
@@ -636,7 +637,7 @@ def test_inplace_column_scale():
 def test_inplace_row_scale():
     rng = np.random.RandomState(0)
-    X = sp.random(100, 200, density=0.05)
+    X = _sparse_random((100, 200), density=0.05)
     Xr = X.tocsr()
     Xc = X.tocsc()
     XA = X.toarray()
@@ -937,7 +938,7 @@ def test_inplace_normalize(csr_container, inplace_csr_row_normalize):
 def test_csr_row_norms(dtype):
     # checks that csr_row_norms returns the same output as
     # scipy.sparse.linalg.norm, and that the dype is the same as X.dtype.
- X = sp.random(100, 10, format="csr", dtype=dtype, random_state=42) + X = _sparse_random((100, 10), format="csr", dtype=dtype, rng=42) scipy_norms = sp.linalg.norm(X, axis=1) ** 2 norms = csr_row_norms(X) @@ -952,10 +953,10 @@ def centered_matrices(request): """Returns equivalent tuple[sp.linalg.LinearOperator, np.ndarray].""" sparse_container = request.param - random_state = np.random.default_rng(42) + rng = np.random.default_rng(42) X_sparse = sparse_container( - sp.random(500, 100, density=0.1, format="csr", random_state=random_state) + _sparse_random((500, 100), density=0.1, format="csr", rng=rng) ) X_dense = X_sparse.toarray() mu = np.asarray(X_sparse.mean(axis=0)).ravel() diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index ae9c380941c8c..61e33fa81feea 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -8,6 +8,7 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._sparse import _sparse_diags, _sparse_random from sklearn.utils._testing import ( TempMemmap, _convert_container, @@ -57,7 +58,7 @@ def test_assert_allclose_dense_sparse(csr_container): with pytest.raises(ValueError, match="Can only compare two sparse"): assert_allclose_dense_sparse(x, y) - A = sparse.diags(np.ones(5), offsets=0).tocsr() + A = _sparse_diags(np.ones(5), offsets=0, format="csr") B = csr_container(np.ones((1, 5))) with pytest.raises(AssertionError, match="Arrays are not equal"): assert_allclose_dense_sparse(B, A) @@ -1102,7 +1103,7 @@ def test_convert_container_sparse_to_sparse(constructor_name): """Non-regression test to check that we can still convert a sparse container from a given format to another format. """ - X_sparse = sparse.random(10, 10, density=0.1, format="csr") + X_sparse = _sparse_random((10, 10), density=0.1, format="csr") _convert_container(X_sparse, constructor_name) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 1aaf7c346b1d3..44dcab847b322 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,6 +42,7 @@ MockDataFrame, _MockEstimatorOnOffPrediction, ) +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( SkipTest, TempMemmap, @@ -153,7 +154,7 @@ def test_as_float_array(): @pytest.mark.parametrize( - "X", [np.random.random((10, 2)), sp.random(10, 2, format="csr")] + "X", [np.random.random((10, 2)), _sparse_random((10, 2), format="csr")] ) def test_as_float_array_nan(X): X[5, 0] = np.nan @@ -700,7 +701,7 @@ def test_check_array_accept_sparse_no_exception(): @pytest.fixture(params=["csr", "csc", "coo", "bsr"]) def X_64bit(request): - X = sp.random(20, 10, format=request.param) + X = _sparse_random((20, 10), format=request.param) if request.param == "coo": if hasattr(X, "coords"):