diff --git a/asv_benchmarks/benchmarks/datasets.py b/asv_benchmarks/benchmarks/datasets.py index bbf5029062448..bbc5a5d78dd41 100644 --- a/asv_benchmarks/benchmarks/datasets.py +++ b/asv_benchmarks/benchmarks/datasets.py @@ -1,7 +1,6 @@ from pathlib import Path import numpy as np -import scipy.sparse as sp from joblib import Memory from sklearn.datasets import ( @@ -17,6 +16,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split from sklearn.preprocessing import MaxAbsScaler, StandardScaler +from sklearn.utils._sparse import _sparse_random # memory location for caching datasets M = Memory(location=str(Path(__file__).resolve().parent / "cache")) @@ -100,12 +100,12 @@ def _synth_regression_dataset(n_samples=100000, n_features=100, dtype=np.float32 def _synth_regression_sparse_dataset( n_samples=10000, n_features=10000, density=0.01, dtype=np.float32 ): - X = sp.random( - m=n_samples, n=n_features, density=density, format="csr", random_state=0 + X = _sparse_random( + (n_samples, n_features), density=density, format="csr", random_state=0 ) X.data = np.random.RandomState(0).randn(X.getnnz()) X = X.astype(dtype, copy=False) - coefs = sp.random(m=n_features, n=1, density=0.5, random_state=0) + coefs = _sparse_random((n_features, 1), density=0.5, random_state=0) coefs.data = np.random.RandomState(0).randn(coefs.getnnz()) y = X.dot(coefs.toarray()).reshape(-1) y += 0.2 * y.std() * np.random.randn(n_samples) @@ -155,9 +155,8 @@ def _random_dataset( X = np.random.RandomState(0).random_sample((n_samples, n_features)) X = X.astype(dtype, copy=False) else: - X = sp.random( - n_samples, - n_features, + X = _sparse_random( + (n_samples, n_features), density=0.05, format="csr", dtype=dtype, diff --git a/benchmarks/bench_feature_expansions.py b/benchmarks/bench_feature_expansions.py index b9d9efbdea4f1..598be5e265d54 100644 --- a/benchmarks/bench_feature_expansions.py +++ b/benchmarks/bench_feature_expansions.py @@ -2,9 +2,9 @@ import matplotlib.pyplot as plt import numpy as np -import scipy.sparse as sparse from sklearn.preprocessing import PolynomialFeatures +from sklearn.utils._sparse import _sparse_random degree = 2 trials = 3 @@ -21,7 +21,7 @@ for density in densities: for dim_index, dim in enumerate(dimensionalities): print(trial, density, dim) - X_csr = sparse.random(num_rows, dim, density).tocsr() + X_csr = _sparse_random((num_rows, dim), density=density, format="csr") X_dense = X_csr.toarray() # CSR t0 = time() diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index e955be64cdee3..7e5f4cdf01466 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -188,7 +188,7 @@ def get_data(dataset_name): data = np.repeat(data, 10) row = np.random.uniform(0, small_size, sparsity) col = np.random.uniform(0, small_size, sparsity) - X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size)) + X = sp.sparse.csr_array((data, (row, col)), shape=(size, small_size)) del data del row del col diff --git a/benchmarks/bench_random_projections.py b/benchmarks/bench_random_projections.py index 6551de690994b..8ec4b264be76c 100644 --- a/benchmarks/bench_random_projections.py +++ b/benchmarks/bench_random_projections.py @@ -70,7 +70,7 @@ def bench_scikit_transformer(X, transformer): # Gaussian distributed values def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=None): rng = np.random.RandomState(random_state) - data_coo = sp.coo_matrix( + 
data_coo = sp.coo_array( ( rng.randn(n_nonzeros), ( diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 59367b647dd58..9a9e05d830e26 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -56,11 +56,11 @@ that contain the missing values:: The :class:`SimpleImputer` class also supports sparse matrices:: >>> import scipy.sparse as sp - >>> X = sp.csc_matrix([[1, 2], [0, -1], [8, 4]]) + >>> X = sp.csc_array([[1, 2], [0, -1], [8, 4]]) >>> imp = SimpleImputer(missing_values=-1, strategy='mean') >>> imp.fit(X) SimpleImputer(missing_values=-1) - >>> X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]]) + >>> X_test = sp.csc_array([[-1, 2], [6, -1], [7, 6]]) >>> print(imp.transform(X_test).toarray()) [[3. 2.] [6. 3.] diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index ef7d6ab3000e1..f6608c2c32013 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -170,9 +170,9 @@ Valid :term:`multiclass` representations for [1 0 0] [0 1 0]] >>> from scipy import sparse - >>> y_sparse = sparse.csr_matrix(y_dense) + >>> y_sparse = sparse.csr_array(y_dense) >>> print(y_sparse) - Coords Values (0, 0) 1 @@ -380,9 +380,9 @@ refer to :ref:`preprocessing_targets`. An example of the same ``y`` in sparse matrix form: - >>> y_sparse = sparse.csr_matrix(y) + >>> y_sparse = sparse.csr_array(y) >>> print(y_sparse) - Coords Values (0, 0) 1 diff --git a/examples/applications/plot_tomography_l1_reconstruction.py b/examples/applications/plot_tomography_l1_reconstruction.py index 02d4594b90518..7be4947ea8a18 100644 --- a/examples/applications/plot_tomography_l1_reconstruction.py +++ b/examples/applications/plot_tomography_l1_reconstruction.py @@ -89,7 +89,9 @@ def build_projection_operator(l_x, n_dir): weights += list(w[mask]) camera_inds += list(inds[mask] + i * l_x) data_inds += list(data_unravel_indices[mask]) - proj_operator = sparse.coo_matrix((weights, (camera_inds, data_inds))) + camera_inds = np.array(camera_inds, dtype=np.int32) # lasso needs int32 inds + data_inds = np.array(data_inds, dtype=np.int32) + proj_operator = sparse.coo_array((weights, (camera_inds, data_inds))) return proj_operator diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py index 2ccd028b9a00d..b59cf8eb6c058 100644 --- a/examples/applications/wikipedia_principal_eigenvector.py +++ b/examples/applications/wikipedia_principal_eigenvector.py @@ -146,7 +146,7 @@ def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None): break print("Computing the adjacency matrix") - X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32) + X = sparse.lil_array((len(index_map), len(index_map)), dtype=np.float32) for i, j in links: X[i, j] = 1.0 del links diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py index 920994da1ffb5..e0763448bd59e 100644 --- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py +++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py @@ -32,7 +32,7 @@ X, y = make_regression(n_samples=200, n_features=5000, random_state=0) # create a copy of X in sparse format -X_sp = sparse.coo_matrix(X) +X_sp = sparse.coo_array(X) alpha = 1 sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000) @@ -64,7 +64,7 @@ # make Xs sparse by replacing the values lower than 2.5 with 0s Xs[Xs < 2.5] = 0.0 # create a copy of Xs in sparse format -Xs_sp = sparse.coo_matrix(Xs) +Xs_sp = 
sparse.coo_array(Xs) Xs_sp = Xs_sp.tocsc() # compute the proportion of non-zero coefficient in the data matrix diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py index a2da69f62fb10..07f7f1678b469 100644 --- a/examples/neighbors/approximate_nearest_neighbors.py +++ b/examples/neighbors/approximate_nearest_neighbors.py @@ -39,7 +39,7 @@ # `nmslib`, as well as a loading function. import joblib import numpy as np -from scipy.sparse import csr_matrix +from scipy.sparse import csr_array from sklearn.base import BaseEstimator, TransformerMixin from sklearn.datasets import fetch_openml @@ -93,7 +93,7 @@ def transform(self, X): indices, distances = np.vstack(indices), np.vstack(distances) indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) - kneighbors_graph = csr_matrix( + kneighbors_graph = csr_array( (distances.ravel(), indices.ravel(), indptr), shape=(n_samples_transform, self.n_samples_fit_), ) diff --git a/sklearn/_config.py b/sklearn/_config.py index 217386c81c80e..636e06505f81d 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -20,6 +20,7 @@ "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } _threadlocal = threading.local() @@ -71,6 +72,7 @@ def set_config( transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None, + sparse_interface=None, ): """Set global scikit-learn configuration. @@ -193,6 +195,16 @@ def set_config( .. versionadded:: 1.3 + sparse_interface : str, default="spmatrix" + + The sparse interface used for every sparse object that scikit-learn produces, + e.g., function returns, estimator attributes, estimator properties, etc. + + - `"sparray"`: Return sparse as SciPy sparse array + - `"spmatrix"`: Return sparse as SciPy sparse matrix + + .. versionadded:: 1.8 + See Also -------- config_context : Context manager for global scikit-learn configuration. @@ -228,6 +240,8 @@ def set_config( local_config["enable_metadata_routing"] = enable_metadata_routing if skip_parameter_validation is not None: local_config["skip_parameter_validation"] = skip_parameter_validation + if sparse_interface is not None: + local_config["sparse_interface"] = sparse_interface @contextmanager @@ -243,6 +257,7 @@ def config_context( transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None, + sparse_interface=None, ): """Context manager to temporarily change the global scikit-learn configuration. @@ -360,6 +375,16 @@ def config_context( .. versionadded:: 1.3 + sparse_interface : str, default="spmatrix" + + The sparse interface used for every sparse object that scikit-learn produces, + e.g., function returns, estimator attributes, estimator properties, etc. + + - `"sparray"`: Return sparse as SciPy sparse array + - `"spmatrix"`: Return sparse as SciPy sparse matrix + + .. versionadded:: 1.8 + Yields ------ None. 
@@ -399,6 +424,7 @@ def config_context( transform_output=transform_output, enable_metadata_routing=enable_metadata_routing, skip_parameter_validation=skip_parameter_validation, + sparse_interface=sparse_interface, ) try: diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 8af512d22016f..045d6d532a96a 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -38,6 +38,7 @@ StrOptions, validate_params, ) +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils.graph import _fix_connected_components from sklearn.utils.validation import check_memory, validate_data @@ -92,7 +93,7 @@ def _fix_connectivity(X, connectivity, affinity): # Convert connectivity matrix to LIL if not sparse.issparse(connectivity): - connectivity = sparse.lil_matrix(connectivity) + connectivity = sparse.lil_array(connectivity) # `connectivity` is a sparse matrix at this point if connectivity.format != "lil": @@ -118,7 +119,7 @@ def _fix_connectivity(X, connectivity, affinity): mode="connectivity", ) - return connectivity, n_connected_components + return _align_api_if_sparse(connectivity), n_connected_components def _single_linkage_tree( diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 1fabb1ec07cc1..6862a6cd16cc2 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -8,7 +8,7 @@ import numpy as np from scipy.linalg import norm -from scipy.sparse import dia_matrix, issparse +from scipy.sparse import dia_array, issparse from scipy.sparse.linalg import eigsh, svds from sklearn.base import BaseEstimator, BiclusterMixin, _fit_context @@ -34,8 +34,8 @@ def _scale_normalize(X): col_diag = np.where(np.isnan(col_diag), 0, col_diag) if issparse(X): n_rows, n_cols = X.shape - r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows)) - c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols)) + r = dia_array((row_diag, [0]), shape=(n_rows, n_rows)) + c = dia_array((col_diag, [0]), shape=(n_cols, n_cols)) an = r @ X @ c else: an = row_diag[:, np.newaxis] * X * col_diag diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 43fdc39c4dccd..877c849a10ef7 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -8,7 +8,7 @@ import numpy as np from scipy.linalg import LinAlgError, qr, svd -from scipy.sparse import csc_matrix +from scipy.sparse import csc_array from sklearn.base import BaseEstimator, ClusterMixin, _fit_context from sklearn.cluster._kmeans import k_means @@ -160,7 +160,7 @@ def discretize( t_discrete = np.dot(vectors, rotation) labels = t_discrete.argmax(axis=1) - vectors_discrete = csc_matrix( + vectors_discrete = csc_array( (np.ones(len(labels)), (np.arange(0, n_samples), labels)), shape=(n_samples, n_components), ) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 0ab602d32d133..3e14344119a4f 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from scipy import sparse as sp from sklearn.base import clone from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus @@ -25,6 +24,7 @@ from sklearn.metrics import pairwise_distances, pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( assert_allclose, 
assert_array_equal, @@ -1030,9 +1030,7 @@ def test_euclidean_distance(dtype, squared, global_random_seed): # Check that the _euclidean_(dense/sparse)_dense helpers produce correct # results rng = np.random.RandomState(global_random_seed) - a_sparse = sp.random( - 1, 100, density=0.5, format="csr", random_state=rng, dtype=dtype - ) + a_sparse = _sparse_random((1, 100), density=0.5, format="csr", rng=rng, dtype=dtype) a_dense = a_sparse.toarray().reshape(-1) b = rng.randn(100).astype(dtype, copy=False) b_squared_norm = (b**2).sum() @@ -1055,8 +1053,8 @@ def test_euclidean_distance(dtype, squared, global_random_seed): def test_inertia(dtype, global_random_seed): # Check that the _inertia_(dense/sparse) helpers produce correct results. rng = np.random.RandomState(global_random_seed) - X_sparse = sp.random( - 100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype + X_sparse = _sparse_random( + (100, 10), density=0.5, format="csr", rng=rng, dtype=dtype ) X_dense = X_sparse.toarray() sample_weight = rng.randn(100).astype(dtype, copy=False) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 0fc8a81013c9d..789d64077d455 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -34,6 +34,7 @@ check_recorded_metadata, ) from sklearn.utils._indexing import _safe_indexing +from sklearn.utils._sparse import _sparse_eye from sklearn.utils._testing import ( _convert_container, assert_allclose_dense_sparse, @@ -74,7 +75,7 @@ def fit(self, X, y=None): def transform(self, X, y=None): n_samples = len(X) - return self.csr_container(sparse.eye(n_samples, n_samples)) + return self.csr_container(_sparse_eye(n_samples)) class TransNo2D(BaseEstimator): @@ -470,7 +471,7 @@ def test_column_transformer_output_indices_df(): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_column_transformer_sparse_array(csr_container): - X_sparse = csr_container(sparse.eye(3, 2)) + X_sparse = csr_container(_sparse_eye(3, 2)) # no distinction between 1D and 2D X_res_first = X_sparse[:, [0]] diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 311dc6d8db993..2df375e410c8b 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -16,6 +16,7 @@ from sklearn.externals._arff import ArffSparseDataType from sklearn.utils._chunking import chunk_generator, get_chunk_n_rows from sklearn.utils._optional_dependencies import check_pandas_support +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils.fixes import pd_fillna @@ -262,12 +263,12 @@ def _io_to_generator(gzip_file): arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select) num_obs = max(arff_data[1]) + 1 X_shape = (num_obs, len(feature_indices_to_select)) - X = sp.sparse.coo_matrix( + X = sp.sparse.coo_array( (arff_data_X[0], (arff_data_X[1], arff_data_X[2])), shape=X_shape, dtype=np.float64, ) - X = X.tocsr() + X = _align_api_if_sparse(X.tocsr()) y = _sparse_data_to_array(arff_data, target_indices_to_select) else: # This should never happen diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index c5be518a1d711..20ff47b0298c0 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -29,6 +29,7 @@ from sklearn.utils import Bunch from sklearn.utils import shuffle as shuffle_ from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils._sparse import 
_align_api_if_sparse # The original vectorized data can be found at: # http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt0.dat.gz @@ -285,7 +286,7 @@ def fetch_rcv1( # reorder categories in lexicographic order order = np.argsort(categories) categories = categories[order] - y = sp.csr_matrix(y[:, order]) + y = sp.csr_array(y[:, order]) joblib.dump(y, sample_topics_path, compress=9) joblib.dump(categories, topics_path, compress=9) @@ -314,6 +315,7 @@ def fetch_rcv1( fdescr = load_descr("rcv1.rst") + X = _align_api_if_sparse(X) if return_X_y: return X, y diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 1e5fb76b2df42..f16f2527e040d 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -18,6 +18,12 @@ from sklearn.utils import Bunch, check_array, check_random_state from sklearn.utils import shuffle as util_shuffle from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils._sparse import ( + _align_api_if_sparse, + _sparse_diags, + _sparse_eye, + _sparse_random, +) from sklearn.utils.random import sample_without_replacement @@ -549,10 +555,12 @@ def sample_example(): X_indptr.append(len(X_indices)) Y.append(y) X_data = np.ones(len(X_indices), dtype=np.float64) - X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features)) + X = sp.csr_array((X_data, X_indices, X_indptr), shape=(n_samples, n_features)) X.sum_duplicates() if not sparse: X = X.toarray() + else: + X = _align_api_if_sparse(X) # return_indicator can be True due to backward compatibility if return_indicator in (True, "sparse", "dense"): @@ -1817,13 +1825,12 @@ def make_sparse_spd_matrix( """ random_state = check_random_state(random_state) - chol = -sp.eye(n_dim) - aux = sp.random( - m=n_dim, - n=n_dim, + chol = -_sparse_eye(n_dim) + aux = _sparse_random( + shape=(n_dim, n_dim), density=1 - alpha, - data_rvs=lambda x: random_state.uniform( - low=smallest_coef, high=largest_coef, size=x + data_sampler=lambda size: random_state.uniform( + low=smallest_coef, high=largest_coef, size=size ), random_state=random_state, ) @@ -1839,13 +1846,13 @@ def make_sparse_spd_matrix( if norm_diag: # Form the diagonal vector into a row matrix - d = sp.diags(1.0 / np.sqrt(prec.diagonal())) + d = _sparse_diags(1.0 / np.sqrt(prec.diagonal())) prec = d @ prec @ d if sparse_format is None: return prec.toarray() else: - return prec.asformat(sparse_format) + return _align_api_if_sparse(prec.asformat(sparse_format)) @validate_params( diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 13e5d650dc2cc..5c26e711a054a 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -32,6 +32,7 @@ StrOptions, validate_params, ) +from sklearn.utils._sparse import _align_api_if_sparse @validate_params( @@ -409,9 +410,9 @@ def get_data(): result = [] for data, indices, indptr, y, query_values in r: shape = (indptr.shape[0] - 1, n_features) - X = sp.csr_matrix((data, indices, indptr), shape) + X = sp.csr_array((data, indices, indptr), shape) X.sort_indices() - result += X, y + result += _align_api_if_sparse(X), y if query_id: result.append(query_values) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index 81e8183c6722e..df2ef80ce443b 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ 
b/sklearn/datasets/tests/test_samples_generator.py @@ -323,18 +323,21 @@ def test_make_multilabel_classification_return_indicator(): def test_make_multilabel_classification_return_indicator_sparse(): - for allow_unlabeled, min_length in zip((True, False), (0, 1)): - X, Y = make_multilabel_classification( - n_samples=25, - n_features=20, - n_classes=3, - random_state=0, - return_indicator="sparse", - allow_unlabeled=allow_unlabeled, - ) - assert X.shape == (25, 20), "X shape mismatch" - assert Y.shape == (25, 3), "Y shape mismatch" - assert sp.issparse(Y) + for allow_unlabeled in (True, False): + for sparse_feature in (True, False): + X, Y = make_multilabel_classification( + n_samples=25, + n_features=20, + n_classes=3, + random_state=0, + sparse=sparse_feature, + return_indicator="sparse", + allow_unlabeled=allow_unlabeled, + ) + assert X.shape == (25, 20), "X shape mismatch" + assert Y.shape == (25, 3), "Y shape mismatch" + assert sp.issparse(Y) + assert sp.issparse(X) if sparse_feature else not sp.issparse(X) def test_make_hastie_10_2(): diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 0e1a6979b50d0..c8aea0376d918 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -178,7 +178,7 @@ class IncrementalPCA(_BasePCA): >>> transformer.partial_fit(X[:100, :]) IncrementalPCA(batch_size=200, n_components=7) >>> # or let the fit function itself divide the data into batches - >>> X_sparse = sparse.csr_matrix(X) + >>> X_sparse = sparse.csr_array(X) >>> X_transformed = transformer.fit_transform(X_sparse) >>> X_transformed.shape (1797, 7) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 25efec3d564ad..df271a02987ae 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -25,6 +25,7 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_array, check_random_state, gen_batches from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils.extmath import _randomized_svd, safe_sparse_dot, squared_norm from sklearn.utils.validation import check_is_fitted, check_non_negative, validate_data @@ -196,8 +197,8 @@ def _special_sparse_dot(W, H, X): axis=1 ) - WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape) - return WH.tocsr() + WH = sp.coo_array((dot_vals, (ii, jj)), shape=X.shape) + return _align_api_if_sparse(WH.tocsr()) else: return np.dot(W, H) diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index afef1eaa7164f..5c8f1edfddaf8 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -141,12 +141,12 @@ class to data once, then keep the instance around to do transformations. 
Examples -------- >>> from sklearn.decomposition import TruncatedSVD - >>> from scipy.sparse import csr_matrix + >>> from scipy.sparse import csr_array >>> import numpy as np >>> np.random.seed(0) >>> X_dense = np.random.rand(100, 100) >>> X_dense[:, 2 * np.arange(50)] = 0 - >>> X = csr_matrix(X_dense) + >>> X = csr_array(X_dense) >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) >>> svd.fit(X) TruncatedSVD(n_components=5, n_iter=7, random_state=42) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 2b97138c4dea3..2891a93527e1c 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -4,7 +4,6 @@ import numpy as np import pytest -import scipy as sp from numpy.testing import assert_array_equal from sklearn import config_context, datasets @@ -19,6 +18,7 @@ yield_namespace_device_dtype_combinations, ) from sklearn.utils._array_api import device as array_device +from sklearn.utils._sparse import _sparse_random from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import _array_api_for_tests, assert_allclose from sklearn.utils.estimator_checks import ( @@ -87,17 +87,10 @@ def test_pca_sparse( atol = 1e-12 transform_atol = 1e-10 - random_state = np.random.default_rng(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=density, - ) - ) + rng = np.random.default_rng(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=density)) # Scale the data + vary the column means - scale_vector = random_state.random(X.shape[1]) * scale + scale_vector = rng.random(X.shape[1]) * scale X = X.multiply(scale_vector) pca = PCA( @@ -120,12 +113,7 @@ def test_pca_sparse( # Test transform X2 = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=density, - ) + _sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=density) ) X2d = X2.toarray() @@ -135,23 +123,9 @@ def test_pca_sparse( @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_pca_sparse_fit_transform(global_random_seed, sparse_container): - random_state = np.random.default_rng(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=0.01, - ) - ) - X2 = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - density=0.01, - ) - ) + rng = np.random.default_rng(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=0.01)) + X2 = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng, density=0.01)) pca_fit = PCA(n_components=10, svd_solver="arpack", random_state=global_random_seed) pca_fit_transform = PCA( @@ -170,14 +144,8 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container): @pytest.mark.parametrize("svd_solver", ["randomized", "full"]) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container): - random_state = np.random.RandomState(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - ) - ) + rng = np.random.RandomState(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng)) pca = PCA(n_components=30, 
svd_solver=svd_solver) error_msg_pattern = ( 'PCA only support sparse inputs with the "arpack" and "covariance_eigh"' @@ -192,14 +160,8 @@ def test_sparse_pca_auto_arpack_singluar_values_consistency( global_random_seed, sparse_container ): """Check that "auto" and "arpack" solvers are equivalent for sparse inputs.""" - random_state = np.random.RandomState(global_random_seed) - X = sparse_container( - sp.sparse.random( - SPARSE_M, - SPARSE_N, - random_state=random_state, - ) - ) + rng = np.random.RandomState(global_random_seed) + X = sparse_container(_sparse_random((SPARSE_M, SPARSE_N), rng=rng)) pca_arpack = PCA(n_components=10, svd_solver="arpack").fit(X) pca_auto = PCA(n_components=10, svd_solver="auto").fit(X) assert_allclose(pca_arpack.singular_values_, pca_auto.singular_values_, rtol=5e-3) diff --git a/sklearn/decomposition/tests/test_truncated_svd.py b/sklearn/decomposition/tests/test_truncated_svd.py index 07b35c873ee3e..8eea4fc1f3e6e 100644 --- a/sklearn/decomposition/tests/test_truncated_svd.py +++ b/sklearn/decomposition/tests/test_truncated_svd.py @@ -2,10 +2,10 @@ import numpy as np import pytest -import scipy.sparse as sp from sklearn.decomposition import PCA, TruncatedSVD from sklearn.utils import check_random_state +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import assert_allclose, assert_array_less SVD_SOLVERS = ["arpack", "randomized"] @@ -15,7 +15,7 @@ def X_sparse(): # Make an X that looks somewhat like a small tf-idf matrix. rng = check_random_state(42) - X = sp.random(60, 55, density=0.2, format="csr", random_state=rng) + X = _sparse_random((60, 55), density=0.2, format="csr", rng=rng) X.data[:] = 1 + np.log(X.data) return X diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index e64763123f270..0739b5741ac86 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -26,7 +26,7 @@ from time import time import numpy as np -from scipy.sparse import csc_matrix, csr_matrix, issparse +from scipy.sparse import csc_array, csr_array, issparse from sklearn._loss.loss import ( _LOSSES, @@ -628,7 +628,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. y : array-like of shape (n_samples,) Target values (strings or integers in classification, real numbers @@ -846,8 +846,8 @@ def _fit_stages( verbose_reporter = VerboseReporter(verbose=self.verbose) verbose_reporter.init(self, begin_at_stage) - X_csc = csc_matrix(X) if issparse(X) else None - X_csr = csr_matrix(X) if issparse(X) else None + X_csc = csc_array(X) if issparse(X) else None + X_csr = csr_array(X) if issparse(X) else None if self.n_iter_no_change is not None: loss_history = np.full(self.n_iter_no_change, np.inf) @@ -985,7 +985,7 @@ def _staged_raw_predict(self, X, check_input=True): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. check_input : bool, default=True If False, the input arrays X will not be checked. @@ -1100,7 +1100,7 @@ def apply(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. 
If a sparse matrix is provided, it will - be converted to a sparse ``csr_matrix``. + be converted to a sparse ``csr_array``. Returns ------- @@ -1574,7 +1574,7 @@ def decision_function(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Returns ------- @@ -1604,7 +1604,7 @@ def staged_decision_function(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Yields ------ @@ -1625,7 +1625,7 @@ def predict(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Returns ------- @@ -1650,7 +1650,7 @@ def staged_predict(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Yields ------ @@ -1674,7 +1674,7 @@ def predict_proba(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Returns ------- @@ -1698,7 +1698,7 @@ def predict_log_proba(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Returns ------- @@ -1725,7 +1725,7 @@ def staged_predict_proba(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Yields ------ @@ -2146,7 +2146,7 @@ def predict(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Returns ------- @@ -2170,7 +2170,7 @@ def staged_predict(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csr_matrix``. + to a sparse ``csr_array``. Yields ------ @@ -2190,7 +2190,7 @@ def apply(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will - be converted to a sparse ``csr_matrix``. + be converted to a sparse ``csr_array``. 
Returns ------- diff --git a/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py b/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py index 34c816628ee73..ade61f3f10d43 100644 --- a/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py +++ b/sklearn/externals/_scipy/sparse/csgraph/_laplacian.py @@ -283,19 +283,19 @@ def laplacian( Our final example illustrates the latter for a noisy directed linear graph. - >>> from scipy.sparse import diags, random + >>> from scipy.sparse import diags_array, random_array >>> from scipy.sparse.linalg import lobpcg Create a directed linear graph with ``N=35`` vertices using a sparse adjacency matrix ``G``: >>> N = 35 - >>> G = diags(np.ones(N-1), 1, format="csr") + >>> G = diags_array((np.ones(N-1), 1), format="csr") Fix a random seed ``rng`` and add a random sparse noise to the graph ``G``: >>> rng = np.random.default_rng() - >>> G += 1e-2 * random(N, N, density=0.1, random_state=rng) + >>> G += 1e-2 * random_array((N, N), density=0.1, random_state=rng) Set initial approximations for eigenvectors: diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index f862a03bb1d97..ce16566aafc9e 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -10,7 +10,7 @@ import scipy.sparse as sp from sklearn.base import BaseEstimator, TransformerMixin, _fit_context -from sklearn.utils import check_array, metadata_routing +from sklearn.utils import _align_api_if_sparse, check_array, metadata_routing from sklearn.utils.validation import check_is_fitted @@ -267,7 +267,7 @@ def _transform(self, X, fitting): indices = np.frombuffer(indices, dtype=np.intc) shape = (len(indptr) - 1, len(vocab)) - result_matrix = sp.csr_matrix( + result_matrix = sp.csr_array( (values, indices, indptr), shape=shape, dtype=dtype ) @@ -289,7 +289,7 @@ def _transform(self, X, fitting): self.feature_names_ = feature_names self.vocabulary_ = vocab - return result_matrix + return _align_api_if_sparse(result_matrix) @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 814bf912a42fc..a11c3db59c94f 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -9,7 +9,7 @@ from sklearn.base import BaseEstimator, TransformerMixin, _fit_context from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform -from sklearn.utils import metadata_routing +from sklearn.utils import _align_api_if_sparse, metadata_routing from sklearn.utils._param_validation import Interval, StrOptions @@ -188,14 +188,14 @@ def transform(self, raw_X): if n_samples == 0: raise ValueError("Cannot vectorize empty sequence.") - X = sp.csr_matrix( + X = sp.csr_array( (values, indices, indptr), dtype=self.dtype, shape=(n_samples, self.n_features), ) X.sum_duplicates() # also sorts the indices - return X + return _align_api_if_sparse(X) def __sklearn_tags__(self): tags = super().__sklearn_tags__() diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 020620adf6cfc..32c2765cd841a 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -11,7 +11,7 @@ from scipy import sparse from sklearn.base import BaseEstimator, TransformerMixin, _fit_context -from sklearn.utils import check_array, check_random_state +from sklearn.utils import _align_api_if_sparse, check_array, 
check_random_state from sklearn.utils._param_validation import ( Hidden, Interval, @@ -94,7 +94,7 @@ def _mask_edges_weights(mask, edges, weights=None): def _to_graph( - n_x, n_y, n_z, mask=None, img=None, return_as=sparse.coo_matrix, dtype=None + n_x, n_y, n_z, mask=None, img=None, return_as=sparse.coo_array, dtype=None ): """Auxiliary function for img_to_graph and grid_to_graph""" edges = _make_edges_3d(n_x, n_y, n_z) @@ -127,7 +127,7 @@ def _to_graph( diag_idx = np.arange(n_voxels) i_idx = np.hstack((edges[0], edges[1])) j_idx = np.hstack((edges[1], edges[0])) - graph = sparse.coo_matrix( + graph = sparse.coo_array( ( np.hstack((weights, weights, diag)), (np.hstack((i_idx, diag_idx)), np.hstack((j_idx, diag_idx))), @@ -137,7 +137,7 @@ def _to_graph( ) if return_as is np.ndarray: return graph.toarray() - return return_as(graph) + return _align_api_if_sparse(return_as(graph)) @validate_params( @@ -149,7 +149,7 @@ def _to_graph( }, prefer_skip_nested_validation=True, ) -def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): +def img_to_graph(img, *, mask=None, return_as=sparse.coo_array, dtype=None): """Graph of the pixel-to-pixel gradient connections. Edges are weighted with the gradient values. @@ -165,7 +165,7 @@ def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): An optional mask of the image, to consider only part of the pixels. return_as : np.ndarray or a sparse matrix class, \ - default=sparse.coo_matrix + default=sparse.coo_array The class to use to build the returned adjacency matrix. dtype : dtype, default=None The data of the returned sparse matrix. By default it is the @@ -203,9 +203,7 @@ def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): }, prefer_skip_nested_validation=True, ) -def grid_to_graph( - n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int -): +def grid_to_graph(n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_array, dtype=int): """Graph of the pixel-to-pixel connections. Edges exist if 2 voxels are connected. @@ -224,7 +222,7 @@ def grid_to_graph( An optional mask of the image, to consider only part of the pixels. return_as : np.ndarray or a sparse matrix class, \ - default=sparse.coo_matrix + default=sparse.coo_array The class to use to build the returned adjacency matrix. dtype : dtype, default=int The data of the returned sparse matrix. By default it is int. 
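The `sparse_interface` option added to `sklearn/_config.py` above is the switch that `_align_api_if_sparse` consults at the return sites touched throughout this patch. A minimal usage sketch, based only on the docstrings added in this diff; the expectation that outputs become `csr_array` under `"sparray"` is an assumption about `_align_api_if_sparse`, whose implementation is not shown in this section:

    import scipy.sparse as sp

    from sklearn import config_context, set_config
    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["sparse arrays", "sparse matrices"]

    # Default: sparse outputs keep the legacy spmatrix interface.
    X = CountVectorizer().fit_transform(docs)
    assert sp.issparse(X)

    # Opt in to SciPy sparse arrays globally ...
    set_config(sparse_interface="sparray")

    # ... or only within a context. CountVectorizer.fit_transform routes its
    # result through _align_api_if_sparse (see the text.py hunks below), so
    # the returned object should follow whichever interface is configured.
    with config_context(sparse_interface="spmatrix"):
        X_legacy = CountVectorizer().fit_transform(docs)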
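Most mechanical changes in this patch follow one pattern: `scipy.sparse.random(m, n, ...)` becomes the private helper `_sparse_random((m, n), ...)`, which takes the shape as a single tuple. A sketch of that pattern; the helper's signature is inferred purely from the call sites in this diff (both `random_state=` and `rng=` seeding spellings appear, and `make_sparse_spd_matrix` passes a `data_sampler=` callable), so treat the keywords as assumptions rather than a documented API:

    import numpy as np

    from sklearn.utils._sparse import _sparse_random

    # Old spelling (scipy.sparse.random):
    #     X = sp.random(100, 20, density=0.05, format="csr", random_state=0)
    # New spelling, shape passed as one tuple:
    X = _sparse_random((100, 20), density=0.05, format="csr", random_state=0)

    # Test call sites in this diff seed through `rng=` and may fix the dtype:
    rng = np.random.RandomState(0)
    X2 = _sparse_random((100, 20), density=0.05, format="csr", rng=rng, dtype=np.float32)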
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 90c51d668f6c0..e73f00b298cf2 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -4,6 +4,7 @@ from sklearn.feature_extraction import FeatureHasher from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform +from sklearn.utils._sparse import SCIPY_VERSION_BELOW_1_12 def test_feature_hasher_dicts(): @@ -37,8 +38,12 @@ def test_feature_hasher_strings(): assert X.shape[0] == len(raw_X) assert X.shape[1] == n_features - assert X[0].sum() == 4 - assert X[1].sum() == 3 + if SCIPY_VERSION_BELOW_1_12: + assert X[[0], :].sum() == 4 + assert X[[1], :].sum() == 3 + else: + assert X[0].sum() == 4 + assert X[1].sum() == 3 assert X.nnz == 6 diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 00b94831767b5..a8bac941dedc8 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -27,6 +27,8 @@ from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split from sklearn.pipeline import Pipeline from sklearn.svm import LinearSVC +from sklearn.utils import _align_api_if_sparse +from sklearn.utils._sparse import SCIPY_VERSION_BELOW_1_12 from sklearn.utils._testing import ( assert_allclose_dense_sparse, assert_almost_equal, @@ -658,8 +660,12 @@ def test_hashing_vectorizer(): assert np.max(X.data) < 1 # Check that the rows are normalized - for i in range(X.shape[0]): - assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0) + if SCIPY_VERSION_BELOW_1_12: + for i in range(X.shape[0]): + assert_almost_equal(np.linalg.norm(X[[0]].data, 2), 1.0) + else: + for i in range(X.shape[0]): + assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0) # Check vectorization with some non-default parameters v = HashingVectorizer(ngram_range=(1, 2), norm="l1") @@ -677,8 +683,12 @@ def test_hashing_vectorizer(): assert np.max(X.data) < 1 # Check that the rows are normalized - for i in range(X.shape[0]): - assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0) + if SCIPY_VERSION_BELOW_1_12: + for i in range(X.shape[0]): + assert_almost_equal(np.linalg.norm(X[[0]].data, 1), 1.0) + else: + for i in range(X.shape[0]): + assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0) def test_feature_names(): @@ -1611,7 +1621,14 @@ def test_tfidf_transformer_copy(csr_container): assert X_transform is not X_csr X_transform = transformer.transform(X_csr, copy=False) - assert X_transform is X_csr + # allow for config["sparse_interface"] to change output type + # there should be no data copied, but the `id` will change. 
+ if _align_api_if_sparse(X_csr) is X_csr: + assert X_transform is X_csr + else: + assert X_transform is not X_csr + assert X_transform.indptr is X_csr.indptr + with pytest.raises(AssertionError): assert_allclose_dense_sparse(X_csr, X_csr_original) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ad924c00f3523..24932ee4be3b9 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -28,6 +28,7 @@ from sklearn.preprocessing import normalize from sklearn.utils import metadata_routing from sklearn.utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from sklearn.utils._sparse import SCIPY_VERSION_BELOW_1_12, _align_api_if_sparse from sklearn.utils.fixes import _IS_32BIT from sklearn.utils.validation import ( FLOAT_DTYPES, @@ -889,7 +890,7 @@ def transform(self, X): X.data.fill(1) if self.norm is not None: X = normalize(X, norm=self.norm, copy=False) - return X + return _align_api_if_sparse(X) def fit_transform(self, X, y=None): """Transform a sequence of documents to a document-term matrix. @@ -939,7 +940,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): r"""Convert a collection of text documents to a matrix of token counts. This implementation produces a sparse representation of the counts using - scipy.sparse.csr_matrix. + scipy.sparse.csr_array. If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will @@ -1310,13 +1311,13 @@ def _count_vocab(self, raw_documents, fixed_vocab): indptr = np.asarray(indptr, dtype=indices_dtype) values = np.frombuffer(values, dtype=np.intc) - X = sp.csr_matrix( + X = sp.csr_array( (values, j_indices, indptr), shape=(len(indptr) - 1, len(vocabulary)), dtype=self.dtype, ) X.sort_indices() - return vocabulary, X + return vocabulary, _align_api_if_sparse(X) def fit(self, raw_documents, y=None): """Learn a vocabulary dictionary of all tokens in the raw documents. @@ -1403,7 +1404,7 @@ def fit_transform(self, raw_documents, y=None): X = self._sort_features(X, vocabulary) self.vocabulary_ = vocabulary - return X + return _align_api_if_sparse(X) def transform(self, raw_documents): """Transform documents to document-term matrix. @@ -1431,7 +1432,7 @@ def transform(self, raw_documents): _, X = self._count_vocab(raw_documents, fixed_vocab=True) if self.binary: X.data.fill(1) - return X + return _align_api_if_sparse(X) def inverse_transform(self, X): """Return terms per document with nonzero entries in X. 
@@ -1456,8 +1457,13 @@ def inverse_transform(self, X): inverse_vocabulary = terms[np.argsort(indices)] if sp.issparse(X): + if SCIPY_VERSION_BELOW_1_12: + return [ + inverse_vocabulary[X[[i], :].nonzero()[-1]].ravel() + for i in range(n_samples) + ] return [ - inverse_vocabulary[X[i, :].nonzero()[1]].ravel() + inverse_vocabulary[X[i, :].nonzero()[-1]].ravel() for i in range(n_samples) ] else: @@ -1665,7 +1671,7 @@ def fit(self, X, y=None): self, X, accept_sparse=("csr", "csc"), accept_large_sparse=not _IS_32BIT ) if not sp.issparse(X): - X = sp.csr_matrix(X) + X = sp.csr_array(X) dtype = X.dtype if X.dtype in (np.float64, np.float32) else np.float64 if self.use_idf: @@ -1716,7 +1722,7 @@ def transform(self, X, copy=True): reset=False, ) if not sp.issparse(X): - X = sp.csr_matrix(X, dtype=X.dtype) + X = sp.csr_array(X, dtype=X.dtype) if self.sublinear_tf: np.log(X.data, X.data) @@ -1730,7 +1736,7 @@ def transform(self, X, copy=True): if self.norm is not None: X = normalize(X, norm=self.norm, copy=False) - return X + return _align_api_if_sparse(X) def __sklearn_tags__(self): tags = super().__sklearn_tags__() diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 3c12cd035d5c8..de602aa00f3b9 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -8,11 +8,12 @@ from operator import attrgetter import numpy as np -from scipy.sparse import csc_matrix, issparse +from scipy.sparse import csc_array, issparse from sklearn.base import TransformerMixin from sklearn.utils import _safe_indexing, check_array, safe_sqr from sklearn.utils._set_output import _get_output_config +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils._tags import get_tags from sklearn.utils.validation import ( _check_feature_names_in, @@ -153,12 +154,12 @@ def inverse_transform(self, X): it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1)) col_nonzeros = it.ravel() indptr = np.concatenate([[0], np.cumsum(col_nonzeros)]) - Xt = csc_matrix( + Xt = csc_array( (X.data, X.indices, indptr), shape=(X.shape[0], len(indptr) - 1), dtype=X.dtype, ) - return Xt + return _align_api_if_sparse(Xt) support = self.get_support() X = check_array(X, dtype=None) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 57f5a2daa7e19..ef5e9ec78f0ac 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -15,6 +15,7 @@ from sklearn.utils._mask import _get_mask from sklearn.utils._missing import is_pandas_na, is_scalar_nan from sklearn.utils._param_validation import MissingValues, StrOptions +from sklearn.utils._sparse import SCIPY_VERSION_BELOW_1_12, _align_api_if_sparse from sklearn.utils.fixes import _mode from sklearn.utils.sparsefuncs import _get_median from sklearn.utils.validation import ( @@ -467,16 +468,20 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): if strategy == "constant": # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic # for empty features to drop them later. - if not self.keep_empty_features and any( - [all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])] - ): - warnings.warn( - "Currently, when `keep_empty_feature=False` and " - '`strategy="constant"`, empty features are not dropped. ' - "This behaviour will change in version 1.8. 
Set " - "`keep_empty_feature=True` to preserve this behaviour.", - FutureWarning, - ) + if not self.keep_empty_features: + mm = missing_mask + if SCIPY_VERSION_BELOW_1_12: + all_mm = (all(mm[:, [i]].data) for i in range(mm.shape[1])) + else: + all_mm = (all(mm[:, i].data) for i in range(mm.shape[1])) + if any(all_mm): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) # for constant strategy, self.statistics_ is used to store # fill_value in each column @@ -925,7 +930,7 @@ def _get_missing_features_info(self, X): n_missing = imputer_mask.sum(axis=0) if self.sparse is True: - imputer_mask = sp.csc_matrix(imputer_mask) + imputer_mask = _align_api_if_sparse(sp.csc_array(imputer_mask)) if self.features == "all": features_indices = np.arange(X.shape[1]) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 16501b0550364..44be6e4d37856 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -21,6 +21,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline, make_union from sklearn.random_projection import _sparse_random_matrix +from sklearn.utils._sparse import SCIPY_VERSION_BELOW_1_12 from sklearn.utils._testing import ( _convert_container, assert_allclose, @@ -1766,9 +1767,12 @@ def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_feat else: X_imputed = getattr(imputer, method)(X) assert X_imputed.shape == X.shape - constant_feature = ( - X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] - ) + if SCIPY_VERSION_BELOW_1_12 and array_type == "sparse": + constant_feature = X_imputed[:, [0]].toarray() + else: + constant_feature = ( + X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] + ) assert_array_equal(constant_feature, fill_value) @@ -1787,9 +1791,11 @@ def test_simple_imputer_keep_empty_features(strategy, array_type, keep_empty_fea X_imputed = getattr(imputer, method)(X) if keep_empty_features: assert X_imputed.shape == X.shape - constant_feature = ( - X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] - ) + if SCIPY_VERSION_BELOW_1_12 and array_type == "sparse": + constant_feature = X_imputed[:, [0]].toarray() + else: + col0 = X_imputed[:, 0] + constant_feature = col0.toarray() if array_type == "sparse" else col0 assert_array_equal(constant_feature, 0) else: assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index bd60f8494bf61..ea0edd3b3f1d6 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -22,7 +22,7 @@ PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels, ) -from sklearn.utils import check_random_state +from sklearn.utils import _align_api_if_sparse, check_random_state from sklearn.utils._param_validation import Interval, StrOptions from sklearn.utils.extmath import safe_sparse_dot from sklearn.utils.validation import ( @@ -807,8 +807,10 @@ def _transform_sparse(X, sample_steps, sample_interval): indptr = X.indptr.copy() data_step = np.sqrt(X.data * sample_interval) - X_step = sp.csr_matrix( - (data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False + X_step = _align_api_if_sparse( + sp.csr_array( + (data_step, indices, indptr), shape=X.shape, dtype=X.dtype, 
copy=False + ) ) X_new = [X_step] @@ -819,14 +821,24 @@ def _transform_sparse(X, sample_steps, sample_interval): factor_nz = np.sqrt(step_nz / np.cosh(np.pi * j * sample_interval)) data_step = factor_nz * np.cos(j * log_step_nz) - X_step = sp.csr_matrix( - (data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False + X_step = _align_api_if_sparse( + sp.csr_array( + (data_step, indices, indptr), + shape=X.shape, + dtype=X.dtype, + copy=False, + ) ) X_new.append(X_step) data_step = factor_nz * np.sin(j * log_step_nz) - X_step = sp.csr_matrix( - (data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False + X_step = _align_api_if_sparse( + sp.csr_array( + (data_step, indices, indptr), + shape=X.shape, + dtype=X.dtype, + copy=False, + ) ) X_new.append(X_step) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 6f34a63d3dac6..d09eab74abcc0 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -22,7 +22,7 @@ RegressorMixin, _fit_context, ) -from sklearn.utils import check_array, check_random_state +from sklearn.utils import _align_api_if_sparse, check_array, check_random_state from sklearn.utils._array_api import ( _asarray_with_order, _average, @@ -249,7 +249,7 @@ def _rescale_data(X, y, sample_weight, inplace=False): sample_weight_sqrt = xp.sqrt(sample_weight) if sp.issparse(X) or sp.issparse(y): - sw_matrix = sparse.dia_matrix( + sw_matrix = sparse.dia_array( (sample_weight_sqrt, 0), shape=(n_samples, n_samples) ) @@ -274,7 +274,7 @@ def _rescale_data(X, y, sample_weight, inplace=False): y = y * sample_weight_sqrt else: y = y * sample_weight_sqrt[:, None] - return X, y, sample_weight_sqrt + return _align_api_if_sparse(X), _align_api_if_sparse(y), sample_weight_sqrt class LinearModel(BaseEstimator, metaclass=ABCMeta): @@ -462,7 +462,7 @@ def sparsify(self): """ msg = "Estimator, %(name)s, must be fitted before sparsifying." check_is_fitted(self, msg=msg) - self.coef_ = sp.csr_matrix(self.coef_) + self.coef_ = _align_api_if_sparse(sp.csr_array(self.coef_)) return self diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 0db90c7b21b02..98e5193e9ed6e 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -31,6 +31,7 @@ StrOptions, validate_params, ) +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils.extmath import safe_sparse_dot from sklearn.utils.metadata_routing import _routing_enabled, process_routing from sklearn.utils.parallel import Parallel, delayed @@ -1134,7 +1135,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): @property def sparse_coef_(self): """Sparse representation of the fitted `coef_`.""" - return sparse.csr_matrix(self.coef_) + return _align_api_if_sparse(sparse.csr_array(np.atleast_2d(self.coef_))) def _decision_function(self, X): """Decision function of the linear model. 
diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index b9cb1fa35056f..37c9d740e3ca6 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -25,7 +25,7 @@ def sandwich_dot(X, W): n_samples = X.shape[0] if sparse.issparse(X): return ( - X.T @ sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)) @ X + X.T @ sparse.dia_array((W, 0), shape=(n_samples, n_samples)) @ X ).toarray() else: # np.einsum may use less memory but the following, using BLAS matrix @@ -726,7 +726,7 @@ def gradient_hessian_product( hessian_sum = hess_pointwise.sum() if sparse.issparse(X): hX = ( - sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples)) + sparse.dia_array((hess_pointwise, 0), shape=(n_samples, n_samples)) @ X ) else: diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index aba8c3e642ac1..e4e6f059be52a 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -13,6 +13,7 @@ from sklearn.linear_model._base import LinearModel from sklearn.utils import _safe_indexing from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._sparse import _sparse_eye from sklearn.utils.fixes import parse_version, sp_version from sklearn.utils.validation import _check_sample_weight, validate_data @@ -240,9 +241,9 @@ def fit(self, X, y, sample_weight=None): # even for optimization problems parametrized using dense numpy arrays. # Therefore, we work with CSC matrices as early as possible to limit # unnecessary repeated memory copies. - eye = sparse.eye(n_indices, dtype=X.dtype, format="csc") + eye = _sparse_eye(n_indices, dtype=X.dtype, format="csc") if self.fit_intercept: - ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype)) + ones = sparse.csc_array(np.ones(shape=(n_indices, 1), dtype=X.dtype)) A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc") else: A_eq = sparse.hstack([X, -X, eye, -eye], format="csc") diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 07fca7e7ce55a..065a05ced2876 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1847,7 +1847,7 @@ def _compute_gram(self, X, sqrt_sw): return safe_sparse_dot(X, X.T, dense_output=True), X_mean # X is sparse n_samples = X.shape[0] - sample_weight_matrix = sparse.dia_matrix( + sample_weight_matrix = sparse.dia_array( (sqrt_sw, 0), shape=(n_samples, n_samples) ) X_weighted = sample_weight_matrix.dot(X) @@ -1895,7 +1895,7 @@ def _compute_covariance(self, X, sqrt_sw): return safe_sparse_dot(X.T, X, dense_output=True), X_mean # this function only gets called for sparse X n_samples = X.shape[0] - sample_weight_matrix = sparse.dia_matrix( + sample_weight_matrix = sparse.dia_array( (sqrt_sw, 0), shape=(n_samples, n_samples) ) X_weighted = sample_weight_matrix.dot(X) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index d96ec48737736..476d177650de9 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -15,6 +15,7 @@ make_dataset, ) from sklearn.preprocessing import add_dummy_feature +from sklearn.utils._sparse import _sparse_eye from sklearn.utils._testing import ( assert_allclose, assert_array_almost_equal, @@ -98,7 +99,7 @@ def test_linear_regression_sample_weights( def test_raises_value_error_if_positive_and_sparse(): error_msg = "Sparse data was passed for X, but dense data is required." 
# X must not be sparse if positive == True - X = sparse.eye(10) + X = _sparse_eye(10) y = np.ones(10) reg = LinearRegression(positive=True) @@ -148,7 +149,7 @@ def test_linear_regression_sparse(global_random_seed): # Test that linear regression also works with sparse data rng = np.random.RandomState(global_random_seed) n = 100 - X = sparse.eye(n, n) + X = _sparse_eye(n, n) beta = rng.rand(n) y = X @ beta diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 70226210c010d..547610242ebe1 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -1468,7 +1468,7 @@ def test_enet_cv_sample_weight_consistency( assert_allclose(reg.intercept_, intercept) -@pytest.mark.parametrize("X_is_sparse", [False, True]) +@pytest.mark.parametrize("X_is_sparse", [False, sparse.csc_array, sparse.csc_matrix]) @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("sample_weight", [np.array([10, 1, 10, 1]), None]) def test_enet_alpha_max_sample_weight(X_is_sparse, fit_intercept, sample_weight): @@ -1476,7 +1476,7 @@ def test_enet_alpha_max_sample_weight(X_is_sparse, fit_intercept, sample_weight) beta = np.array([1, 1]) y = X @ beta if X_is_sparse: - X = sparse.csc_matrix(X) + X = X_is_sparse(X) # Test alpha_max makes coefs zero. reg = ElasticNetCV(alphas=1, cv=2, eps=1, fit_intercept=fit_intercept) reg.fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 80b69adf99b99..459baaa8ea351 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -22,6 +22,7 @@ from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, scale from sklearn.svm import OneClassSVM from sklearn.utils import get_tags +from sklearn.utils._sparse import _align_api_if_sparse, _sparse_random from sklearn.utils._testing import ( assert_allclose, assert_almost_equal, @@ -42,48 +43,49 @@ def _update_kwargs(kwargs): class _SparseSGDClassifier(linear_model.SGDClassifier): def fit(self, X, y, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return super().fit(X, y, *args, **kw) def partial_fit(self, X, y, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return super().partial_fit(X, y, *args, **kw) def decision_function(self, X): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return super().decision_function(X) def predict_proba(self, X): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return super().predict_proba(X) class _SparseSGDRegressor(linear_model.SGDRegressor): def fit(self, X, y, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return linear_model.SGDRegressor.fit(self, X, y, *args, **kw) def partial_fit(self, X, y, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return linear_model.SGDRegressor.partial_fit(self, X, y, *args, **kw) def decision_function(self, X, *args, **kw): # XXX untested as of v0.22 - X = sp.csr_matrix(X) - return linear_model.SGDRegressor.decision_function(self, X, *args, **kw) + return linear_model.SGDRegressor.decision_function( + self, _align_api_if_sparse(X), *args, **kw + ) class _SparseSGDOneClassSVM(linear_model.SGDOneClassSVM): def fit(self, X, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return 
linear_model.SGDOneClassSVM.fit(self, X, *args, **kw) def partial_fit(self, X, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return linear_model.SGDOneClassSVM.partial_fit(self, X, *args, **kw) def decision_function(self, X, *args, **kw): - X = sp.csr_matrix(X) + X = _align_api_if_sparse(sp.csr_array(X)) return linear_model.SGDOneClassSVM.decision_function(self, X, *args, **kw) @@ -2067,7 +2069,7 @@ def test_SGDClassifier_fit_for_all_backends(backend): # Create a classification problem with 50000 features and 20 classes. Using # loky or multiprocessing this make the clf.coef_ exceed the threshold # above which memmaping is used in joblib and loky (1MB as of 2018/11/1). - X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state) + X = _sparse_random((500, 2000), density=0.02, format="csr", rng=random_state) y = random_state.choice(20, 500) # Begin by fitting a SGD classifier sequentially diff --git a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py index 1aab9babeeb40..21cd0a8bc1508 100644 --- a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py @@ -6,6 +6,7 @@ from sklearn.datasets import make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( assert_almost_equal, assert_array_almost_equal, @@ -375,7 +376,7 @@ def test_sparse_read_only_buffer(copy_X): rng = np.random.RandomState(0) clf = ElasticNet(alpha=0.1, copy_X=copy_X, random_state=rng) - X = sp.random(100, 20, format="csc", random_state=rng) + X = _sparse_random((100, 20), format="csc", rng=rng) # Make X.data read-only X.data = create_memmap_backed_data(X.data) diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 07ef626ab8101..65c99dd2cb71c 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -21,6 +21,7 @@ from sklearn.neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph from sklearn.preprocessing import KernelCenterer from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._sparse import _ensure_sparse_index_int32 from sklearn.utils.graph import _fix_connected_components from sklearn.utils.validation import check_is_fitted @@ -297,6 +298,7 @@ def _fit_transform(self, X): **self.nbrs_.effective_metric_params_, ) + _ensure_sparse_index_int32(nbg) self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False) if self.nbrs_._fit_X.dtype == np.float32: diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index aae947bbbf171..b8fb9c777043b 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -7,7 +7,7 @@ import numpy as np from scipy.linalg import eigh, qr, solve, svd -from scipy.sparse import csr_matrix, eye, lil_matrix +from scipy.sparse import csr_array, lil_array from scipy.sparse.linalg import eigsh from sklearn.base import ( @@ -21,7 +21,9 @@ from sklearn.utils import check_array, check_random_state from sklearn.utils._arpack import _init_arpack_v0 from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils._sparse import _align_api_if_sparse, _sparse_eye from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.fixes 
import SCIPY_VERSION_BELOW_1_15 from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data @@ -118,7 +120,8 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): ind = knn.kneighbors(X, return_distance=False)[:, 1:] data = barycenter_weights(X, X, ind, reg=reg) indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) - return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples)) + csr = csr_array((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples)) + return _align_api_if_sparse(csr) def null_space( @@ -229,7 +232,7 @@ def _locally_linear_embedding( ) M_sparse = eigen_solver != "dense" - M_container_constructor = lil_matrix if M_sparse else np.zeros + M_container_constructor = lil_array if M_sparse else np.zeros if method == "standard": W = barycenter_kneighbors_graph( @@ -239,8 +242,8 @@ def _locally_linear_embedding( # we'll compute M = (I-W)'(I-W) # depending on the solver, we'll do this differently if M_sparse: - M = eye(*W.shape, format=W.format) - W - M = M.T @ M + M = _sparse_eye(*W.shape, format=W.format, dtype=W.dtype) - W + M = M.T @ M # M = (I - W)' (I - W) = W' W - W' - W + I else: M = (W.T @ W - W.T - W).toarray() M.flat[:: M.shape[0] + 1] += 1 # M = W' W - W' - W + I @@ -395,8 +398,12 @@ def _locally_linear_embedding( nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T) Wi_sum1 = Wi.sum(1) - M[i, neighbors[i]] -= Wi_sum1 - M[neighbors[i], [i]] -= Wi_sum1 + if SCIPY_VERSION_BELOW_1_15: + M[[i], neighbors[i]] -= Wi_sum1 + M[neighbors[i], [i]] -= Wi_sum1 + else: + M[i, neighbors[i]] -= Wi_sum1 + M[neighbors[i], i] -= Wi_sum1 M[i, i] += s_i elif method == "ltsa": @@ -432,7 +439,7 @@ def _locally_linear_embedding( M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors) if M_sparse: - M = M.tocsr() + M = _align_api_if_sparse(M.tocsr()) return null_space( M, diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 39310232269e8..678cda366b3f2 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -18,6 +18,7 @@ from sklearn.utils import check_array, check_random_state, check_symmetric from sklearn.utils._arpack import _init_arpack_v0 from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils._sparse import _sparse_eye from sklearn.utils.extmath import _deterministic_vector_sign_flip from sklearn.utils.fixes import laplacian as csgraph_laplacian from sklearn.utils.fixes import parse_version, sp_version @@ -306,7 +307,10 @@ def _spectral_embedding( if eigen_solver == "amg": try: - from pyamg import smoothed_aggregation_solver + import pyamg + + smoothed_aggregation_solver = pyamg.smoothed_aggregation_solver + pyamg_supports_sparray = hasattr(pyamg.aggregation.aggregation, "csr_array") except ImportError as e: raise ValueError( "The eigen_solver was set to 'amg', but pyamg is not available." @@ -396,12 +400,16 @@ def _spectral_embedding( # Shift the Laplacian so its diagononal is not all ones. The shift # does change the eigenpairs however, so we'll feed the shifted # matrix to the solver and afterward set it back to the original. 
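[Reviewer note, not part of the patch] The comment above describes a shift/solve/unshift pattern; a minimal standalone sketch of it, assuming SciPy >= 1.12 so that `eye_array` and `random_array` (the backends of the `_sparse_eye`/`_sparse_random` helpers) are available:

```python
import numpy as np
from scipy import sparse

A = sparse.random_array((50, 50), density=0.1, random_state=np.random.default_rng(0))
laplacian = sparse.csr_array((A + A.T) * 0.5)  # symmetric toy Laplacian

diag_shift = 1e-5 * sparse.eye_array(laplacian.shape[0])
laplacian = laplacian + diag_shift  # the shifted matrix goes to the AMG solver
# ... build the smoothed_aggregation_solver preconditioner here ...
laplacian = laplacian - diag_shift  # restore the original eigenpairs
```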
- diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) + diag_shift = 1e-5 * _sparse_eye(laplacian.shape[0]) laplacian += diag_shift if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array): - # `pyamg` does not work with `csr_array` and we need to convert it to a - # `csr_matrix` object. - laplacian = sparse.csr_matrix(laplacian) + # Older versions of `pyamg` may not work with `csr_array`, and newer versions + # may not work with `csr_matrix`; either way we need to convert to CSR. + if pyamg_supports_sparray: + laplacian = sparse.csr_array(laplacian) + else: + laplacian = sparse.csr_matrix(laplacian) + ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr")) laplacian -= diag_shift diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 2f15b22be06ff..efab6b43982fd 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -11,7 +11,7 @@ import numpy as np from scipy import linalg -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import csr_array, issparse from scipy.spatial.distance import pdist, squareform from sklearn.base import ( @@ -27,7 +27,7 @@ from sklearn.manifold import _barnes_hut_tsne, _utils # type: ignore[attr-defined] from sklearn.metrics.pairwise import _VALID_METRICS, pairwise_distances from sklearn.neighbors import NearestNeighbors -from sklearn.utils import check_random_state +from sklearn.utils import _align_api_if_sparse, check_random_state from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, StrOptions, validate_params from sklearn.utils.validation import _num_samples, check_non_negative, validate_data @@ -108,7 +108,7 @@ def _joint_probabilities_nn(distances, desired_perplexity, verbose): assert np.all(np.isfinite(conditional_P)), "All probabilities should be finite" # Symmetrize the joint probability distribution using sparse operations - P = csr_matrix( + P = csr_array( (conditional_P.ravel(), distances.indices, distances.indptr), shape=(n_samples, n_samples), ) @@ -122,7 +122,7 @@ def _joint_probabilities_nn(distances, desired_perplexity, verbose): if verbose >= 2: duration = time() - t0 print("[t-SNE] Computed conditional probabilities in {:.3f}s".format(duration)) - return P + return _align_api_if_sparse(P) def _kl_divergence( diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 4c4115734a404..0743571ed3a1d 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -17,6 +17,7 @@ from sklearn.metrics import normalized_mutual_info_score, pairwise_distances from sklearn.metrics.pairwise import rbf_kernel from sklearn.neighbors import NearestNeighbors +from sklearn.utils._sparse import _sparse_diags, _sparse_random from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal from sklearn.utils.extmath import _deterministic_vector_sign_flip from sklearn.utils.fixes import ( @@ -311,9 +312,9 @@ def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36): def test_spectral_embedding_amg_solver_failure(dtype, seed=36): # Non-regression test for amg solver failure (issue #13393 on github) num_nodes = 100 - X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed) + X = _sparse_random((num_nodes, num_nodes), density=0.1, random_state=seed) X = X.astype(dtype) - upper = sparse.triu(X) - sparse.diags(X.diagonal()) + upper = sparse.triu(X) -
_sparse_diags(X.diagonal()) sym_matrix = upper + upper.T embedding = spectral_embedding( sym_matrix, n_components=10, eigen_solver="amg", random_state=0 diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 4f32b889d5b1f..584abf1bf9d5f 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -337,7 +337,7 @@ def test_optimization_minimizes_kl_divergence(): @pytest.mark.parametrize("method", ["exact", "barnes_hut"]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) -def test_fit_transform_csr_matrix(method, csr_container): +def test_fit_transform_csr_sparse(method, csr_container): # TODO: compare results on dense and sparse data as proposed in: # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 # X can be a sparse matrix. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 992885a97e46c..4277f0925e33f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -14,12 +14,13 @@ from numbers import Integral, Real import numpy as np -from scipy.sparse import coo_matrix, csr_matrix, issparse +from scipy.sparse import coo_array, csr_array, issparse from scipy.special import xlogy from sklearn.exceptions import UndefinedMetricWarning from sklearn.preprocessing import LabelBinarizer, LabelEncoder from sklearn.utils import ( + _align_api_if_sparse, assert_all_finite, check_array, check_consistent_length, @@ -153,8 +154,8 @@ def _check_targets(y_true, y_pred, sample_weight=None): # they are passed as a dense arrays? This is not possible for array # API inputs in general hence we only do it for NumPy inputs. But even # for NumPy the usefulness is questionable. - y_true = csr_matrix(y_true) - y_pred = csr_matrix(y_pred) + y_true = _align_api_if_sparse(csr_array(y_true)) + y_pred = _align_api_if_sparse(csr_array(y_pred)) y_type = "multilabel-indicator" return y_type, y_true, y_pred, sample_weight @@ -559,7 +560,7 @@ def confusion_matrix( else: dtype = np.float64 - cm = coo_matrix( + cm = coo_array( (sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels), dtype=dtype, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 2c3ca44047145..2574724e2c3e7 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -17,7 +17,7 @@ from cython cimport final from ...utils._typedefs cimport float64_t, float32_t, intp_t -from scipy.sparse import issparse, csr_matrix +from scipy.sparse import issparse, csr_matrix, csr_array {{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} @@ -124,8 +124,8 @@ cdef class DatasetsPair{{name_suffix}}: return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure that the CSR matrix is indexed with np.int32.""" + def unpack_csr(cls, X: csr_matrix | csr_array): + """Ensure that the CSR matrix/array is indexed with np.int32.""" X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) X_indices = np.asarray(X.indices, dtype=np.int32) X_indptr = np.asarray(X.indptr, dtype=np.int32) @@ -223,8 +223,8 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): def __init__(self, X, Y, {{DistanceMetric}} distance_metric): super().__init__(distance_metric,
n_features=X.shape[1]) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr(Y) @final cdef intp_t n_samples_X(self) noexcept nogil: @@ -269,7 +269,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): Parameters ---------- - X: sparse matrix of shape (n_samples_X, n_features) + X: sparse matrix/array of shape (n_samples_X, n_features) Rows represent vectors. Must be in CSR format. Y: ndarray of shape (n_samples_Y, n_features) @@ -283,7 +283,7 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): def __init__(self, X, Y, {{DistanceMetric}} distance_metric): super().__init__(distance_metric, n_features=X.shape[1]) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr(X) # We support the sparse-dense case by using the sparse-sparse interfaces # of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 1fca2d674720c..814334bf31c43 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -27,7 +27,7 @@ from ...utils._cython_blas cimport ( from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t import numpy as np -from scipy.sparse import issparse, csr_matrix +from scipy.sparse import issparse, csr_matrix, csr_array cdef void _middle_term_sparse_sparse_64( @@ -197,7 +197,7 @@ cdef class MiddleTermComputer{{name_suffix}}: ) @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): + def unpack_csr(cls, X: csr_matrix | csr_array): """Ensure that the CSR matrix is indexed with np.int32.""" X_data = np.asarray(X.data, dtype=np.float64) X_indices = np.asarray(X.indices, dtype=np.int32) @@ -471,8 +471,8 @@ cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{nam n_features, chunk_size, ) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr(Y) cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( self, @@ -559,7 +559,7 @@ cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name n_features, chunk_size, ) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr(X) self.Y = Y self.c_ordered_middle_term = c_ordered_middle_term diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index f0060030d26fd..c3aa2b7d27885 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -16,7 +16,7 @@ import numpy as np from scipy.integrate import trapezoid -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import csr_array, issparse from scipy.stats import rankdata from sklearn.exceptions import UndefinedMetricWarning @@ -1302,7 +1302,7 @@ def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None raise ValueError("{0} format is 
not supported".format(y_type)) if not issparse(y_true): - y_true = csr_matrix(y_true) + y_true = csr_array(y_true) y_score = -y_score @@ -1484,7 +1484,7 @@ def label_ranking_loss(y_true, y_score, *, sample_weight=None): n_samples, n_labels = y_true.shape - y_true = csr_matrix(y_true) + y_true = csr_array(y_true) loss = np.zeros(n_samples) for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 409cd74e4e007..03ce5e72aa8c3 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -17,7 +17,7 @@ from sklearn.metrics.cluster._expected_mutual_info_fast import ( expected_mutual_information, ) -from sklearn.utils import deprecated +from sklearn.utils import _align_api_if_sparse, deprecated from sklearn.utils._array_api import ( _max_precision_float_dtype, get_namespace_and_device, @@ -139,11 +139,11 @@ def contingency_matrix( ------- contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred] Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in - true class :math:`i` and in predicted class :math:`j`. If - ``eps is None``, the dtype of this array will be integer unless set + true class :math:`i` and in predicted class :math:`j`. + If ``eps is None``, the dtype of this array will be integer unless set otherwise with the ``dtype`` argument. If ``eps`` is given, the dtype will be float. - Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``. + If ``sparse=True`` will be a sparse CSR contingency. Examples -------- @@ -166,12 +166,13 @@ def contingency_matrix( # Using coo_matrix to accelerate simple histogram calculation, # i.e. bins are consecutive integers # Currently, coo_matrix is faster than histogram2d for simple cases - contingency = sp.coo_matrix( + contingency = sp.coo_array( (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), shape=(n_classes, n_clusters), dtype=dtype, ) if sparse: + contingency = _align_api_if_sparse(contingency) contingency = contingency.tocsr() contingency.sum_duplicates() else: diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 26dfc968dbb77..b0042b3ecc1dd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -11,7 +11,7 @@ import numpy as np from joblib import effective_n_jobs -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import csr_array, issparse from scipy.spatial import distance from sklearn import config_context @@ -1114,8 +1114,8 @@ def manhattan_distances(X, Y=None): X, Y = check_pairwise_arrays(X, Y) if issparse(X) or issparse(Y): - X = csr_matrix(X, copy=False) - Y = csr_matrix(Y, copy=False) + X = csr_array(X, copy=False) + Y = csr_array(Y, copy=False) X.sum_duplicates() # this also sorts indices in-place Y.sum_duplicates() D = np.zeros((X.shape[0], Y.shape[0])) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index f12335b41c754..0eac509440236 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -514,7 +514,7 @@ def predict(self, X): indices.extend(np.where(_predict_binary(e, X) > thresh)[0]) indptr.append(len(indices)) data = np.ones(len(indices), dtype=int) - indicator = sp.csc_matrix( + indicator = sp.csc_array( (data, indices, indptr), shape=(n_samples, len(self.estimators_)) ) return self.label_binarizer_.inverse_transform(indicator) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index eeee7aa66bfe3..3929594b85077 100644 --- 
a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -12,7 +12,7 @@ import numpy as np from joblib import effective_n_jobs -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import csr_array, issparse from sklearn.base import BaseEstimator, MultiOutputMixin, is_classifier from sklearn.exceptions import DataConversionWarning, EfficiencyWarning @@ -21,7 +21,7 @@ from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors._kd_tree import KDTree -from sklearn.utils import check_array, gen_even_slices, get_tags +from sklearn.utils import _align_api_if_sparse, check_array, gen_even_slices, get_tags from sklearn.utils._param_validation import Interval, StrOptions, validate_params from sklearn.utils.fixes import parse_version, sp_base_version from sklearn.utils.multiclass import check_classification_targets @@ -222,9 +222,9 @@ def sort_graph_by_row_values(graph, copy=False, warn_when_not_sorted=True): Examples -------- - >>> from scipy.sparse import csr_matrix + >>> from scipy.sparse import csr_array >>> from sklearn.neighbors import sort_graph_by_row_values - >>> X = csr_matrix( + >>> X = csr_array( ... [[0., 3., 1.], ... [3., 0., 2.], ... [1., 2., 0.]]) @@ -1013,7 +1013,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): # check the input only in self.kneighbors - # construct CSR matrix representation of the k-NN graph + # construct CSR representation of the k-NN graph if mode == "connectivity": A_ind = self.kneighbors(X, n_neighbors, return_distance=False) n_queries = A_ind.shape[0] @@ -1034,11 +1034,11 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): n_nonzero = n_queries * n_neighbors A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) - kneighbors_graph = csr_matrix( + kneighbors_graph = csr_array( (A_data, A_ind.ravel(), A_indptr), shape=(n_queries, n_samples_fit) ) - return kneighbors_graph + return _align_api_if_sparse(kneighbors_graph) class RadiusNeighborsMixin: @@ -1389,7 +1389,8 @@ def radius_neighbors_graph( A_data = np.ones(len(A_ind)) A_indptr = np.concatenate((np.zeros(1, dtype=int), np.cumsum(n_neighbors))) - return csr_matrix((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit)) + csr = csr_array((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit)) + return _align_api_if_sparse(csr) def __sklearn_tags__(self): tags = super().__sklearn_tags__() diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 316ccbc9ed128..c864989ef99cc 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -987,7 +987,7 @@ def partial_fit(self, X, y=None, sample_weight=None): "instead. See docstring for motivation and alternatives." ) sparse_constructor = ( - sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix + sparse.csr_array if X.format == "csr" else sparse.csc_array ) if self.with_std: @@ -2621,7 +2621,7 @@ def add_dummy_feature(X, value=1.0): row = np.concatenate((np.arange(n_samples), X.row)) # Prepend the dummy feature n_samples times. data = np.concatenate((np.full(n_samples, value), X.data)) - return sparse.coo_matrix((data, (row, col)), shape) + return X.__class__((data, (row, col)), shape) elif X.format == "csc": # Shift index pointers since we need to add n_samples elements. 
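[Reviewer note, not part of the patch] A standalone sketch of the CSC branch below (hypothetical code): the dummy feature becomes column 0, so every index pointer is shifted by `n_samples` and the new entries are stacked in front:

```python
import numpy as np
from scipy import sparse

X = sparse.csc_array(np.array([[0.0, 2.0], [3.0, 0.0]]))
n_samples, value = X.shape[0], 1.0

indptr = np.concatenate(([0], X.indptr + n_samples))  # column 0 owns n_samples entries
indices = np.concatenate((np.arange(n_samples), X.indices))
data = np.concatenate((np.full(n_samples, value), X.data))
X_out = sparse.csc_array((data, indices, indptr), shape=(n_samples, X.shape[1] + 1))

assert np.array_equal(X_out.toarray(), [[1.0, 0.0, 2.0], [1.0, 3.0, 0.0]])
```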
indptr = X.indptr + n_samples @@ -2631,10 +2631,9 @@ def add_dummy_feature(X, value=1.0): indices = np.concatenate((np.arange(n_samples), X.indices)) # Prepend the dummy feature n_samples times. data = np.concatenate((np.full(n_samples, value), X.data)) - return sparse.csc_matrix((data, indices, indptr), shape) + return X.__class__((data, indices, indptr), shape) else: - klass = X.__class__ - return klass(add_dummy_feature(X.tocoo(), value)) + return X.__class__(add_dummy_feature(X.tocoo(), value)) else: return np.hstack((np.full((n_samples, 1), value), X)) @@ -2816,7 +2815,7 @@ def _sparse_fit(self, X, random_state): X : sparse matrix of shape (n_samples, n_features) The data used to scale along the features axis. The sparse matrix needs to be nonnegative. If a sparse matrix is provided, - it will be converted into a sparse ``csc_matrix``. + it will be converted into a SciPy sparse CSC matrix. """ n_samples, n_features = X.shape references = self.references_ * 100 @@ -2862,7 +2861,7 @@ def fit(self, X, y=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse - ``csc_matrix``. Additionally, the sparse matrix needs to be + CSC matrix. Additionally, the sparse matrix needs to be nonnegative if `ignore_implicit_zeros` is False. y : None @@ -3034,7 +3033,7 @@ def transform(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse - ``csc_matrix``. Additionally, the sparse matrix needs to be + CSC matrix. Additionally, the sparse matrix needs to be nonnegative if `ignore_implicit_zeros` is False. Returns @@ -3055,7 +3054,7 @@ def inverse_transform(self, X): X : {array-like, sparse matrix} of shape (n_samples, n_features) The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse - ``csc_matrix``. Additionally, the sparse matrix needs to be + CSC matrix. Additionally, the sparse matrix needs to be nonnegative if `ignore_implicit_zeros` is False. Returns diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 77d9679a29450..83f7bb4142236 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -14,7 +14,7 @@ TransformerMixin, _fit_context, ) -from sklearn.utils import _safe_indexing, check_array +from sklearn.utils import _align_api_if_sparse, _safe_indexing, check_array from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique from sklearn.utils._mask import _get_mask from sklearn.utils._missing import is_scalar_nan @@ -541,8 +541,8 @@ class OneHotEncoder(_BaseEncoder): Support for dropping infrequent categories. sparse_output : bool, default=True - When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, - i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format. + When ``True``, it returns a SciPy sparse matrix/array + in "Compressed Sparse Row" (CSR) format. .. versionadded:: 1.2 `sparse` was renamed to `sparse_output` @@ -1006,8 +1006,7 @@ def transform(self, X): """ Transform X using one-hot encoding. - If `sparse_output=True` (default), it returns an instance of - :class:`scipy.sparse._csr.csr_matrix` (CSR format). + If `sparse_output=True` (default), it returns a SciPy sparse matrix/array in CSR format.
If there are infrequent categories for a feature, set by specifying `max_categories` or `min_frequency`, the infrequent categories are @@ -1079,15 +1078,15 @@ def transform(self, X): np.cumsum(indptr[1:], out=indptr[1:]) data = np.ones(indptr[-1]) - out = sparse.csr_matrix( + out = sparse.csr_array( (data, indices, indptr), shape=(n_samples, feature_indices[-1]), dtype=self.dtype, ) - if not self.sparse_output: - return out.toarray() + if self.sparse_output: + return _align_api_if_sparse(out) else: - return out + return out.toarray() def inverse_transform(self, X): """ diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 5c2ee8f5fce9f..01062ba44c13e 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -11,7 +11,7 @@ import scipy.sparse as sp from sklearn.base import BaseEstimator, TransformerMixin, _fit_context -from sklearn.utils import column_or_1d +from sklearn.utils import _align_api_if_sparse, column_or_1d from sklearn.utils._array_api import device, get_namespace, xpx from sklearn.utils._encode import _encode, _unique from sklearn.utils._param_validation import Interval, validate_params @@ -413,7 +413,7 @@ def inverse_transform(self, Y, threshold=None): ) if self.sparse_input_: - y_inv = sp.csr_matrix(y_inv) + y_inv = _align_api_if_sparse(sp.csr_array(y_inv)) elif sp.issparse(y_inv): y_inv = y_inv.toarray() @@ -540,7 +540,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False) if y_type == "binary": if n_classes == 1: if sparse_output: - return sp.csr_matrix((n_samples, 1), dtype=int) + return _align_api_if_sparse(sp.csr_array((n_samples, 1), dtype=int)) else: Y = np.zeros((len(y), 1), dtype=int) Y += neg_label @@ -569,9 +569,9 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False) data = np.empty_like(indices) data.fill(pos_label) - Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + Y = sp.csr_array((data, indices, indptr), shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": - Y = sp.csr_matrix(y) + Y = sp.csr_array(y) if pos_label != 1: data = np.empty_like(Y.data) data.fill(pos_label) @@ -604,7 +604,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False) else: Y = Y[:, -1].reshape((-1, 1)) - return Y + return _align_api_if_sparse(Y) def _inverse_binarize_multiclass(y, classes): @@ -911,8 +911,10 @@ def _transform(self, y, class_mapping): ) data = np.ones(len(indices), dtype=int) - return sp.csr_matrix( - (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)) + return _align_api_if_sparse( + sp.csr_array( + (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)) + ) ) def inverse_transform(self, yt): diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index acc2aa1138b68..da9cbab25799d 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -21,7 +21,7 @@ _calc_total_nnz, _csr_polynomial_expansion, ) -from sklearn.utils import check_array +from sklearn.utils import _align_api_if_sparse, check_array from sklearn.utils._array_api import ( _is_numpy_namespace, get_namespace_and_device, @@ -81,10 +81,12 @@ def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0): interaction_only, deg, ) - return sparse.csr_matrix( - (expanded_data, expanded_indices, expanded_indptr), - shape=(X.indptr.shape[0] - 1, expanded_col), - dtype=X.dtype, + return _align_api_if_sparse( + 
sparse.csr_array( + (expanded_data, expanded_indices, expanded_indptr), + shape=(X.indptr.shape[0] - 1, expanded_col), + dtype=X.dtype, + ) ) @@ -417,8 +419,7 @@ def transform(self, X): XP : {ndarray, sparse matrix} of shape (n_samples, NP) The matrix of features, where `NP` is the number of polynomial features generated from the combination of inputs. If a sparse - matrix is provided, it will be converted into a sparse - `csr_matrix`. + matrix is provided, it will be converted into CSR format. """ check_is_fitted(self) xp, _, device_ = get_namespace_and_device(X) @@ -439,7 +440,7 @@ def transform(self, X): to_stack = [] if self.include_bias: to_stack.append( - sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + sparse.csr_array(np.ones(shape=(n_samples, 1), dtype=X.dtype)) ) if self._min_degree <= 1 and self._max_degree > 0: to_stack.append(X) @@ -458,7 +459,7 @@ def transform(self, X): cumulative_size += expanded.shape[1] if len(to_stack) == 0: # edge case: deal with empty matrix - XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype) + XP = sparse.csr_array((n_samples, 0), dtype=X.dtype) else: # `scipy.sparse.hstack` breaks in scipy<1.9.2 # when `n_output_features_ > max_int32` @@ -496,7 +497,7 @@ def transform(self, X): out_col = X[:, [col_idx]].multiply(out_col) columns.append(out_col) else: - bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + bias = sparse.csc_array(np.ones((X.shape[0], 1))) columns.append(bias) XP = sparse.hstack(columns, dtype=X.dtype).tocsc() else: @@ -538,7 +539,7 @@ def transform(self, X): current_col = 0 if self._max_degree == 0: - return XP + return _align_api_if_sparse(XP) # degree 1 term XP[:, current_col : current_col + n_features] = X @@ -591,7 +592,7 @@ def transform(self, X): else: Xout = xp.asarray(XP[:, n_XP - n_Xout :], copy=True) XP = Xout - return XP + return _align_api_if_sparse(XP) def __sklearn_tags__(self): tags = super().__sklearn_tags__() @@ -1151,8 +1152,7 @@ def transform(self, X): XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k) # Note: Without converting to lil_matrix we would get: # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity - structure of a csr_matrix is expensive. lil_matrix is more - efficient. + structure of CSR is expensive. LIL is more efficient.
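[Reviewer note, not part of the patch] A minimal sketch of the LIL detour described in the note above: zeroing whole rows changes the sparsity structure, which compressed formats warn about but LIL handles cheaply:

```python
import numpy as np
from scipy import sparse

design = sparse.csr_array(np.arange(12, dtype=float).reshape(4, 3))
outside_range_mask = np.array([False, True, False, True])

design = design.tolil()
design[outside_range_mask, :] = 0  # cheap in LIL, warns in CSR/CSC
design = design.tocsr()

assert not design.toarray()[[1, 3]].any()
```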
if np.any(outside_range_mask): XBS_sparse = XBS_sparse.tolil() XBS_sparse[outside_range_mask, :] = 0 @@ -1298,7 +1298,8 @@ def transform(self, X): elif self.sparse_output: # TODO: Remove conversion to csr, once scipy 1.10 is the minimum version: # Adjust format of XBS to sparse, for scipy versions < 1.10.0: - XBS = sparse.csr_matrix(XBS) + XBS = sparse.csr_array(XBS) + XBS = _align_api_if_sparse(XBS) if self.include_bias: return XBS diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 20712fbbebd0e..b6bb3a241ca10 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -43,6 +43,7 @@ _get_namespace_device_dtype_ids, yield_namespace_device_dtype_combinations, ) +from sklearn.utils._sparse import _sparse_random from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids from sklearn.utils._testing import ( _array_api_for_tests, @@ -2498,7 +2499,7 @@ def test_power_transformer_box_cox_raise_all_nans_col(): @pytest.mark.parametrize( "X_2", - [sparse.random(10, 1, density=0.8, random_state=0)] + [_sparse_random((10, 1), density=0.8, rng=0)] + [ csr_container(np.full((10, 1), fill_value=np.nan)) for csr_container in CSR_CONTAINERS @@ -2507,7 +2508,7 @@ def test_power_transformer_box_cox_raise_all_nans_col(): def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/16448 - X_1 = sparse.random(5, 1, density=0.8) + X_1 = _sparse_random((5, 1), density=0.8) scaler = StandardScaler(with_mean=False) scaler.fit(X_1).partial_fit(X_2) assert np.isfinite(scaler.var_[0]) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 389d6da127f89..b6b691483ef1c 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -40,7 +40,7 @@ _fit_context, ) from sklearn.exceptions import DataDimensionalityWarning -from sklearn.utils import check_random_state +from sklearn.utils import _align_api_if_sparse, check_random_state from sklearn.utils._param_validation import Interval, StrOptions, validate_params from sklearn.utils.extmath import safe_sparse_dot from sklearn.utils.random import sample_without_replacement @@ -297,9 +297,10 @@ def _sparse_random_matrix(n_components, n_features, density="auto", random_state data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1 # build the CSR structure by concatenating the rows - components = sp.csr_matrix( + components = sp.csr_array( (data, indices, indptr), shape=(n_components, n_features) ) + components = _align_api_if_sparse(components) return np.sqrt(1 / density) / np.sqrt(n_components) * components diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 7ff1460b0d8be..9ad40e925ee54 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -463,7 +463,7 @@ class distributions will exceed 1 (normalization may be desired). 
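[Reviewer note, not part of the patch] A quick sketch of the container difference that motivates the `np.atleast_2d` change just below: summing a sparse matrix over an axis returns a 2-D `np.matrix`, while summing a sparse array returns a 1-D ndarray:

```python
import numpy as np
from scipy import sparse

A = np.array([[1.0, 2.0], [3.0, 4.0]])

m = sparse.csr_matrix(A).sum(axis=0)  # np.matrix with shape (1, 2)
a = sparse.csr_array(A).sum(axis=0)   # plain ndarray with shape (2,)

assert np.atleast_2d(m).shape == np.atleast_2d(a).shape == (1, 2)
```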
affinity_matrix = self._get_kernel(self.X_) normalizer = affinity_matrix.sum(axis=0) if sparse.issparse(affinity_matrix): - affinity_matrix.data /= np.diag(np.array(normalizer)) + affinity_matrix.data /= np.diag(np.atleast_2d(normalizer)) else: affinity_matrix /= normalizer[:, np.newaxis] return affinity_matrix diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 4b046aa111250..a11fff374cb1e 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -18,7 +18,8 @@ assert_array_equal, ) -CONSTRUCTOR_TYPES = ("array", "sparse_csr", "sparse_csc") +SPARSE_TYPES = ("sparse_csr", "sparse_csc", "sparse_csr_array", "sparse_csc_array") +CONSTRUCTOR_TYPES = ("array",) + SPARSE_TYPES ESTIMATORS = [ (label_propagation.LabelPropagation, {"kernel": "rbf"}), @@ -126,7 +127,7 @@ def test_label_propagation_closed_form(global_dtype): assert_allclose(expected, clf.label_distributions_, atol=1e-4) -@pytest.mark.parametrize("accepted_sparse_type", ["sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize("accepted_sparse_type", SPARSE_TYPES) @pytest.mark.parametrize("index_dtype", [np.int32, np.int64]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 6c8b981be55b7..cb25ab66bafb6 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -24,6 +24,7 @@ compute_class_weight, ) from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._sparse import SCIPY_VERSION_BELOW_1_12, _align_api_if_sparse from sklearn.utils.extmath import safe_sparse_dot from sklearn.utils.metaestimators import available_if from sklearn.utils.multiclass import ( @@ -64,10 +65,16 @@ def _one_vs_one_coef(dual_coef, n_support, support_vectors): # SVs for class1: sv2 = support_vectors[sv_locs[class2] : sv_locs[class2 + 1], :] - # dual coef for class1 SVs: - alpha1 = dual_coef[class2 - 1, sv_locs[class1] : sv_locs[class1 + 1]] - # dual coef for class2 SVs: - alpha2 = dual_coef[class1, sv_locs[class2] : sv_locs[class2 + 1]] + if SCIPY_VERSION_BELOW_1_12: + # dual coef for class1 SVs: + alpha1 = dual_coef[[class2 - 1], sv_locs[class1] : sv_locs[class1 + 1]] + # dual coef for class2 SVs: + alpha2 = dual_coef[[class1], sv_locs[class2] : sv_locs[class2 + 1]] + else: + # dual coef for class1 SVs: + alpha1 = dual_coef[class2 - 1, sv_locs[class1] : sv_locs[class1 + 1]] + # dual coef for class2 SVs: + alpha2 = dual_coef[class1, sv_locs[class2] : sv_locs[class2 + 1]] # build weight for class1 vs class2 coef.append(safe_sparse_dot(alpha1, sv1) + safe_sparse_dot(alpha2, sv2)) @@ -187,7 +194,7 @@ def fit(self, X, y, sample_weight=None): Notes ----- If X and y are not C-ordered and contiguous arrays of np.float64 and - X is not a scipy.sparse.csr_matrix, X and/or y may be copied. + X is not in sparse CSR format, X and/or y may be copied. If X is a dense array, then the other methods will not support sparse matrices as input.
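[Reviewer note, not part of the patch] For the `SCIPY_VERSION_BELOW_1_12` branches in `_one_vs_one_coef` above, a sketch of the indexing semantics involved (shown with NumPy, whose behavior the newer sparse containers mirror): a scalar row index drops a dimension, while a one-element list keeps the slice two-dimensional:

```python
import numpy as np

dual_coef = np.arange(6, dtype=float).reshape(2, 3)

assert dual_coef[1, 0:2].shape == (2,)      # scalar row index -> 1-D slice
assert dual_coef[[1], 0:2].shape == (1, 2)  # list row index -> 2-D row
```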
@@ -416,13 +423,16 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): dual_coef_indices = np.tile(np.arange(n_SV), n_class) if not n_SV: - self.dual_coef_ = sp.csr_matrix([]) + self.dual_coef_ = _align_api_if_sparse(sp.csr_array([[]])) else: dual_coef_indptr = np.arange( 0, dual_coef_indices.size + 1, dual_coef_indices.size / n_class ) - self.dual_coef_ = sp.csr_matrix( - (dual_coef_data, dual_coef_indices, dual_coef_indptr), (n_class, n_SV) + self.dual_coef_ = _align_api_if_sparse( + sp.csr_array( + (dual_coef_data, dual_coef_indices, dual_coef_indptr), + (n_class, n_SV), + ) ) def predict(self, X): @@ -480,7 +490,7 @@ def _dense_predict(self, X): ) def _sparse_predict(self, X): - # Precondition: X is a csr_matrix of dtype np.float64. + # Precondition: X is CSR sparse of dtype np.float64. kernel = self.kernel if callable(kernel): kernel = "precomputed" @@ -629,9 +639,9 @@ def _validate_for_predict(self, X): reset=False, ) - if self._sparse and not sp.issparse(X): - X = sp.csr_matrix(X) if self._sparse: + if not sp.issparse(X): + X = _align_api_if_sparse(sp.csr_array(X)) X.sort_indices() if sp.issparse(X) and not self._sparse and not callable(self.kernel): diff --git a/sklearn/svm/_libsvm_sparse.pyx b/sklearn/svm/_libsvm_sparse.pyx index 529758061d299..154241c2a94b5 100644 --- a/sklearn/svm/_libsvm_sparse.pyx +++ b/sklearn/svm/_libsvm_sparse.pyx @@ -2,6 +2,7 @@ import numpy as np from scipy import sparse from ..utils._cython_blas cimport _dot from ..utils._typedefs cimport float64_t, int32_t, intp_t +from ..utils._sparse import _align_api_if_sparse cdef extern from *: ctypedef char* const_char_p "const char*" @@ -215,9 +216,9 @@ def libsvm_sparse_train (int n_features, model, n_features, ) - support_vectors_ = sparse.csr_matrix( + support_vectors_ = _align_api_if_sparse(sparse.csr_array( (SV_data, SV_indices, SV_indptr), (SV_len, n_features) - ) + )) # copy model.nSV # TODO: do only in classification diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 0842cf0c82b48..78b63e7f9a3d1 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -191,6 +191,10 @@ def test_clone_empty_array(): clf2 = clone(clf) assert_array_equal(clf.empty.data, clf2.empty.data) + clf = MyEstimator(empty=sp.csr_array(np.array([[0]]))) + clf2 = clone(clf) + assert_array_equal(clf.empty.data, clf2.empty.data) + def test_clone_nan(): # Regression test for cloning estimators with default parameter as np.nan diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index bf35eee623c18..b7beda31c2fd5 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -21,6 +21,7 @@ def test_config_context(): "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } # Not using as a context manager affects nothing @@ -39,6 +40,7 @@ def test_config_context(): "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } assert get_config()["assume_finite"] is False @@ -74,6 +76,7 @@ def test_config_context(): "transform_output": "default", "enable_metadata_routing": False, "skip_parameter_validation": False, + "sparse_interface": "spmatrix", } # No positional arguments diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9d0b2854c3ba0..374fd5dd53238 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -21,7 +21,9 @@ cimport numpy as cnp 
cnp.import_array() from scipy.sparse import issparse -from scipy.sparse import csr_matrix +from scipy.sparse import csr_array + +from ..utils import _align_api_if_sparse from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray @@ -1002,7 +1004,7 @@ cdef class Tree: """ # Check input if not (issparse(X) and X.format == 'csr'): - raise ValueError("X should be in csr_matrix format, got %s" + raise ValueError("X should be in CSR sparse format, got %s" % type(X)) if X.dtype != DTYPE: @@ -1120,17 +1122,17 @@ cdef class Tree: indices = indices[:indptr[n_samples]] cdef intp_t[:] data = np.ones(shape=len(indices), dtype=np.intp) - out = csr_matrix((data, indices, indptr), - shape=(n_samples, self.node_count)) + out = csr_array((data, indices, indptr), + shape=(n_samples, self.node_count)) - return out + return _align_api_if_sparse(out) cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" # Check input if not (issparse(X) and X.format == "csr"): - raise ValueError("X should be in csr_matrix format, got %s" + raise ValueError("X should be in CSR sparse format, got %s" % type(X)) if X.dtype != DTYPE: @@ -1204,10 +1206,10 @@ cdef class Tree: indices = indices[:indptr[n_samples]] cdef intp_t[:] data = np.ones(shape=len(indices), dtype=np.intp) - out = csr_matrix((data, indices, indptr), - shape=(n_samples, self.node_count)) + out = csr_array((data, indices, indptr), + shape=(n_samples, self.node_count)) - return out + return _align_api_if_sparse(out) cpdef compute_node_depths(self): """Compute the depth of each node in a tree. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 87f015ddaa267..ffe68252e0189 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,6 +17,7 @@ from sklearn.utils._mask import safe_mask from sklearn.utils._repr_html.base import _HTMLDocumentationLinkMixin # noqa: F401 from sklearn.utils._repr_html.estimator import estimator_html_repr +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils._tags import ( ClassifierTags, InputTags, @@ -53,6 +54,7 @@ "Tags", "TargetTags", "TransformerTags", + "_align_api_if_sparse", "_safe_indexing", "all_estimators", "as_float_array", diff --git a/sklearn/utils/_indexing.py b/sklearn/utils/_indexing.py index 6272ec02fc8eb..90b784025a0f7 100644 --- a/sklearn/utils/_indexing.py +++ b/sklearn/utils/_indexing.py @@ -13,7 +13,7 @@ from sklearn.utils._array_api import _is_numpy_namespace, get_namespace from sklearn.utils._param_validation import Interval, validate_params from sklearn.utils.extmath import _approximate_mode -from sklearn.utils.fixes import PYARROW_VERSION_BELOW_17 +from sklearn.utils.fixes import PYARROW_VERSION_BELOW_17, SCIPY_VERSION_BELOW_1_12 from sklearn.utils.validation import ( _check_sample_weight, _is_arraylike_not_scalar, @@ -32,8 +32,12 @@ def _array_indexing(array, key, key_dtype, axis): xp, is_array_api = get_namespace(array) if is_array_api: return xp.take(array, key, axis=axis) - if issparse(array) and key_dtype == "bool": - key = np.asarray(key) + if issparse(array): + if key_dtype == "bool": + key = np.asarray(key) + elif SCIPY_VERSION_BELOW_1_12: + if isinstance(key, numbers.Integral): + key = [key] if isinstance(key, tuple): key = list(key) return array[key, ...] 
if axis == 0 else array[:, key] @@ -569,8 +573,8 @@ def resample( >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) - >>> from scipy.sparse import coo_matrix - >>> X_sparse = coo_matrix(X) + >>> from scipy.sparse import coo_array + >>> X_sparse = coo_array(X) >>> from sklearn.utils import resample >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0) @@ -580,7 +584,7 @@ def resample( [1., 0.]]) >>> X_sparse - >>> X_sparse.toarray() @@ -727,8 +731,8 @@ def shuffle(*arrays, random_state=None, n_samples=None): >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) - >>> from scipy.sparse import coo_matrix - >>> X_sparse = coo_matrix(X) + >>> from scipy.sparse import coo_array + >>> X_sparse = coo_array(X) >>> from sklearn.utils import shuffle >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) @@ -738,7 +742,7 @@ def shuffle(*arrays, random_state=None, n_samples=None): [1., 0.]]) >>> X_sparse - >>> X_sparse.toarray() diff --git a/sklearn/utils/_mask.py b/sklearn/utils/_mask.py index 83361743ce3e7..d6b44cea76d97 100644 --- a/sklearn/utils/_mask.py +++ b/sklearn/utils/_mask.py @@ -8,6 +8,7 @@ from sklearn.utils._missing import is_scalar_nan from sklearn.utils._param_validation import validate_params +from sklearn.utils._sparse import _align_api_if_sparse from sklearn.utils.fixes import _object_dtype_isnan @@ -59,12 +60,12 @@ def _get_mask(X, value_to_mask): Xt = _get_dense_mask(X.data, value_to_mask) - sparse_constructor = sp.csr_matrix if X.format == "csr" else sp.csc_matrix + sparse_constructor = sp.csr_array if X.format == "csr" else sp.csc_array Xt_sparse = sparse_constructor( (Xt, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool ) - return Xt_sparse + return _align_api_if_sparse(Xt_sparse) @validate_params( @@ -93,8 +94,8 @@ def safe_mask(X, mask): Examples -------- >>> from sklearn.utils import safe_mask - >>> from scipy.sparse import csr_matrix - >>> data = csr_matrix([[1], [2], [3], [4], [5]]) + >>> from scipy.sparse import csr_array + >>> data = csr_array([[1], [2], [3], [4], [5]]) >>> condition = [False, True, True, False, True] >>> mask = safe_mask(data, condition) >>> data[mask].toarray() diff --git a/sklearn/utils/_param_validation.py b/sklearn/utils/_param_validation.py index 24b0846508381..5a8c8733d2c97 100644 --- a/sklearn/utils/_param_validation.py +++ b/sklearn/utils/_param_validation.py @@ -11,7 +11,7 @@ from numbers import Integral, Real import numpy as np -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import csr_array, issparse from sklearn._config import config_context, get_config from sklearn.utils.validation import _is_arraylike_not_scalar @@ -541,7 +541,7 @@ def is_satisfied_by(self, val): return issparse(val) def __str__(self): - return "a sparse matrix" + return "a sparse array or matrix" class _Callables(_Constraint): @@ -844,7 +844,7 @@ def generate_valid_param(constraint): return np.array([1, 2, 3]) if isinstance(constraint, _SparseMatrices): - return csr_matrix([[0, 1], [1, 0]]) + return csr_array([[0, 1], [1, 0]]) if isinstance(constraint, _RandomStates): return np.random.RandomState(42) diff --git a/sklearn/utils/_sparse.py b/sklearn/utils/_sparse.py new file mode 100644 index 0000000000000..3bb3e9fb0673e --- /dev/null +++ b/sklearn/utils/_sparse.py @@ -0,0 +1,173 @@ +"""Control sparse interface based on config""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import scipy as sp + +from sklearn._config 
import get_config +from sklearn.utils.fixes import SCIPY_VERSION_BELOW_1_12 + + +def _align_api_if_sparse(X): + """ + Convert to sparse interface as set in config. Input can be dense or sparse. + If sparse, convert to sparse_interface indicated by get_config. + Otherwise, return X unchanged. + + """ + if not sp.sparse.issparse(X): + return X + + config_sparse_interface = get_config()["sparse_interface"] + + # there are only two sparse interfaces: sparray and spmatrix + if config_sparse_interface == "sparray": + if sp.sparse.isspmatrix(X): + # Fundamental code to switch to sparray in any format + return getattr(sp.sparse, X.format + "_array")(X) + return X + else: # config is spmatrix + if sp.sparse.isspmatrix(X): + return X + # Fundamental code to switch to spmatrix in any format + return getattr(sp.sparse, X.format + "_matrix")(X) + + +########### fixes for transitioning function names + +# TODO: Replace when Scipy 1.12 is the minimum supported version +if not SCIPY_VERSION_BELOW_1_12: + _sparse_eye = sp.sparse.eye_array + _sparse_diags = sp.sparse.diags_array + + def _sparse_random( + shape, + *, + density=0.01, + format="coo", + dtype=None, + random_state=None, + rng=None, + data_sampler=None, + ): + X = sp.sparse.random_array( + shape, + density=density, + format=format, + dtype=dtype, + # avoid `rng or random_state`: a seed of 0 is falsy and would be dropped + random_state=random_state if rng is None else rng, + data_sampler=data_sampler, + ) + _ensure_sparse_index_int32(X) + return X + +else: + + def _sparse_eye(m, n=None, *, k=0, dtype=float, format=None): + return sp.sparse.eye(m, n, k=k, dtype=dtype, format=format) + + def _sparse_diags(diagonals, /, *, offsets=0, shape=None, format=None, dtype=None): + return sp.sparse.diags( + diagonals, offsets=offsets, shape=shape, format=format, dtype=dtype + ) + + def _sparse_random( + shape, + *, + density=0.01, + format="coo", + dtype=None, + random_state=None, + rng=None, + data_sampler=None, + ): + return sp.sparse.random( + *shape, + density=density, + format=format, + dtype=dtype, + # avoid `rng or random_state`: a seed of 0 is falsy and would be dropped + random_state=random_state if rng is None else rng, + data_rvs=data_sampler, + ) + + +########### fixes for casting index arrays + + +def _ensure_sparse_index_int32(A): + """Safely ensure that index arrays are int32.""" + if A.format in ("csc", "csr", "bsr"): + A.indices, A.indptr = safely_cast_index_arrays(A) + elif A.format == "coo": + if hasattr(A, "coords"): + A.coords = safely_cast_index_arrays(A) + elif hasattr(A, "indices"): + A.indices = safely_cast_index_arrays(A) + else: + A.row, A.col = safely_cast_index_arrays(A) + elif A.format == "dia": + A.offsets = safely_cast_index_arrays(A) + + +# TODO: remove when SciPy 1.15 is minimal supported version +# (based on scipy.sparse._sputils.py function with same name) +def safely_cast_index_arrays(A, idx_dtype=np.int32, msg=""): + """Safely cast sparse array indices to `idx_dtype`. + + Check the shape of `A` to determine if it is safe to cast its index + arrays to dtype `idx_dtype`. If any dimension in shape is larger than + fits in the dtype, casting is unsafe so raise ``ValueError``. + If safe, cast the index arrays to `idx_dtype` and return the result + without changing the input `A`. The caller can assign results to `A` + attributes if desired or use the recast index arrays directly. + + Unless downcasting is needed, the original index arrays are returned. + You can test e.g. ``A.indptr is new_indptr`` to see if downcasting occurred.
+
+
+# TODO: remove when SciPy 1.15 is the minimum supported version
+# (based on the scipy.sparse._sputils.py function with the same name)
+def safely_cast_index_arrays(A, idx_dtype=np.int32, msg=""):
+    """Safely cast sparse array indices to `idx_dtype`.
+
+    Check the shape of `A` to determine if it is safe to cast its index
+    arrays to dtype `idx_dtype`. If any dimension in shape is larger than
+    fits in the dtype, casting is unsafe so raise ``ValueError``.
+    If safe, cast the index arrays to `idx_dtype` and return the result
+    without changing the input `A`. The caller can assign results to `A`
+    attributes if desired or use the recast index arrays directly.
+
+    Unless downcasting is needed, the original index arrays are returned.
+    You can test e.g. ``A.indptr is new_indptr`` to see if downcasting occurred.
+
+    See ``scipy/sparse/_sputils.py`` for more details on
+    ``safely_cast_index_arrays``.
+    """
+    max_value = np.iinfo(idx_dtype).max
+
+    if A.format in ("csc", "csr"):
+        if A.indptr[-1] > max_value:
+            raise ValueError(f"indptr values too large for {msg}")
+        # check shape vs dtype
+        if max(*A.shape) > max_value:
+            if (A.indices > max_value).any():
+                raise ValueError(f"indices values too large for {msg}")
+
+        indices = A.indices.astype(idx_dtype, copy=False)
+        indptr = A.indptr.astype(idx_dtype, copy=False)
+        return indices, indptr
+
+    elif A.format == "coo":
+        coords = getattr(A, "coords", None)
+        if coords is None:
+            coords = getattr(A, "indices", None)
+        if coords is None:
+            coords = (A.row, A.col)
+        if max(*A.shape) > max_value:
+            if any((co > max_value).any() for co in coords):
+                raise ValueError(f"coords values too large for {msg}")
+        return tuple(co.astype(idx_dtype, copy=False) for co in coords)
+
+    elif A.format == "dia":
+        if max(*A.shape) > max_value:
+            if (A.offsets > max_value).any():
+                raise ValueError(f"offsets values too large for {msg}")
+        offsets = A.offsets.astype(idx_dtype, copy=False)
+        return offsets
+
+    elif A.format == "bsr":
+        R, C = A.blocksize
+        if A.indptr[-1] * R > max_value:
+            raise ValueError(f"indptr values too large for {msg}")
+        if max(*A.shape) > max_value:
+            if (A.indices * C > max_value).any():
+                raise ValueError(f"indices values too large for {msg}")
+        indices = A.indices.astype(idx_dtype, copy=False)
+        indptr = A.indptr.astype(idx_dtype, copy=False)
+        return indices, indptr
+    # DOK and LIL formats are not associated with index arrays.
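[Editor's note] A usage sketch for `safely_cast_index_arrays`, assuming this patch is applied (the helper lives in the new private module): the function returns recast index arrays without mutating its input, and callers such as `_ensure_sparse_index_int32` above opt in by assigning the results back.

```python
import numpy as np
import scipy.sparse as sp
from sklearn.utils._sparse import safely_cast_index_arrays

A = sp.csr_array(np.eye(3))
A.indices = A.indices.astype(np.int64)  # simulate 64-bit index arrays
A.indptr = A.indptr.astype(np.int64)

indices, indptr = safely_cast_index_arrays(A)
assert indices.dtype == np.int32 and A.indices.dtype == np.int64  # A unchanged
A.indices, A.indptr = indices, indptr  # caller opts in to the downcast
```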
diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
index 24b1f5710af9e..c7f2d5663bf0d 100644
--- a/sklearn/utils/_testing.py
+++ b/sklearn/utils/_testing.py
@@ -44,6 +44,7 @@
     Tags,
     TargetTags,
     TransformerTags,
+    _align_api_if_sparse,
 )
 from sklearn.utils._array_api import _check_array_api_dispatch
 from sklearn.utils.fixes import (
@@ -1060,11 +1061,11 @@ def _convert_container(
     if constructor_name in ("sparse", "sparse_csr"):
         # sparse and sparse_csr are equivalent for legacy reasons
-        return sp.sparse.csr_matrix(container, dtype=dtype)
+        return _align_api_if_sparse(sp.sparse.csr_array(container, dtype=dtype))
     elif constructor_name == "sparse_csr_array":
         return sp.sparse.csr_array(container, dtype=dtype)
     elif constructor_name == "sparse_csc":
-        return sp.sparse.csc_matrix(container, dtype=dtype)
+        return _align_api_if_sparse(sp.sparse.csc_array(container, dtype=dtype))
     elif constructor_name == "sparse_csc_array":
         return sp.sparse.csc_array(container, dtype=dtype)
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 97f891b61ccff..71d7bf1f9ab1f 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -145,7 +145,7 @@ def density(w):
     --------
     >>> from scipy import sparse
     >>> from sklearn.utils.extmath import density
-    >>> X = sparse.random(10, 10, density=0.25, random_state=0)
+    >>> X = sparse.random_array((10, 10), density=0.25, rng=0)
     >>> density(X)
     0.25
     """
@@ -174,9 +174,9 @@ def safe_sparse_dot(a, b, *, dense_output=False):
     Examples
     --------
-    >>> from scipy.sparse import csr_matrix
+    >>> from scipy.sparse import csr_array
     >>> from sklearn.utils.extmath import safe_sparse_dot
-    >>> X = csr_matrix([[1, 2], [3, 4], [5, 6]])
+    >>> X = csr_array([[1, 2], [3, 4], [5, 6]])
     >>> dot_product = safe_sparse_dot(X, X.T)
     >>> dot_product.toarray()
     array([[ 5, 11, 17],
@@ -552,7 +552,7 @@ def _randomized_svd(
     if sparse.issparse(M) and M.format in ("lil", "dok"):
         warnings.warn(
             "Calculating SVD of a {} is expensive. "
-            "csr_matrix is more efficient.".format(type(M).__name__),
+            "CSR format is more efficient.".format(type(M).__name__),
             sparse.SparseEfficiencyWarning,
         )
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 47702952e33f8..a9404df174b1f 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -68,6 +68,12 @@ def _mode(a, axis=0):
     return scipy.stats.mode(a, axis=axis)


+# TODO: Remove each flag when the corresponding SciPy release becomes the
+# minimum supported version. Use git grep to find where a flag is used and
+# update those call sites too.
+SCIPY_VERSION_BELOW_1_12 = sp_base_version < parse_version("1.12.0")
+SCIPY_VERSION_BELOW_1_15 = sp_base_version < parse_version("1.15.0")
+
+
 # TODO: Remove when Scipy 1.12 is the minimum supported version
 if sp_base_version >= parse_version("1.12.0"):
     _sparse_linalg_cg = scipy.sparse.linalg.cg
@@ -166,13 +172,13 @@ def _min_or_max_axis(X, axis, min_or_max):
         value = np.compress(mask, value)

     if axis == 0:
-        res = scipy.sparse.coo_matrix(
+        res = scipy.sparse.coo_array(
             (value, (np.zeros(len(value)), major_index)),
             dtype=X.dtype,
             shape=(1, M),
         )
     else:
-        res = scipy.sparse.coo_matrix(
+        res = scipy.sparse.coo_array(
             (value, (major_index, np.zeros(len(value)))),
             dtype=X.dtype,
             shape=(M, 1),
diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py
index b28c2883e9499..cd68b4a5ad6ba 100644
--- a/sklearn/utils/graph.py
+++ b/sklearn/utils/graph.py
@@ -59,7 +59,7 @@ def single_source_shortest_path_length(graph, source, *, cutoff=None):
     if sparse.issparse(graph):
         graph = graph.tolil()
     else:
-        graph = sparse.lil_matrix(graph)
+        graph = sparse.lil_array(graph)
     seen = {}  # level (number of hops) when seen in BFS
     level = 0  # the current level
     next_level = [source]  # dict of nodes to check at next level
diff --git a/sklearn/utils/random.py b/sklearn/utils/random.py
index 4da8f26894aa6..20ab006d702de 100644
--- a/sklearn/utils/random.py
+++ b/sklearn/utils/random.py
@@ -10,6 +10,7 @@
 from sklearn.utils import check_random_state
 from sklearn.utils._random import sample_without_replacement
+from sklearn.utils._sparse import _align_api_if_sparse

 __all__ = ["sample_without_replacement"]
@@ -98,4 +99,5 @@ def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
             data.extend(classes[j][classes_j_nonzero][classes_ind])
         indptr.append(len(indices))

-    return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)
+    csc = sp.csc_array((data, indices, indptr), (n_samples, len(classes)), dtype=int)
+    return _align_api_if_sparse(csc)
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
index f4e62ef8f3438..7e752f2146bb2 100644
--- a/sklearn/utils/sparsefuncs.py
+++ b/sklearn/utils/sparsefuncs.py
@@ -60,18 +60,18 @@ def inplace_csr_column_scale(X, scale):
     >>> indices = np.array([0, 1, 2, 2])
     >>> data = np.array([8, 1, 2, 5])
     >>> scale = np.array([2, 3, 2])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 1, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[8, 1, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
     >>> sparsefuncs.inplace_csr_column_scale(csr, scale)
     >>> csr.todense()
-    matrix([[16,  3,  4],
-            [ 0,  0, 10],
-            [ 0,  0,  0],
-            [ 0,  0,  0]])
+    array([[16,  3,  4],
+           [ 0,  0, 10],
+           [ 0,  0,  0],
+           [ 0,  0,  0]])
     """
     assert scale.shape[0] == X.shape[1]
     X.data *= scale.take(X.indices, mode="clip")
@@ -140,12 +140,12 @@ def mean_variance_axis(X, axis, weights=None, return_sum_weights=False):
     >>> indices = np.array([0, 1, 2, 2])
     >>> data = np.array([8, 1, 2, 5])
     >>> scale = np.array([2, 3, 2])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 1, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[8, 1, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
     >>> sparsefuncs.mean_variance_axis(csr, axis=0)
     (array([2.  , 0.25, 1.75]), array([12.    ,  0.1875,  4.1875]))
     """
@@ -242,12 +242,12 @@ def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):
     >>> indices = np.array([0, 1, 2, 2])
     >>> data = np.array([8, 1, 2, 5])
     >>> scale = np.array([2, 3, 2])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 1, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[8, 1, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
     >>> sparsefuncs.incr_mean_variance_axis(
     ...     csr, axis=0, last_mean=np.zeros(3), last_var=np.zeros(3), last_n=2
     ... )
@@ -312,18 +312,18 @@ def inplace_column_scale(X, scale):
     >>> indices = np.array([0, 1, 2, 2])
     >>> data = np.array([8, 1, 2, 5])
     >>> scale = np.array([2, 3, 2])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 1, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[8, 1, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
     >>> sparsefuncs.inplace_column_scale(csr, scale)
     >>> csr.todense()
-    matrix([[16,  3,  4],
-            [ 0,  0, 10],
-            [ 0,  0,  0],
-            [ 0,  0,  0]])
+    array([[16,  3,  4],
+           [ 0,  0, 10],
+           [ 0,  0,  0],
+           [ 0,  0,  0]])
     """
     if sp.issparse(X) and X.format == "csc":
         inplace_csr_row_scale(X.T, scale)
@@ -356,18 +356,18 @@ def inplace_row_scale(X, scale):
     >>> indices = np.array([0, 1, 2, 3, 3])
     >>> data = np.array([8, 1, 2, 5, 6])
     >>> scale = np.array([2, 3, 4, 5])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 1, 0, 0],
-            [0, 0, 2, 0],
-            [0, 0, 0, 5],
-            [0, 0, 0, 6]])
+    array([[8, 1, 0, 0],
+           [0, 0, 2, 0],
+           [0, 0, 0, 5],
+           [0, 0, 0, 6]])
     >>> sparsefuncs.inplace_row_scale(csr, scale)
     >>> csr.todense()
-    matrix([[16,  2,  0,  0],
-            [ 0,  0,  6,  0],
-            [ 0,  0,  0, 20],
-            [ 0,  0,  0, 30]])
+    array([[16,  2,  0,  0],
+           [ 0,  0,  6,  0],
+           [ 0,  0,  0, 20],
+           [ 0,  0,  0, 30]])
     """
     if sp.issparse(X) and X.format == "csc":
         inplace_csr_column_scale(X.T, scale)
@@ -493,18 +493,18 @@ def inplace_swap_row(X, m, n):
     >>> indptr = np.array([0, 2, 3, 3, 3])
     >>> indices = np.array([0, 2, 2])
     >>> data = np.array([8, 2, 5])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 0, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[8, 0, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
    >>> sparsefuncs.inplace_swap_row(csr, 0, 1)
     >>> csr.todense()
-    matrix([[0, 0, 5],
-            [8, 0, 2],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[0, 0, 5],
+           [8, 0, 2],
+           [0, 0, 0],
+           [0, 0, 0]])
     """
     if sp.issparse(X) and X.format == "csc":
         inplace_swap_row_csc(X, m, n)
@@ -538,18 +538,18 @@ def inplace_swap_column(X, m, n):
     >>> indptr = np.array([0, 2, 3, 3, 3])
     >>> indices = np.array([0, 2, 2])
     >>> data = np.array([8, 2, 5])
-    >>> csr = sparse.csr_matrix((data, indices, indptr))
+    >>> csr = sparse.csr_array((data, indices, indptr))
     >>> csr.todense()
-    matrix([[8, 0, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[8, 0, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
     >>> sparsefuncs.inplace_swap_column(csr, 0, 1)
     >>> csr.todense()
-    matrix([[0, 8, 2],
-            [0, 0, 5],
-            [0, 0, 0],
-            [0, 0, 0]])
+    array([[0, 8, 2],
+           [0, 0, 5],
+           [0, 0, 0],
+           [0, 0, 0]])
     """
     if m < 0:
         m += X.shape[1]
diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index 23261c59de320..1178bf9c96ad2 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -8,6 +8,7 @@
 from libc.stdint cimport intptr_t

 import numpy as np
 from cython cimport floating

+from ..utils._sparse import _ensure_sparse_index_int32
 from ..utils._typedefs cimport float64_t, int32_t, int64_t, intp_t, uint64_t
@@ -367,6 +368,7 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
     if last_n.dtype not in [np.float32, np.float64]:
         last_n = last_n.astype(np.float64, copy=False)

+    _ensure_sparse_index_int32(X)
     return _incr_mean_variance_axis0(X.data,
                                      np.sum(weights),
                                      X.shape[1],
@@ -487,13 +489,13 @@ def inplace_csr_row_normalize_l1(X):
     Examples
     --------
-    >>> from scipy.sparse import csr_matrix
+    >>> from scipy.sparse import csr_array
     >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
     >>> import numpy as np
     >>> indptr = np.array([0, 2, 3, 4])
     >>> indices = np.array([0, 1, 2, 3])
     >>> data = np.array([1.0, 2.0, 3.0, 4.0])
-    >>> X = csr_matrix((data, indices, indptr), shape=(3, 4))
+    >>> X = csr_array((data, indices, indptr), shape=(3, 4))
     >>> X.toarray()
     array([[1., 2., 0., 0.],
            [0., 0., 3., 0.],
@@ -549,13 +551,13 @@ def inplace_csr_row_normalize_l2(X):
     Examples
     --------
-    >>> from scipy.sparse import csr_matrix
+    >>> from scipy.sparse import csr_array
     >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
     >>> import numpy as np
     >>> indptr = np.array([0, 2, 3, 4])
     >>> indices = np.array([0, 1, 2, 3])
     >>> data = np.array([1.0, 2.0, 3.0, 4.0])
-    >>> X = csr_matrix((data, indices, indptr), shape=(3, 4))
+    >>> X = csr_array((data, indices, indptr), shape=(3, 4))
     >>> X.toarray()
     array([[1., 2., 0., 0.],
            [0., 0., 3., 0.],
@@ -611,7 +613,7 @@ def assign_rows_csr(
     Parameters
     ----------
-    X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
+    X : scipy.sparse.csr_array, shape=(n_samples, n_features)
     X_rows : array, dtype=np.intp, shape=n_rows
     out_rows : array, dtype=np.intp, shape=n_rows
     out : array, shape=(arbitrary, n_features)
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 907de11702af2..e203be48ffba2 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -19,6 +19,7 @@
     get_namespace,
     yield_namespace_device_dtype_combinations,
 )
+from sklearn.utils._sparse import _sparse_random
 from sklearn.utils._testing import (
     _array_api_for_tests,
     assert_allclose,
@@ -508,7 +509,7 @@ def test_randomized_svd_sparse_warnings(sparse_container):
     X = sparse_container(X)

     warn_msg = (
-        "Calculating SVD of a {} is expensive. csr_matrix is more efficient.".format(
+        "Calculating SVD of a {} is expensive. CSR format is more efficient.".format(
             sparse_container.__name__
         )
     )
@@ -1041,8 +1042,8 @@ def test_safe_sparse_dot_2d_1d(container):
 def test_safe_sparse_dot_dense_output(dense_output):
     rng = np.random.RandomState(0)

-    A = sparse.random(30, 10, density=0.1, random_state=rng)
-    B = sparse.random(10, 20, density=0.1, random_state=rng)
+    A = _sparse_random((30, 10), density=0.1, rng=rng)
+    B = _sparse_random((10, 20), density=0.1, rng=rng)

     expected = A.dot(B)
     actual = safe_sparse_dot(A, B, dense_output=dense_output)
diff --git a/sklearn/utils/tests/test_param_validation.py b/sklearn/utils/tests/test_param_validation.py
index a47eaace5b9a2..06bd866eac9fc 100644
--- a/sklearn/utils/tests/test_param_validation.py
+++ b/sklearn/utils/tests/test_param_validation.py
@@ -2,7 +2,7 @@
 import numpy as np
 import pytest
-from scipy.sparse import csr_matrix
+from scipy.sparse import csr_array, csr_matrix

 from sklearn._config import config_context, get_config
 from sklearn.base import BaseEstimator, _fit_context
@@ -406,6 +406,7 @@ def test_generate_valid_param(constraint):
     ("array-like", [[1, 2], [3, 4]]),
     ("array-like", np.array([[1, 2], [3, 4]])),
     ("sparse matrix", csr_matrix([[1, 2], [3, 4]])),
+    ("sparse matrix", csr_array([[1, 2], [3, 4]])),
     *[
         ("sparse matrix", container([[1, 2], [3, 4]]))
         for container in CSR_CONTAINERS
diff --git a/sklearn/utils/tests/test_sparse.py b/sklearn/utils/tests/test_sparse.py
new file mode 100644
index 0000000000000..b84950b0ac3cf
--- /dev/null
+++ b/sklearn/utils/tests/test_sparse.py
@@ -0,0 +1,116 @@
+import numpy as np
+import pytest
+import scipy as sp
+from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix
+
+import sklearn
+
+
+@pytest.mark.parametrize(
+    ["sparse_interface", "x", "result_type"],
+    [
+        ("sparray", csr_array([[1, 2, 3]]), csr_array),
+        ("sparray", csr_matrix([[1, 2, 3]]), csr_array),
+        ("sparray", csc_array([[1, 2, 3]]), csc_array),
+        ("sparray", csc_matrix([[1, 2, 3]]), csc_array),
+        ("spmatrix", csr_array([[1, 2, 3]]), csr_matrix),
+        ("spmatrix", csr_matrix([[1, 2, 3]]), csr_matrix),
+        ("spmatrix", csc_array([[1, 2, 3]]), csc_matrix),
+        ("spmatrix", csc_matrix([[1, 2, 3]]), csc_matrix),
+    ],
+)
+def test_align_api_if_sparse(sparse_interface, x, result_type):
+    with sklearn.config_context(sparse_interface=sparse_interface):
+        result = sklearn.utils._align_api_if_sparse(x)
+    assert isinstance(result, result_type)
+
+
+@pytest.mark.parametrize(
+    ["sparse_interface", "x", "result_type"],
+    [
+        ("sparray", np.array([[1, 2, 3]]), np.ndarray),
+        ("spmatrix", np.array([[1, 2, 3]]), np.ndarray),
+    ],
+)
+def test_ndarray_align_api_if_sparse(sparse_interface, x, result_type):
+    with sklearn.config_context(sparse_interface=sparse_interface):
+        result = sklearn.utils._align_api_if_sparse(x)
+    assert isinstance(result, result_type)
+
+
+@pytest.mark.parametrize(
+    ["sparse_interface", "result_type"],
+    [("sparray", csr_array), ("spmatrix", csr_matrix)],
+)
+def test_transform_returns_sparse(sparse_interface, result_type):
+    corpus = [
+        "This is the first document.",
+        "This document is the second document.",
+        "And this is the third one.",
+        "Is this the first document?",
+    ]
+    with sklearn.config_context(sparse_interface=sparse_interface):
+        vectorizer = sklearn.feature_extraction.text.CountVectorizer()
+        X = vectorizer.fit_transform(corpus)
+    assert isinstance(X, result_type)
+
+
+@pytest.mark.parametrize(
+    ["sparse_interface", "result_type"],
+    [("sparray", csr_array), ("spmatrix", csr_matrix)],
+)
+def test_function_returns_sparse(sparse_interface, result_type):
+    with sklearn.config_context(sparse_interface=sparse_interface):
+        X, y = sklearn.datasets.make_regression(n_features=2, random_state=0)
+        X = sklearn.manifold._locally_linear.barycenter_kneighbors_graph(X, 1)
+    assert isinstance(X, result_type)
+
+
+@pytest.mark.parametrize(
+    ["sparse_interface", "result_type"],
+    [("sparray", csr_array), ("spmatrix", csr_matrix)],
+)
+def test_estimator_property_sparse(sparse_interface, result_type):
+    with sklearn.config_context(sparse_interface=sparse_interface):
+        X, y = sklearn.datasets.make_regression(n_features=2, random_state=0)
+        regr = sklearn.linear_model.ElasticNet(random_state=0)
+        regr.fit(X, y)
+        # check the `sparse_coef_` property
+        assert isinstance(regr.sparse_coef_, result_type)
+
+
+INDEX_CONSTRUCTORS = [
+    sp.sparse.csc_array,
+    sp.sparse.csr_array,
+    sp.sparse.coo_array,
+    sp.sparse.csc_matrix,
+    sp.sparse.csr_matrix,
+    sp.sparse.coo_matrix,
+]
+NO_INDEX_TEST_CONSTRUCTORS = [
+    sp.sparse.bsr_array,
+    sp.sparse.bsr_matrix,
+    sp.sparse.dia_array,
+    sp.sparse.dok_array,
+    sp.sparse.lil_array,
+    sp.sparse.dia_matrix,
+    sp.sparse.dok_matrix,
+    sp.sparse.lil_matrix,
+]
+SPARSE_CONSTRUCTORS = INDEX_CONSTRUCTORS + NO_INDEX_TEST_CONSTRUCTORS
+
+
+@pytest.mark.parametrize("constructor", SPARSE_CONSTRUCTORS)
+def test_ensure_sparse_index_int32(constructor):
+    A = constructor(np.array([[1.0, 2.0, 3.0], [3.0, 2.0, 1.0]]))
+    sklearn.utils._sparse._ensure_sparse_index_int32(A)
+
+
+@pytest.mark.parametrize("constructor", INDEX_CONSTRUCTORS)
+def test_ensure_int32_raises(constructor):
+    with pytest.raises(ValueError, match="too large"):
+        rows, cols = [2, 0], [1, np.iinfo(np.int32).max + 1]
+        if "csc" in constructor.__name__:
+            rows, cols = cols, rows
+        A = sp.sparse.coo_array(([1.0, 2.0], (rows, cols)))
+        sklearn.utils._sparse._ensure_sparse_index_int32(constructor(A))
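[Editor's note] The test_sparsefuncs.py hunks below migrate `sp.random(m, n, ...)` to the `_sparse_random((m, n), ...)` wrapper. In plain SciPy (>= 1.12 for `random_array`), the two spellings correspond as follows; this snippet is an illustration, not part of the patch:

```python
import scipy.sparse as sp

# legacy spelling: dimensions as separate arguments, spmatrix result
X_old = sp.random(5, 2, density=0.8, format="csr", random_state=0)
# modern spelling: shape tuple, sparray result
X_new = sp.random_array((5, 2), density=0.8, format="csr", random_state=0)

assert X_old.shape == X_new.shape == (5, 2)
assert sp.isspmatrix(X_old) and not sp.isspmatrix(X_new)
```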
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index f80b75c02d515..ed07d72788f5b 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -6,6 +6,7 @@
 from scipy import linalg

 from sklearn.datasets import make_classification
+from sklearn.utils._sparse import _sparse_random
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
 from sklearn.utils.sparsefuncs import (
@@ -436,15 +437,15 @@ def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
     "X1, X2",
     [
         (
-            sp.random(5, 2, density=0.8, format="csr", random_state=0),
-            sp.random(13, 2, density=0.8, format="csr", random_state=0),
+            _sparse_random((5, 2), density=0.8, format="csr", rng=0),
+            _sparse_random((13, 2), density=0.8, format="csr", rng=0),
         ),
         (
-            sp.random(5, 2, density=0.8, format="csr", random_state=0),
+            _sparse_random((5, 2), density=0.8, format="csr", rng=0),
             sp.hstack(
                 [
                     np.full((13, 1), fill_value=np.nan),
-                    sp.random(13, 1, density=0.8, random_state=42),
+                    _sparse_random((13, 1), density=0.8, rng=42),
                 ],
                 format="csr",
             ),
@@ -477,8 +478,8 @@ def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2, csr_container):
 def test_incr_mean_variance_no_new_n():
     # check the behaviour when we update the variance with an empty matrix
     axis = 0
-    X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
-    X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
+    X1 = _sparse_random((5, 1), density=0.8, format="csr", rng=0)
+    X2 = _sparse_random((0, 1), density=0.8, format="csr", rng=0)
     last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
     last_n = np.zeros(X1.shape[1], dtype=np.int64)
     last_mean, last_var, last_n = incr_mean_variance_axis(
@@ -496,7 +497,7 @@ def test_incr_mean_variance_no_new_n():
 def test_incr_mean_variance_n_float():
     # check the behaviour when last_n is just a number
     axis = 0
-    X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
+    X = _sparse_random((5, 2), density=0.8, format="csr", rng=0)
     last_mean, last_var = np.zeros(X.shape[1]), np.zeros(X.shape[1])
     last_n = 0
     _, _, new_n = incr_mean_variance_axis(
@@ -604,7 +605,7 @@ def test_densify_rows(csr_container):
 def test_inplace_column_scale():
     rng = np.random.RandomState(0)
-    X = sp.random(100, 200, density=0.05)
+    X = _sparse_random((100, 200), density=0.05)
     Xr = X.tocsr()
     Xc = X.tocsc()
     XA = X.toarray()
@@ -636,7 +637,7 @@ def test_inplace_column_scale():
 def test_inplace_row_scale():
     rng = np.random.RandomState(0)
-    X = sp.random(100, 200, density=0.05)
+    X = _sparse_random((100, 200), density=0.05)
     Xr = X.tocsr()
     Xc = X.tocsc()
     XA = X.toarray()
@@ -937,7 +938,7 @@ def test_inplace_normalize(csr_container, inplace_csr_row_normalize):
 def test_csr_row_norms(dtype):
     # checks that csr_row_norms returns the same output as
     # scipy.sparse.linalg.norm, and that the dtype is the same as X.dtype.
-    X = sp.random(100, 10, format="csr", dtype=dtype, random_state=42)
+    X = _sparse_random((100, 10), format="csr", dtype=dtype, rng=42)

     scipy_norms = sp.linalg.norm(X, axis=1) ** 2
     norms = csr_row_norms(X)
@@ -952,10 +953,10 @@ def centered_matrices(request):
     """Returns equivalent tuple[sp.linalg.LinearOperator, np.ndarray]."""
     sparse_container = request.param

-    random_state = np.random.default_rng(42)
+    rng = np.random.default_rng(42)

     X_sparse = sparse_container(
-        sp.random(500, 100, density=0.1, format="csr", random_state=random_state)
+        _sparse_random((500, 100), density=0.1, format="csr", rng=rng)
     )
     X_dense = X_sparse.toarray()
     mu = np.asarray(X_sparse.mean(axis=0)).ravel()
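[Editor's note] The test_testing.py change below swaps `sparse.diags(...).tocsr()` for `_sparse_diags(..., format="csr")`. In plain SciPy (>= 1.12), the two underlying functions produce the same values, differing only in container type; this snippet is an illustration, not part of the patch:

```python
import numpy as np
import scipy.sparse as sp

D_old = sp.diags(np.ones(5), offsets=0, format="csr")        # spmatrix
D_new = sp.diags_array(np.ones(5), offsets=0, format="csr")  # sparray
assert np.array_equal(D_old.toarray(), D_new.toarray())
```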
""" - X_sparse = sparse.random(10, 10, density=0.1, format="csr") + X_sparse = _sparse_random((10, 10), density=0.1, format="csr") _convert_container(X_sparse, constructor_name) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index dbc9fec7b3ee3..49440a2f07695 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -44,6 +44,7 @@ MockDataFrame, _MockEstimatorOnOffPrediction, ) +from sklearn.utils._sparse import _sparse_random from sklearn.utils._testing import ( SkipTest, TempMemmap, @@ -147,6 +148,7 @@ def test_as_float_array(): # Test the copy parameter with some matrices matrices = [ sp.csc_matrix(np.arange(5)).toarray(), + sp.csc_array([np.arange(5)]).toarray(), _sparse_random_matrix(10, 10, density=0.10).toarray(), ] for M in matrices: @@ -156,7 +158,7 @@ def test_as_float_array(): @pytest.mark.parametrize( - "X", [np.random.random((10, 2)), sp.random(10, 2, format="csr")] + "X", [np.random.random((10, 2)), _sparse_random((10, 2), format="csr")] ) def test_as_float_array_nan(X): X[5, 0] = np.nan @@ -171,6 +173,7 @@ def test_np_matrix(): assert not isinstance(as_float_array(X), np.matrix) assert not isinstance(as_float_array(sp.csc_matrix(X)), np.matrix) + assert not isinstance(as_float_array(sp.csc_array(X)), np.matrix) def test_memmap(): @@ -203,7 +206,7 @@ def test_ordering(): if copy: assert A is not B - X = sp.csr_matrix(X) + X = sp.csr_array(X) X.data = X.data[::-1] assert not X.data.flags["C_CONTIGUOUS"] @@ -212,7 +215,7 @@ def test_ordering(): "value, ensure_all_finite", [(np.inf, False), (np.nan, "allow-nan"), (np.nan, False)], ) -@pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize("retype", [np.asarray, sp.csr_array, sp.csr_matrix]) def test_check_array_ensure_all_finite_valid(value, ensure_all_finite, retype): X = retype(np.arange(4).reshape(2, 2).astype(float)) X[0, 0] = value @@ -239,7 +242,7 @@ def test_check_array_ensure_all_finite_valid(value, ensure_all_finite, retype): (np.nan, "", 1, "Input contains NaN"), ], ) -@pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize("retype", [np.asarray, sp.csr_array, sp.csr_matrix]) def test_check_array_ensure_all_finite_invalid( value, input_name, ensure_all_finite, match_msg, retype ): @@ -255,7 +258,7 @@ def test_check_array_ensure_all_finite_invalid( @pytest.mark.parametrize("input_name", ["X", "y", "sample_weight"]) -@pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize("retype", [np.asarray, sp.csr_array, sp.csr_matrix]) def test_check_array_links_to_imputer_doc_only_for_X(input_name, retype): data = retype(np.arange(4).reshape(2, 2).astype(np.float64)) data[0, 0] = np.nan @@ -357,7 +360,7 @@ def test_check_array(): # accept_sparse == False # raise error on sparse inputs X = [[1, 2], [3, 4]] - X_csr = sp.csr_matrix(X) + X_csr = sp.csr_array(X) with pytest.raises(TypeError): check_array(X_csr) @@ -621,9 +624,9 @@ def test_check_array_dtype_warning(): X_int_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] X_float32 = np.asarray(X_int_list, dtype=np.float32) X_int64 = np.asarray(X_int_list, dtype=np.int64) - X_csr_float32 = sp.csr_matrix(X_float32) - X_csc_float32 = sp.csc_matrix(X_float32) - X_csc_int32 = sp.csc_matrix(X_int64, dtype=np.int32) + X_csr_float32 = sp.csr_array(X_float32) + X_csc_float32 = sp.csc_array(X_float32) + X_csc_int32 = sp.csc_array(X_int64, dtype=np.int32) integer_data = [X_int64, X_csc_int32] float32_data = 
     float32_data = [X_float32, X_csr_float32, X_csc_float32]

     with warnings.catch_warnings():
@@ -662,7 +665,7 @@ def test_check_array_dtype_warning():
 def test_check_array_accept_sparse_type_exception():
     X = [[1, 2], [3, 4]]
-    X_csr = sp.csr_matrix(X)
+    X_csr = sp.csr_array(X)
     invalid_type = SVR()

     msg = (
@@ -693,7 +696,7 @@ def test_check_array_accept_sparse_type_exception():
 def test_check_array_accept_sparse_no_exception():
     X = [[1, 2], [3, 4]]
-    X_csr = sp.csr_matrix(X)
+    X_csr = sp.csr_array(X)

     check_array(X_csr, accept_sparse=True)
     check_array(X_csr, accept_sparse="csr")
@@ -703,7 +706,7 @@
 @pytest.fixture(params=["csr", "csc", "coo", "bsr"])
 def X_64bit(request):
-    X = sp.random(20, 10, format=request.param)
+    X = _sparse_random((20, 10), format=request.param)

     if request.param == "coo":
         if hasattr(X, "coords"):
@@ -833,7 +836,7 @@ def test_check_array_complex_data_error():
         check_array(X)

     # sparse matrix
-    X = sp.coo_matrix([[0, 1 + 2j], [0, 0]])
+    X = sp.coo_array([[0, 1 + 2j], [0, 0]])
     with pytest.raises(ValueError, match="Complex data not supported"):
         check_array(X)
@@ -867,12 +870,12 @@ def test_check_symmetric():
     test_arrays = {
         "dense": arr_asym,
-        "dok": sp.dok_matrix(arr_asym),
-        "csr": sp.csr_matrix(arr_asym),
-        "csc": sp.csc_matrix(arr_asym),
-        "coo": sp.coo_matrix(arr_asym),
-        "lil": sp.lil_matrix(arr_asym),
-        "bsr": sp.bsr_matrix(arr_asym),
+        "dok": sp.dok_array(arr_asym),
+        "csr": sp.csr_array(arr_asym),
+        "csc": sp.csc_array(arr_asym),
+        "coo": sp.coo_array(arr_asym),
+        "lil": sp.lil_array(arr_asym),
+        "bsr": sp.bsr_array(arr_asym),
     }

     # check error for bad inputs
@@ -1020,7 +1023,7 @@ def test_check_consistent_length():
     input types trigger TypeErrors."""
     check_consistent_length([1], [2], [3], [4], [5])
     check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ["a", "b"])
-    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))
+    check_consistent_length([1], (2,), np.array([3]), sp.csr_array((1, 2)))

     with pytest.raises(ValueError, match="inconsistent numbers of samples"):
         check_consistent_length([1, 2], [1])
     with pytest.raises(TypeError, match=r"got <\w+ 'int'>"):
@@ -1298,6 +1301,13 @@ class MockEstimator:
         sp.bsr_matrix,
         sp.dok_matrix,
         sp.dia_matrix,
+        sp.csr_array,
+        sp.csc_array,
+        sp.coo_array,
+        sp.lil_array,
+        sp.bsr_array,
+        sp.dok_array,
+        sp.dia_array,
     ],
 )
 def test_check_non_negative(retype):
@@ -1707,21 +1717,24 @@ def test_check_pos_label_consistency_invalid_array_api(
     assert _check_pos_label_consistency("a", arr) == "a"


-@pytest.mark.parametrize("toarray", [np.array, sp.csr_matrix, sp.csc_matrix])
+CS_SPARSE = [sp.csr_array, sp.csr_matrix, sp.csc_array, sp.csc_matrix]
+
+
+@pytest.mark.parametrize("toarray", [np.array] + CS_SPARSE)
 def test_allclose_dense_sparse_equals(toarray):
     base = np.arange(9).reshape(3, 3)
     x, y = toarray(base), toarray(base)
     assert _allclose_dense_sparse(x, y)


-@pytest.mark.parametrize("toarray", [np.array, sp.csr_matrix, sp.csc_matrix])
+@pytest.mark.parametrize("toarray", [np.array] + CS_SPARSE)
 def test_allclose_dense_sparse_not_equals(toarray):
     base = np.arange(9).reshape(3, 3)
     x, y = toarray(base), toarray(base + 1)
     assert not _allclose_dense_sparse(x, y)


-@pytest.mark.parametrize("toarray", [sp.csr_matrix, sp.csc_matrix])
+@pytest.mark.parametrize("toarray", CS_SPARSE)
 def test_allclose_dense_sparse_raise(toarray):
     x = np.arange(9).reshape(3, 3)
     y = toarray(x + 1)
@@ -1799,8 +1812,10 @@ def test_check_method_params(indices):
     _params = {
         "list": [1, 2, 3, 4],
         "array": np.array([1, 2, 3, 4]),
-        "sparse-col": sp.csc_matrix([1, 2, 3, 4]).T,
-        "sparse-row": sp.csc_matrix([1, 2, 3, 4]),
+        "sparse-col2": sp.csc_matrix([[1, 2, 3, 4]]).T,
+        "sparse-row2": sp.csc_matrix([[1, 2, 3, 4]]),
+        "sparse-col": sp.csc_array([[1, 2, 3, 4]]).T,
+        "sparse-row": sp.csc_array([[1, 2, 3, 4]]),
         "scalar-int": 1,
         "scalar-str": "xxx",
         "None": None,
@@ -1808,7 +1823,7 @@
     result = _check_method_params(X, params=_params, indices=indices)
     indices_ = indices if indices is not None else list(range(X.shape[0]))

-    for key in ["sparse-row", "scalar-int", "scalar-str", "None"]:
+    for key in ["sparse-row", "sparse-row2", "scalar-int", "scalar-str", "None"]:
         assert result[key] is _params[key]

     assert result["list"] == _safe_indexing(_params["list"], indices_)
@@ -1816,6 +1831,9 @@
     assert_allclose_dense_sparse(
         result["sparse-col"], _safe_indexing(_params["sparse-col"], indices_)
     )
+    assert_allclose_dense_sparse(
+        result["sparse-col2"], _safe_indexing(_params["sparse-col2"], indices_)
+    )


 @pytest.mark.parametrize("sp_format", [True, "csr", "csc", "coo", "bsr"])
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index f1c3d11de13b2..99e50f4fcc358 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -532,10 +532,10 @@ def indexable(*iterables):
     Examples
     --------
     >>> from sklearn.utils import indexable
-    >>> from scipy.sparse import csr_matrix
+    >>> from scipy.sparse import csr_array
     >>> import numpy as np
     >>> iterables = [
-    ...     [1, 2, 3], np.array([2, 3, 4]), None, csr_matrix([[5], [6], [7]])
+    ...     [1, 2, 3], np.array([2, 3, 4]), None, csr_array([[5], [6], [7]])
     ... ]
     >>> indexable(*iterables)
     [[1, 2, 3], array([2, 3, 4]), None, <...Sparse...dtype 'int64'...shape (3, 1)>]
@@ -594,6 +594,10 @@ def _ensure_sparse_format(
         .. versionchanged:: 0.23
            Accepts `pd.NA` and converts it into `np.nan`
+    accept_large_sparse : bool
+        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
+        accept_sparse, accept_large_sparse=False will cause it to be accepted
+        only if its indices are stored with a 32-bit dtype.
     estimator_name : str, default=None
         The estimator name, used to construct the error message.
@@ -1604,10 +1608,10 @@ def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
     array([[0, 1, 2],
            [1, 0, 1],
            [2, 1, 0]])
-    >>> from scipy.sparse import csr_matrix
-    >>> sparse_symmetric_array = csr_matrix(symmetric_array)
+    >>> from scipy.sparse import csr_array
+    >>> sparse_symmetric_array = csr_array(symmetric_array)
     >>> check_symmetric(sparse_symmetric_array)
-    <Compressed Sparse Row sparse matrix of dtype 'int64'
+    <Compressed Sparse Row sparse array of dtype 'int64'
         with 6 stored elements and shape (3, 3)>
     """
     if (array.ndim != 2) or (array.shape[0] != array.shape[1]):
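[Editor's note] Taken together, these changes make the sparse container type returned by estimators configurable. A usage sketch mirroring `test_transform_returns_sparse` above, assuming the `sparse_interface` config option this patch introduces:

```python
import sklearn
from scipy.sparse import csr_array, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["This is the first document.", "Is this the first document?"]

with sklearn.config_context(sparse_interface="sparray"):
    X = CountVectorizer().fit_transform(corpus)
assert isinstance(X, csr_array)

with sklearn.config_context(sparse_interface="spmatrix"):
    X = CountVectorizer().fit_transform(corpus)
assert isinstance(X, csr_matrix)
```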