diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index ba50134f744bb..715ab12d6cbc6 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -118,6 +118,11 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Fix| :class:`linear_model.ElasticNet`, :class:`linear_model.ElasticNetCV`, + :class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly don't + accept large sparse data formats. :pr:`27576` by :user:`Stefanie Senger + <StefanieSenger>`. + - |API| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` will now allow `alpha=0` when `cv != None`, which is consistent with :class:`linear_model.Ridge` and :class:`linear_model.RidgeClassifier`. diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 65280c06319d9..18c98ad5348b5 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -198,7 +198,8 @@ def _more_tags(self): "check_estimators_dtypes": "raises nan error", "check_fit2d_1sample": "_scale_normalize fails", "check_fit2d_1feature": "raises apply_along_axis error", - "check_estimator_sparse_data": "does not fail gracefully", + "check_estimator_sparse_matrix": "does not fail gracefully", + "check_estimator_sparse_array": "does not fail gracefully", "check_methods_subset_invariance": "empty array passed inside", "check_dont_overwrite_parameters": "empty array passed inside", "check_fit2d_predict1d": "empty array passed inside", diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index d1293bb62f262..e9602bd5f0a6c 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -906,9 +906,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): Parameters ---------- - X : {ndarray, sparse matrix} of (n_samples, n_features) + X : {ndarray, sparse matrix, sparse array} of (n_samples, n_features) Data. 
+ Note that large sparse matrices and arrays requiring `int64` + indices are not accepted. + y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target. Will be cast to X's dtype if necessary. @@ -958,6 +961,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): accept_sparse="csc", order="F", dtype=[np.float64, np.float32], + accept_large_sparse=False, copy=X_copied, multi_output=True, y_numeric=True, @@ -1532,7 +1536,8 @@ def fit(self, X, y, sample_weight=None, **params): X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Pass directly as Fortran-contiguous data to avoid unnecessary memory duplication. If y is mono-output, - X can be sparse. + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. @@ -1582,7 +1587,10 @@ def fit(self, X, y, sample_weight=None, **params): # csr. We also want to allow y to be 64 or 32 but check_X_y only # allows to convert for 64. 
check_X_params = dict( - accept_sparse="csc", dtype=[np.float64, np.float32], copy=False + accept_sparse="csc", + dtype=[np.float64, np.float32], + copy=False, + accept_large_sparse=False, ) X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index bfb83884399ef..c4b4f2bd3dd27 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -698,7 +698,19 @@ def fit(self, X, Y, **fit_params): X_aug = np.hstack((X, Y_pred_chain)) elif sp.issparse(X): - Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1])) + # TODO: remove this condition check when the minimum supported scipy version + # doesn't support sparse matrices anymore + if not sp.isspmatrix(X): + # if `X` is a scipy sparse dok_array, we convert it to a sparse + # coo_array format before hstacking, it's faster; see + # https://github.com/scipy/scipy/issues/20060#issuecomment-1937007039: + if X.format == "dok": + X = sp.coo_array(X) + # in case that `X` is a sparse array we create `Y_pred_chain` as a + # sparse array format: + Y_pred_chain = sp.coo_array((X.shape[0], Y.shape[1])) + else: + Y_pred_chain = sp.coo_matrix((X.shape[0], Y.shape[1])) X_aug = sp.hstack((X, Y_pred_chain), format="lil") else: diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 20edacaba5a34..c78765cb1ba5e 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -923,7 +923,7 @@ def __exit__(self, exc_type, exc_value, _): class MinimalClassifier: - """Minimal classifier implementation with inheriting from BaseEstimator. + """Minimal classifier implementation without inheriting from BaseEstimator. This estimator should be tested with: @@ -972,7 +972,7 @@ def score(self, X, y): class MinimalRegressor: - """Minimal regressor implementation with inheriting from BaseEstimator. + """Minimal regressor implementation without inheriting from BaseEstimator. 
This estimator should be tested with: @@ -1012,7 +1012,7 @@ def score(self, X, y): class MinimalTransformer: - """Minimal transformer implementation with inheriting from + """Minimal transformer implementation without inheriting from BaseEstimator. This estimator should be tested with: diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 564bd6928a5a6..ea220fa639f8a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -62,7 +62,7 @@ generate_invalid_param_val, make_constraint, ) -from ..utils.fixes import parse_version, sp_version +from ..utils.fixes import SPARSE_ARRAY_PRESENT, parse_version, sp_version from ..utils.validation import check_is_fitted from . import IS_PYPY, is_scalar_nan, shuffle from ._param_validation import Interval @@ -134,7 +134,8 @@ def _yield_checks(estimator): if hasattr(estimator, "sparsify"): yield check_sparsify_coefficients - yield check_estimator_sparse_data + yield check_estimator_sparse_array + yield check_estimator_sparse_matrix # Test that estimators can be pickled, and once pickled # give the same answer as before. @@ -836,17 +837,17 @@ def _is_pairwise_metric(estimator): return bool(metric == "precomputed") -def _generate_sparse_matrix(X_csr): - """Generate sparse matrices with {32,64}bit indices of diverse format. +def _generate_sparse_data(X_csr): + """Generate sparse matrices or arrays with {32,64}bit indices of diverse format. Parameters ---------- - X_csr: CSR Matrix - Input matrix in CSR format. + X_csr: scipy.sparse.csr_matrix or scipy.sparse.csr_array + Input in CSR format. 
Returns ------- - out: iter(Matrices) + out: iter(Matrices) or iter(Arrays) In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo', 'coo_64', 'csc_64', 'csr_64'] """ @@ -1029,19 +1030,18 @@ def check_array_api_input_and_values( ) -def check_estimator_sparse_data(name, estimator_orig): +def _check_estimator_sparse_container(name, estimator_orig, sparse_type): rng = np.random.RandomState(0) X = rng.uniform(size=(40, 3)) X[X < 0.8] = 0 X = _enforce_estimator_tags_X(estimator_orig, X) - X_csr = sparse.csr_matrix(X) y = (4 * rng.uniform(size=40)).astype(int) # catch deprecation warnings with ignore_warnings(category=FutureWarning): estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) tags = _safe_tags(estimator_orig) - for matrix_format, X in _generate_sparse_matrix(X_csr): + for matrix_format, X in _generate_sparse_data(sparse_type(X)): # catch deprecation warnings with ignore_warnings(category=FutureWarning): estimator = clone(estimator_orig) @@ -1052,13 +1052,14 @@ def check_estimator_sparse_data(name, estimator_orig): err_msg = ( f"Estimator {name} doesn't seem to support {matrix_format} " "matrix, and is not failing gracefully, e.g. by using " - "check_array(X, accept_large_sparse=False)" + "check_array(X, accept_large_sparse=False)." ) else: err_msg = ( f"Estimator {name} doesn't seem to fail gracefully on sparse " "data: error message should state explicitly that sparse " - "input is not supported if this is not the case." + "input is not supported if this is not the case, e.g. by using " + "check_array(X, accept_sparse=False)." 
) with raises( (TypeError, ValueError), @@ -1083,6 +1084,15 @@ def check_estimator_sparse_data(name, estimator_orig): assert probs.shape == expected_probs_shape +def check_estimator_sparse_matrix(name, estimator_orig): + _check_estimator_sparse_container(name, estimator_orig, sparse.csr_matrix) + + +def check_estimator_sparse_array(name, estimator_orig): + if SPARSE_ARRAY_PRESENT: + _check_estimator_sparse_container(name, estimator_orig, sparse.csr_array) + + @ignore_warnings(category=FutureWarning) def check_sample_weights_pandas_series(name, estimator_orig): # check that estimators will accept a 'sample_weight' parameter of diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 41c7526c08b17..acbcc7793d752 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -50,6 +50,25 @@ BSR_CONTAINERS.append(scipy.sparse.bsr_array) DIA_CONTAINERS.append(scipy.sparse.dia_array) + +# Remove when minimum scipy version is 1.11.0 +try: + from scipy.sparse import sparray # noqa + + SPARRAY_PRESENT = True +except ImportError: + SPARRAY_PRESENT = False + + +# Remove when minimum scipy version is 1.8 +try: + from scipy.sparse import csr_array # noqa + + SPARSE_ARRAY_PRESENT = True +except ImportError: + SPARSE_ARRAY_PRESENT = False + + try: from scipy.optimize._linesearch import line_search_wolfe1, line_search_wolfe2 except ImportError: # SciPy < 1.8 diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index f52ef27df4015..1e0a083a9c989 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -64,7 +64,7 @@ check_requires_y_none, set_random_state, ) -from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.fixes import CSR_CONTAINERS, SPARRAY_PRESENT from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import check_array, check_is_fitted, check_X_y @@ -207,9 +207,17 @@ def fit(self, X, y): class 
NoSparseClassifier(BaseBadClassifier): + def __init__(self, raise_for_type=None): + # raise_for_type : str, expects "sparse_array" or "sparse_matrix" + self.raise_for_type = raise_for_type + def fit(self, X, y): X, y = self._validate_data(X, y, accept_sparse=["csr", "csc"]) - if sp.issparse(X): + if self.raise_for_type == "sparse_array": + correct_type = isinstance(X, sp.sparray) + else: # "sparse_matrix"; also the fallback when raise_for_type is left unset + correct_type = isinstance(X, sp.spmatrix) + if correct_type: raise ValueError("Nonsensical Error") return self @@ -357,6 +365,13 @@ def predict(self, X): class LargeSparseNotSupportedClassifier(BaseEstimator): + """Estimator that claims to support large sparse data + (accept_large_sparse=True), but doesn't""" + + def __init__(self, raise_for_type=None): + # raise_for_type : str, expects "sparse_array" or "sparse_matrix" + self.raise_for_type = raise_for_type + def fit(self, X, y): X, y = self._validate_data( X, @@ -366,11 +381,15 @@ def fit(self, X, y): multi_output=True, y_numeric=True, ) - if sp.issparse(X): - if X.getformat() == "coo": + if self.raise_for_type == "sparse_array": + correct_type = isinstance(X, sp.sparray) + else: # "sparse_matrix"; also the fallback when raise_for_type is left unset + correct_type = isinstance(X, sp.spmatrix) + if correct_type: + if X.format == "coo": if X.row.dtype == "int64" or X.col.dtype == "int64": raise ValueError("Estimator doesn't support 64-bit indices") - elif X.getformat() in ["csc", "csr"]: + elif X.format in ["csc", "csr"]: assert "int64" not in ( X.indices.dtype, X.indptr.dtype, @@ -634,11 +653,15 @@ def test_check_estimator(): ) with raises(AssertionError, match=msg): check_estimator(NotInvariantPredict()) - # check for sparse matrix input handling + # check for sparse data input handling name = NoSparseClassifier.__name__ msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name with raises(AssertionError, match=msg): - check_estimator(NoSparseClassifier()) + 
check_estimator(NoSparseClassifier("sparse_matrix")) + + if SPARRAY_PRESENT: + with raises(AssertionError, match=msg): + check_estimator(NoSparseClassifier("sparse_array")) # check for classifiers reducing to less than two classes via sample weights name = OneClassSampleErrorClassifier.__name__ @@ -656,7 +679,11 @@ def test_check_estimator(): r"support \S{3}_64 matrix, and is not failing gracefully.*" ) with raises(AssertionError, match=msg): - check_estimator(LargeSparseNotSupportedClassifier()) + check_estimator(LargeSparseNotSupportedClassifier("sparse_matrix")) + + if SPARRAY_PRESENT: + with raises(AssertionError, match=msg): + check_estimator(LargeSparseNotSupportedClassifier("sparse_array")) # does error on binary_only untagged estimator msg = "Only 2 classes are supported"