scikit-learn · adrinjalali · Feb 23, 2024 · Oct 12, 2023 · Oct 12, 2023 · Oct 12, 2023
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -118,6 +118,11 @@ Changelog
 :mod:`sklearn.linear_model`
 ...........................
 
+- |Fix| :class:`linear_model.ElasticNet`, :class:`linear_model.ElasticNetCV`,
+  :class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly reject
+  large sparse inputs, i.e. those requiring `int64` indices. :pr:`27576` by
+  :user:`Stefanie Senger <StefanieSenger>`.
+
 - |API| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`
   will now allow `alpha=0` when `cv != None`, which is consistent with
   :class:`linear_model.Ridge` and :class:`linear_model.RidgeClassifier`.

diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py
@@ -198,7 +198,8 @@ def _more_tags(self):
                 "check_estimators_dtypes": "raises nan error",
                 "check_fit2d_1sample": "_scale_normalize fails",
                 "check_fit2d_1feature": "raises apply_along_axis error",
-                "check_estimator_sparse_data": "does not fail gracefully",
+                "check_estimator_sparse_matrix": "does not fail gracefully",
+                "check_estimator_sparse_array": "does not fail gracefully",
                 "check_methods_subset_invariance": "empty array passed inside",
                 "check_dont_overwrite_parameters": "empty array passed inside",
                 "check_fit2d_predict1d": "empty array passed inside",

diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py
@@ -906,9 +906,12 @@ def fit(self, X, y, sample_weight=None, check_input=True):
 
         Parameters
         ----------
-        X : {ndarray, sparse matrix} of (n_samples, n_features)
+        X : {ndarray, sparse matrix, sparse array} of shape (n_samples, n_features)
             Data.
 
+            Note that large sparse matrices and arrays requiring `int64`
+            indices are not accepted.
+
         y : ndarray of shape (n_samples,) or (n_samples, n_targets)
             Target. Will be cast to X's dtype if necessary.
 
@@ -958,6 +961,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
                 accept_sparse="csc",
                 order="F",
                 dtype=[np.float64, np.float32],
+                accept_large_sparse=False,
                 copy=X_copied,
                 multi_output=True,
                 y_numeric=True,
@@ -1532,7 +1536,8 @@ def fit(self, X, y, sample_weight=None, **params):
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data. Pass directly as Fortran-contiguous data
             to avoid unnecessary memory duplication. If y is mono-output,
-            X can be sparse.
+            X can be sparse. Note that large sparse matrices and arrays
+            requiring `int64` indices are not accepted.
 
         y : array-like of shape (n_samples,) or (n_samples, n_targets)
             Target values.
@@ -1582,7 +1587,10 @@ def fit(self, X, y, sample_weight=None, **params):
             # csr. We also want to allow y to be 64 or 32 but check_X_y only
             # allows to convert for 64.
             check_X_params = dict(
-                accept_sparse="csc", dtype=[np.float64, np.float32], copy=False
+                accept_sparse="csc",
+                dtype=[np.float64, np.float32],
+                copy=False,
+                accept_large_sparse=False,
             )
             X, y = self._validate_data(
                 X, y, validate_separately=(check_X_params, check_y_params)

diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py
@@ -698,7 +698,19 @@ def fit(self, X, Y, **fit_params):
                 X_aug = np.hstack((X, Y_pred_chain))
 
         elif sp.issparse(X):
-            Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
+            # TODO: remove this condition check when the minimum supported scipy version
+            # doesn't support sparse matrices anymore
+            if not sp.isspmatrix(X):
+                # If `X` is a scipy sparse dok_array, convert it to a coo_array
+                # before hstacking because that is faster; see
+                # https://github.com/scipy/scipy/issues/20060#issuecomment-1937007039
+                if X.format == "dok":
+                    X = sp.coo_array(X)
+                # in case that `X` is a sparse array we create `Y_pred_chain` as a
+                # sparse array format:
+                Y_pred_chain = sp.coo_array((X.shape[0], Y.shape[1]))
+            else:
+                Y_pred_chain = sp.coo_matrix((X.shape[0], Y.shape[1]))
             X_aug = sp.hstack((X, Y_pred_chain), format="lil")
 
         else:

diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
@@ -923,7 +923,7 @@ def __exit__(self, exc_type, exc_value, _):
 
 
 class MinimalClassifier:
-    """Minimal classifier implementation with inheriting from BaseEstimator.
+    """Minimal classifier implementation without inheriting from BaseEstimator.
 
     This estimator should be tested with:
 
@@ -972,7 +972,7 @@ def score(self, X, y):
 
 
 class MinimalRegressor:
-    """Minimal regressor implementation with inheriting from BaseEstimator.
+    """Minimal regressor implementation without inheriting from BaseEstimator.
 
     This estimator should be tested with:
 
@@ -1012,7 +1012,7 @@ def score(self, X, y):
 
 
 class MinimalTransformer:
-    """Minimal transformer implementation with inheriting from
+    """Minimal transformer implementation without inheriting from
     BaseEstimator.
 
     This estimator should be tested with:

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -62,7 +62,7 @@
     generate_invalid_param_val,
     make_constraint,
 )
-from ..utils.fixes import parse_version, sp_version
+from ..utils.fixes import SPARSE_ARRAY_PRESENT, parse_version, sp_version
 from ..utils.validation import check_is_fitted
 from . import IS_PYPY, is_scalar_nan, shuffle
 from ._param_validation import Interval
@@ -134,7 +134,8 @@ def _yield_checks(estimator):
     if hasattr(estimator, "sparsify"):
         yield check_sparsify_coefficients
 
-    yield check_estimator_sparse_data
+    yield check_estimator_sparse_array
+    yield check_estimator_sparse_matrix
 
     # Test that estimators can be pickled, and once pickled
     # give the same answer as before.
@@ -836,17 +837,17 @@ def _is_pairwise_metric(estimator):
     return bool(metric == "precomputed")
 
 
-def _generate_sparse_matrix(X_csr):
-    """Generate sparse matrices with {32,64}bit indices of diverse format.
+def _generate_sparse_data(X_csr):
+    """Generate sparse matrices or arrays with {32,64}bit indices of diverse format.
 
     Parameters
     ----------
-    X_csr: CSR Matrix
-        Input matrix in CSR format.
+    X_csr: scipy.sparse.csr_matrix or scipy.sparse.csr_array
+        Input in CSR format.
 
     Returns
     -------
-    out: iter(Matrices)
+    out: iter(Matrices) or iter(Arrays)
         In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',
         'coo_64', 'csc_64', 'csr_64']
     """
@@ -1029,19 +1030,18 @@ def check_array_api_input_and_values(
     )
 
 
-def check_estimator_sparse_data(name, estimator_orig):
+def _check_estimator_sparse_container(name, estimator_orig, sparse_type):
     rng = np.random.RandomState(0)
     X = rng.uniform(size=(40, 3))
     X[X < 0.8] = 0
     X = _enforce_estimator_tags_X(estimator_orig, X)
-    X_csr = sparse.csr_matrix(X)
     y = (4 * rng.uniform(size=40)).astype(int)
     # catch deprecation warnings
     with ignore_warnings(category=FutureWarning):
         estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
     tags = _safe_tags(estimator_orig)
-    for matrix_format, X in _generate_sparse_matrix(X_csr):
+    for matrix_format, X in _generate_sparse_data(sparse_type(X)):
         # catch deprecation warnings
         with ignore_warnings(category=FutureWarning):
             estimator = clone(estimator_orig)
@@ -1052,13 +1052,14 @@ def check_estimator_sparse_data(name, estimator_orig):
             err_msg = (
                 f"Estimator {name} doesn't seem to support {matrix_format} "
                 "matrix, and is not failing gracefully, e.g. by using "
-                "check_array(X, accept_large_sparse=False)"
+                "check_array(X, accept_large_sparse=False)."
             )
         else:
             err_msg = (
                 f"Estimator {name} doesn't seem to fail gracefully on sparse "
                 "data: error message should state explicitly that sparse "
-                "input is not supported if this is not the case."
+                "input is not supported if this is not the case, e.g. by using "
+                "check_array(X, accept_sparse=False)."
             )
         with raises(
             (TypeError, ValueError),
@@ -1083,6 +1084,20 @@ def check_estimator_sparse_data(name, estimator_orig):
                 assert probs.shape == expected_probs_shape
 
 
+def check_estimator_sparse_matrix(name, estimator_orig):
+    """Run the sparse container check using `sparse.csr_matrix` as the sparse type."""
+    _check_estimator_sparse_container(name, estimator_orig, sparse.csr_matrix)
+
+
+def check_estimator_sparse_array(name, estimator_orig):
+    """Run the sparse container check using `sparse.csr_array` as the sparse type.
+
+    Nothing is checked when `SPARSE_ARRAY_PRESENT` is false.
+    """
+    if SPARSE_ARRAY_PRESENT:
+        _check_estimator_sparse_container(name, estimator_orig, sparse.csr_array)
+
+
 @ignore_warnings(category=FutureWarning)
 def check_sample_weights_pandas_series(name, estimator_orig):
     # check that estimators will accept a 'sample_weight' parameter of

diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -50,6 +50,25 @@
     BSR_CONTAINERS.append(scipy.sparse.bsr_array)
     DIA_CONTAINERS.append(scipy.sparse.dia_array)
 
+
+# Whether `scipy.sparse.sparray` is importable. Remove when minimum scipy is 1.11.0
+try:
+    from scipy.sparse import sparray  # noqa
+
+    SPARRAY_PRESENT = True
+except ImportError:
+    SPARRAY_PRESENT = False
+
+
+# Whether `scipy.sparse.csr_array` is importable. Remove when minimum scipy is 1.8
+try:
+    from scipy.sparse import csr_array  # noqa
+
+    SPARSE_ARRAY_PRESENT = True
+except ImportError:
+    SPARSE_ARRAY_PRESENT = False
+
+
 try:
     from scipy.optimize._linesearch import line_search_wolfe1, line_search_wolfe2
 except ImportError:  # SciPy < 1.8

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
@@ -64,7 +64,7 @@
     check_requires_y_none,
     set_random_state,
 )
-from sklearn.utils.fixes import CSR_CONTAINERS
+from sklearn.utils.fixes import CSR_CONTAINERS, SPARRAY_PRESENT
 from sklearn.utils.metaestimators import available_if
 from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
 
@@ -207,9 +207,29 @@ def fit(self, X, y):
 
 
 class NoSparseClassifier(BaseBadClassifier):
+    """Bad classifier raising an uninformative error on sparse input.
+
+    Parameters
+    ----------
+    raise_for_type : {"sparse_array", "sparse_matrix"}, default=None
+        Sparse container type for which `fit` raises. With the default
+        `None`, `fit` raises for any sparse input.
+    """
+
+    def __init__(self, raise_for_type=None):
+        self.raise_for_type = raise_for_type
+
     def fit(self, X, y):
         X, y = self._validate_data(X, y, accept_sparse=["csr", "csc"])
-        if sp.issparse(X):
+        if self.raise_for_type == "sparse_array":
+            correct_type = isinstance(X, sp.sparray)
+        elif self.raise_for_type == "sparse_matrix":
+            correct_type = isinstance(X, sp.spmatrix)
+        else:
+            # No container type requested: raise for any sparse input instead
+            # of hitting an UnboundLocalError on `correct_type` below.
+            correct_type = sp.issparse(X)
+        if correct_type:
             raise ValueError("Nonsensical Error")
         return self
 
@@ -357,6 +365,13 @@ def predict(self, X):
 
 
 class LargeSparseNotSupportedClassifier(BaseEstimator):
+    """Estimator that claims to support large sparse data
+    (accept_large_sparse=True), but doesn't"""
+
+    def __init__(self, raise_for_type=None):
+        # raise_for_type : str, expects "sparse_array" or "sparse_matrix"
+        self.raise_for_type = raise_for_type
+
     def fit(self, X, y):
         X, y = self._validate_data(
             X,
@@ -366,11 +381,15 @@ def fit(self, X, y):
             multi_output=True,
             y_numeric=True,
         )
-        if sp.issparse(X):
-            if X.getformat() == "coo":
+        if self.raise_for_type == "sparse_array":
+            correct_type = isinstance(X, sp.sparray)
+        elif self.raise_for_type == "sparse_matrix":
+            correct_type = isinstance(X, sp.spmatrix)
+        if correct_type:
+            if X.format == "coo":
                 if X.row.dtype == "int64" or X.col.dtype == "int64":
                     raise ValueError("Estimator doesn't support 64-bit indices")
-            elif X.getformat() in ["csc", "csr"]:
+            elif X.format in ["csc", "csr"]:
                 assert "int64" not in (
                     X.indices.dtype,
                     X.indptr.dtype,
@@ -634,11 +653,15 @@ def test_check_estimator():
     )
     with raises(AssertionError, match=msg):
         check_estimator(NotInvariantPredict())
-    # check for sparse matrix input handling
+    # check for sparse data input handling
     name = NoSparseClassifier.__name__
     msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
     with raises(AssertionError, match=msg):
-        check_estimator(NoSparseClassifier())
+        check_estimator(NoSparseClassifier("sparse_matrix"))
+
+    if SPARRAY_PRESENT:
+        with raises(AssertionError, match=msg):
+            check_estimator(NoSparseClassifier("sparse_array"))
 
     # check for classifiers reducing to less than two classes via sample weights
     name = OneClassSampleErrorClassifier.__name__
@@ -656,7 +679,11 @@ def test_check_estimator():
         r"support \S{3}_64 matrix, and is not failing gracefully.*"
     )
     with raises(AssertionError, match=msg):
-        check_estimator(LargeSparseNotSupportedClassifier())
+        check_estimator(LargeSparseNotSupportedClassifier("sparse_matrix"))
+
+    if SPARRAY_PRESENT:
+        with raises(AssertionError, match=msg):
+            check_estimator(LargeSparseNotSupportedClassifier("sparse_array"))
 
     # does error on binary_only untagged estimator
     msg = "Only 2 classes are supported"