Skip to content

TST separate checks for sparse array and sparse matrix input in estimator_checks #27576

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a6c488d
separate checks for sparse array and sparse matrix input
StefanieSenger Oct 12, 2023
c60e232
fix input for raise_for_type
StefanieSenger Oct 12, 2023
0071c10
correcter error message
StefanieSenger Oct 12, 2023
8b1eb77
repaired datatype selection and CI failures
StefanieSenger Oct 12, 2023
028d580
Lasso and ElasticNet don't support large sparse data
StefanieSenger Oct 18, 2023
5acf09a
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Oct 18, 2023
8e9feaa
refactored code of almost identical checks
StefanieSenger Oct 18, 2023
22b3998
changelog
StefanieSenger Oct 18, 2023
b156ef6
test should now run with lower scipy version
StefanieSenger Oct 19, 2023
bba2616
fix
StefanieSenger Oct 19, 2023
2c7b59a
Update sklearn/linear_model/_coordinate_descent.py
StefanieSenger Oct 19, 2023
3314d79
doc for LinearModelCV
StefanieSenger Oct 19, 2023
cb0f629
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Nov 11, 2023
22a0f1d
'dia' part of data generation and handled DeprecationWarning
StefanieSenger Nov 11, 2023
8792ce8
ignore_warnings instead of filterwarnings
StefanieSenger Nov 11, 2023
7ee5d0b
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Dec 7, 2023
28a1417
use warnings.filterwarnings
StefanieSenger Dec 7, 2023
2c6e49a
match warning message from start and collect it
StefanieSenger Dec 7, 2023
cfe1a38
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Jan 3, 2024
be3ea35
moved change log entry
StefanieSenger Jan 5, 2024
7cc9a53
Merge branch 'main' into sparse_estimator_checks
adrinjalali Jan 22, 2024
c00f38d
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Feb 6, 2024
4c00851
swapped deprecated X.getformat() for the newer X.format
StefanieSenger Feb 6, 2024
a87292e
Merge branch 'sparse_estimator_checks' of github.com:StefanieSenger/s…
StefanieSenger Feb 6, 2024
1f2527c
Merge branch 'main' into sparse_estimator_checks
adrinjalali Feb 13, 2024
7a2fc50
convert sparse dok_array into sparse coo_array before hstack
StefanieSenger Feb 16, 2024
a0bd979
compatibility with scipy versions < 1.11
StefanieSenger Feb 16, 2024
43b482d
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Feb 16, 2024
83439c3
public path for isinstance check
StefanieSenger Feb 20, 2024
3cfc057
Apply suggestions from code review
StefanieSenger Feb 20, 2024
2bf114a
added conversion into sparse array
StefanieSenger Feb 22, 2024
7b0df06
Merge branch 'main' into sparse_estimator_checks
StefanieSenger Feb 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ Changelog
:mod:`sklearn.linear_model`
...........................

- |Fix| :class:`linear_model.ElasticNet`, :class:`linear_model.ElasticNetCV`,
:class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly reject
large sparse input, i.e. sparse matrices and arrays requiring `int64` indices.
:pr:`27576` by :user:`Stefanie Senger <StefanieSenger>`.

- |API| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`
will now allow `alpha=0` when `cv != None`, which is consistent with
:class:`linear_model.Ridge` and :class:`linear_model.RidgeClassifier`.
Expand Down
3 changes: 2 additions & 1 deletion sklearn/cluster/_bicluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,8 @@ def _more_tags(self):
"check_estimators_dtypes": "raises nan error",
"check_fit2d_1sample": "_scale_normalize fails",
"check_fit2d_1feature": "raises apply_along_axis error",
"check_estimator_sparse_data": "does not fail gracefully",
"check_estimator_sparse_matrix": "does not fail gracefully",
"check_estimator_sparse_array": "does not fail gracefully",
"check_methods_subset_invariance": "empty array passed inside",
"check_dont_overwrite_parameters": "empty array passed inside",
"check_fit2d_predict1d": "empty array passed inside",
Expand Down
14 changes: 11 additions & 3 deletions sklearn/linear_model/_coordinate_descent.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,9 +906,12 @@ def fit(self, X, y, sample_weight=None, check_input=True):

Parameters
----------
X : {ndarray, sparse matrix} of (n_samples, n_features)
X : {ndarray, sparse matrix, sparse array} of (n_samples, n_features)
Data.

Note that large sparse matrices and arrays requiring `int64`
indices are not accepted.

y : ndarray of shape (n_samples,) or (n_samples, n_targets)
Target. Will be cast to X's dtype if necessary.

Expand Down Expand Up @@ -958,6 +961,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
accept_sparse="csc",
order="F",
dtype=[np.float64, np.float32],
accept_large_sparse=False,
copy=X_copied,
multi_output=True,
y_numeric=True,
Expand Down Expand Up @@ -1532,7 +1536,8 @@ def fit(self, X, y, sample_weight=None, **params):
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training data. Pass directly as Fortran-contiguous data
to avoid unnecessary memory duplication. If y is mono-output,
X can be sparse.
X can be sparse. Note that large sparse matrices and arrays
requiring `int64` indices are not accepted.

y : array-like of shape (n_samples,) or (n_samples, n_targets)
Target values.
Expand Down Expand Up @@ -1582,7 +1587,10 @@ def fit(self, X, y, sample_weight=None, **params):
# csr. We also want to allow y to be 64 or 32 but check_X_y only
# allows to convert for 64.
check_X_params = dict(
accept_sparse="csc", dtype=[np.float64, np.float32], copy=False
accept_sparse="csc",
dtype=[np.float64, np.float32],
copy=False,
accept_large_sparse=False,
)
X, y = self._validate_data(
X, y, validate_separately=(check_X_params, check_y_params)
Expand Down
14 changes: 13 additions & 1 deletion sklearn/multioutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,19 @@ def fit(self, X, Y, **fit_params):
X_aug = np.hstack((X, Y_pred_chain))

elif sp.issparse(X):
Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
# TODO: remove this condition check when the minimum supported scipy version
# doesn't support sparse matrices anymore
if not sp.isspmatrix(X):
# if `X` is a scipy sparse dok_array, we convert it to the sparse
# coo_array format before hstacking because that is faster; see
# https://github.com/scipy/scipy/issues/20060#issuecomment-1937007039
if X.format == "dok":
X = sp.coo_array(X)
# in case that `X` is a sparse array we create `Y_pred_chain` as a
# sparse array format:
Y_pred_chain = sp.coo_array((X.shape[0], Y.shape[1]))
else:
Y_pred_chain = sp.coo_matrix((X.shape[0], Y.shape[1]))
X_aug = sp.hstack((X, Y_pred_chain), format="lil")

else:
Expand Down
6 changes: 3 additions & 3 deletions sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,7 +923,7 @@ def __exit__(self, exc_type, exc_value, _):


class MinimalClassifier:
"""Minimal classifier implementation with inheriting from BaseEstimator.
"""Minimal classifier implementation without inheriting from BaseEstimator.

This estimator should be tested with:

Expand Down Expand Up @@ -972,7 +972,7 @@ def score(self, X, y):


class MinimalRegressor:
"""Minimal regressor implementation with inheriting from BaseEstimator.
"""Minimal regressor implementation without inheriting from BaseEstimator.

This estimator should be tested with:

Expand Down Expand Up @@ -1012,7 +1012,7 @@ def score(self, X, y):


class MinimalTransformer:
"""Minimal transformer implementation with inheriting from
"""Minimal transformer implementation without inheriting from
BaseEstimator.

This estimator should be tested with:
Expand Down
34 changes: 22 additions & 12 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
generate_invalid_param_val,
make_constraint,
)
from ..utils.fixes import parse_version, sp_version
from ..utils.fixes import SPARSE_ARRAY_PRESENT, parse_version, sp_version
from ..utils.validation import check_is_fitted
from . import IS_PYPY, is_scalar_nan, shuffle
from ._param_validation import Interval
Expand Down Expand Up @@ -134,7 +134,8 @@ def _yield_checks(estimator):
if hasattr(estimator, "sparsify"):
yield check_sparsify_coefficients

yield check_estimator_sparse_data
yield check_estimator_sparse_array
yield check_estimator_sparse_matrix

# Test that estimators can be pickled, and once pickled
# give the same answer as before.
Expand Down Expand Up @@ -836,17 +837,17 @@ def _is_pairwise_metric(estimator):
return bool(metric == "precomputed")


def _generate_sparse_matrix(X_csr):
"""Generate sparse matrices with {32,64}bit indices of diverse format.
def _generate_sparse_data(X_csr):
"""Generate sparse matrices or arrays with {32,64}bit indices of diverse format.

Parameters
----------
X_csr: CSR Matrix
Input matrix in CSR format.
X_csr: scipy.sparse.csr_matrix or scipy.sparse.csr_array
Input in CSR format.

Returns
-------
out: iter(Matrices)
out: iter(Matrices) or iter(Arrays)
In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',
'coo_64', 'csc_64', 'csr_64']
"""
Expand Down Expand Up @@ -1029,19 +1030,18 @@ def check_array_api_input_and_values(
)


def check_estimator_sparse_data(name, estimator_orig):
def _check_estimator_sparse_container(name, estimator_orig, sparse_type):
rng = np.random.RandomState(0)
X = rng.uniform(size=(40, 3))
X[X < 0.8] = 0
X = _enforce_estimator_tags_X(estimator_orig, X)
X_csr = sparse.csr_matrix(X)
y = (4 * rng.uniform(size=40)).astype(int)
# catch deprecation warnings
with ignore_warnings(category=FutureWarning):
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)
tags = _safe_tags(estimator_orig)
for matrix_format, X in _generate_sparse_matrix(X_csr):
for matrix_format, X in _generate_sparse_data(sparse_type(X)):
# catch deprecation warnings
with ignore_warnings(category=FutureWarning):
estimator = clone(estimator_orig)
Expand All @@ -1052,13 +1052,14 @@ def check_estimator_sparse_data(name, estimator_orig):
err_msg = (
f"Estimator {name} doesn't seem to support {matrix_format} "
"matrix, and is not failing gracefully, e.g. by using "
"check_array(X, accept_large_sparse=False)"
"check_array(X, accept_large_sparse=False)."
)
else:
err_msg = (
f"Estimator {name} doesn't seem to fail gracefully on sparse "
"data: error message should state explicitly that sparse "
"input is not supported if this is not the case."
"input is not supported if this is not the case, e.g. by using "
"check_array(X, accept_sparse=False)."
)
with raises(
(TypeError, ValueError),
Expand All @@ -1083,6 +1084,15 @@ def check_estimator_sparse_data(name, estimator_orig):
assert probs.shape == expected_probs_shape


def check_estimator_sparse_matrix(name, estimator_orig):
    """Run the shared sparse-container check with ``csr_matrix`` input.

    Parameters
    ----------
    name : str
        Estimator name, forwarded to the shared check.

    estimator_orig : estimator object
        Estimator under test, forwarded to the shared check.
    """
    _check_estimator_sparse_container(
        name, estimator_orig, sparse_type=sparse.csr_matrix
    )


def check_estimator_sparse_array(name, estimator_orig):
    """Run the shared sparse-container check with ``csr_array`` input.

    The check does nothing when ``SPARSE_ARRAY_PRESENT`` is false.

    Parameters
    ----------
    name : str
        Estimator name, forwarded to the shared check.

    estimator_orig : estimator object
        Estimator under test, forwarded to the shared check.
    """
    if not SPARSE_ARRAY_PRESENT:
        return
    _check_estimator_sparse_container(
        name, estimator_orig, sparse_type=sparse.csr_array
    )


@ignore_warnings(category=FutureWarning)
def check_sample_weights_pandas_series(name, estimator_orig):
# check that estimators will accept a 'sample_weight' parameter of
Expand Down
19 changes: 19 additions & 0 deletions sklearn/utils/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,25 @@
BSR_CONTAINERS.append(scipy.sparse.bsr_array)
DIA_CONTAINERS.append(scipy.sparse.dia_array)


# `SPARRAY_PRESENT` records whether `scipy.sparse.sparray` can be imported from
# the installed scipy, so that callers can guard code referencing it.
# Remove when minimum scipy version is 1.11.0
try:
from scipy.sparse import sparray # noqa

SPARRAY_PRESENT = True
except ImportError:
SPARRAY_PRESENT = False


# `SPARSE_ARRAY_PRESENT` records whether `scipy.sparse.csr_array` can be
# imported from the installed scipy. Note that this is a different flag from
# `SPARRAY_PRESENT`, which probes for `scipy.sparse.sparray` instead.
# Remove when minimum scipy version is 1.8
try:
from scipy.sparse import csr_array # noqa

SPARSE_ARRAY_PRESENT = True
except ImportError:
SPARSE_ARRAY_PRESENT = False


try:
from scipy.optimize._linesearch import line_search_wolfe1, line_search_wolfe2
except ImportError: # SciPy < 1.8
Expand Down
43 changes: 35 additions & 8 deletions sklearn/utils/tests/test_estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
check_requires_y_none,
set_random_state,
)
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.fixes import CSR_CONTAINERS, SPARRAY_PRESENT
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

Expand Down Expand Up @@ -207,9 +207,17 @@ def fit(self, X, y):


class NoSparseClassifier(BaseBadClassifier):
    """Classifier that fails on sparse input with an unhelpful error.

    It is used to verify that the sparse-input estimator checks detect
    estimators which do not fail gracefully on sparse data.

    Parameters
    ----------
    raise_for_type : {"sparse_array", "sparse_matrix"} or None, default=None
        Which kind of sparse container triggers the error in `fit`. Any
        other value, including `None`, makes `fit` raise for every scipy
        sparse input.
    """

    def __init__(self, raise_for_type=None):
        # raise_for_type : str, expects "sparse_array" or "sparse_matrix"
        self.raise_for_type = raise_for_type

    def fit(self, X, y):
        """Validate the data and raise a deliberately meaningless error.

        Raises
        ------
        ValueError
            If `X` is a sparse container of the kind selected by
            `raise_for_type`.
        """
        X, y = self._validate_data(X, y, accept_sparse=["csr", "csc"])
        if self.raise_for_type == "sparse_array":
            correct_type = isinstance(X, sp.sparray)
        elif self.raise_for_type == "sparse_matrix":
            correct_type = isinstance(X, sp.spmatrix)
        else:
            # Without this fallback `correct_type` would be unbound for the
            # default `raise_for_type=None` and `fit` would die with an
            # UnboundLocalError instead of the intended ValueError.
            correct_type = sp.issparse(X)
        if correct_type:
            raise ValueError("Nonsensical Error")
        return self

Expand Down Expand Up @@ -357,6 +365,13 @@ def predict(self, X):


class LargeSparseNotSupportedClassifier(BaseEstimator):
"""Estimator that claims to support large sparse data
(accept_large_sparse=True), but doesn't"""

def __init__(self, raise_for_type=None):
# raise_for_type : str, expects "sparse_array" or "sparse_matrix"
self.raise_for_type = raise_for_type

def fit(self, X, y):
X, y = self._validate_data(
X,
Expand All @@ -366,11 +381,15 @@ def fit(self, X, y):
multi_output=True,
y_numeric=True,
)
if sp.issparse(X):
if X.getformat() == "coo":
if self.raise_for_type == "sparse_array":
correct_type = isinstance(X, sp.sparray)
elif self.raise_for_type == "sparse_matrix":
correct_type = isinstance(X, sp.spmatrix)
if correct_type:
if X.format == "coo":
if X.row.dtype == "int64" or X.col.dtype == "int64":
raise ValueError("Estimator doesn't support 64-bit indices")
elif X.getformat() in ["csc", "csr"]:
elif X.format in ["csc", "csr"]:
assert "int64" not in (
X.indices.dtype,
X.indptr.dtype,
Expand Down Expand Up @@ -634,11 +653,15 @@ def test_check_estimator():
)
with raises(AssertionError, match=msg):
check_estimator(NotInvariantPredict())
# check for sparse matrix input handling
# check for sparse data input handling
name = NoSparseClassifier.__name__
msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
with raises(AssertionError, match=msg):
check_estimator(NoSparseClassifier())
check_estimator(NoSparseClassifier("sparse_matrix"))

if SPARRAY_PRESENT:
with raises(AssertionError, match=msg):
check_estimator(NoSparseClassifier("sparse_array"))

# check for classifiers reducing to less than two classes via sample weights
name = OneClassSampleErrorClassifier.__name__
Expand All @@ -656,7 +679,11 @@ def test_check_estimator():
r"support \S{3}_64 matrix, and is not failing gracefully.*"
)
with raises(AssertionError, match=msg):
check_estimator(LargeSparseNotSupportedClassifier())
check_estimator(LargeSparseNotSupportedClassifier("sparse_matrix"))

if SPARRAY_PRESENT:
with raises(AssertionError, match=msg):
check_estimator(LargeSparseNotSupportedClassifier("sparse_array"))

# does error on binary_only untagged estimator
msg = "Only 2 classes are supported"
Expand Down