Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@ Enhancements
- Added ability to set ``n_jobs`` parameter to :func:`pipeline.make_union`.
A ``TypeError`` will be raised for any other kwargs. :issue:`8028`
by :user:`Alexander Booth <alexandercbooth>`.

- Added type checking to the ``accept_sparse`` parameter of the
:mod:`sklearn.utils.validation` functions. This parameter now accepts only
boolean, string, or list/tuple of strings. ``accept_sparse=None`` is deprecated
and should be replaced by ``accept_sparse=False``.
:issue:`7880` by :user:`Josh Karnofsky <jkarno>`.

Bug fixes
.........
Expand Down
42 changes: 42 additions & 0 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,48 @@ def test_check_array_dtype_warning():
assert_equal(X_checked.format, 'csr')


def test_check_array_accept_sparse_type_exception():
    """Check the errors ``check_array`` raises for bad ``accept_sparse`` values.

    With a CSR matrix as input the test expects:

    * ``TypeError`` when ``accept_sparse`` is ``False`` or ``None``;
    * ``ValueError`` when it is an object of an unsupported type;
    * ``ValueError`` when it is an empty list or tuple;
    * ``TypeError`` when it is a list holding a non-string object.

    It finally checks that ``accept_sparse=None`` emits a
    ``DeprecationWarning`` for dense input.
    """
    dense = [[1, 2], [3, 4]]
    sparse = sp.csr_matrix(dense)
    bad_value = SVR()

    # Both the explicit False and the deprecated None must reject sparse data.
    expected = ("A sparse matrix was passed, but dense data is required. "
                "Use X.toarray() to convert to a dense numpy array.")
    for rejecting in (False, None):
        assert_raise_message(TypeError, expected,
                             check_array, sparse, accept_sparse=rejecting)

    # An arbitrary object is echoed back in the error message.
    expected = ("Parameter 'accept_sparse' should be a string, "
                "boolean or list of strings. You provided 'accept_sparse={}'.")
    assert_raise_message(ValueError, expected.format(bad_value),
                         check_array, sparse, accept_sparse=bad_value)

    # Empty containers carry no target format; the message has no
    # placeholder, so it is compared as-is.
    expected = ("When providing 'accept_sparse' as a tuple or list, "
                "it must contain at least one string value.")
    for empty in ([], ()):
        assert_raise_message(ValueError, expected,
                             check_array, sparse, accept_sparse=empty)

    # A list containing a non-string entry is expected to end in a TypeError
    # whose message mentions the offending object's type.
    assert_raise_message(TypeError, "'SVR' object",
                         check_array, sparse, accept_sparse=[bad_value])

    # Test deprecation of 'None'
    assert_warns(DeprecationWarning, check_array, dense, accept_sparse=None)


def test_check_array_accept_sparse_no_exception():
    """Check that valid ``accept_sparse`` values let a CSR matrix through.

    ``True``, a format string, a list of formats and a tuple of formats are
    each passed to ``check_array``; none of the calls should raise.
    """
    sparse = sp.csr_matrix([[1, 2], [3, 4]])

    for allowed in (True, 'csr', ['csr'], ('csr',)):
        check_array(sparse, accept_sparse=allowed)


def test_check_array_min_samples_and_features_messages():
# empty list is considered 2D by default:
msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
Expand Down
79 changes: 51 additions & 28 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,40 +200,55 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
spmatrix : scipy sparse matrix
Input to validate and convert.

accept_sparse : string, list of string or None (default=None)
accept_sparse : string, boolean or list/tuple of strings
String[s] representing allowed sparse matrix formats ('csc',
'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). None means that sparse
matrix input will raise an error. If the input is sparse but not in
the allowed format, it will be converted to the first listed format.
'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
not in the allowed format, it will be converted to the first listed
format. True allows the input to be any format. False means
that a sparse matrix input will raise an error.

dtype : string, type or None (default=none)
dtype : string, type or None
Data type of result. If None, the dtype of the input is preserved.

copy : boolean (default=False)
copy : boolean
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.

force_all_finite : boolean (default=True)
force_all_finite : boolean
Whether to raise an error on np.inf and np.nan in X.

Returns
-------
spmatrix_converted : scipy sparse matrix.
Matrix that is ensured to have an allowed type.
"""
if accept_sparse in [None, False]:
raise TypeError('A sparse matrix was passed, but dense '
'data is required. Use X.toarray() to '
'convert to a dense numpy array.')
if dtype is None:
dtype = spmatrix.dtype

changed_format = False
if (isinstance(accept_sparse, (list, tuple))
and spmatrix.format not in accept_sparse):
# create new with correct sparse
spmatrix = spmatrix.asformat(accept_sparse[0])
changed_format = True

if isinstance(accept_sparse, six.string_types):
accept_sparse = [accept_sparse]

if accept_sparse is False:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use elif and remove the newline mostly for consistency.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That won't work, right? We want to end up in the case below where isinstance(accept_sparse, (list, tuple))

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That won't work, right?

Oh yeah, sorry I missed that. Ignore my comment then.

raise TypeError('A sparse matrix was passed, but dense '
'data is required. Use X.toarray() to '
'convert to a dense numpy array.')
elif isinstance(accept_sparse, (list, tuple)):
if len(accept_sparse) == 0:
raise ValueError("When providing 'accept_sparse' "
"as a tuple or list, it must contain at "
"least one string value.")
# ensure correct sparse format
if spmatrix.format not in accept_sparse:
# create new with correct sparse
spmatrix = spmatrix.asformat(accept_sparse[0])
changed_format = True
elif accept_sparse is not True:
# any other type
raise ValueError("Parameter 'accept_sparse' should be a string, "
"boolean or list of strings. You provided "
"'accept_sparse={}'.".format(accept_sparse))

if dtype != spmatrix.dtype:
# convert dtype
Expand All @@ -251,7 +266,7 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
return spmatrix


def check_array(array, accept_sparse=None, dtype="numeric", order=None,
def check_array(array, accept_sparse=False, dtype="numeric", order=None,
copy=False, force_all_finite=True, ensure_2d=True,
allow_nd=False, ensure_min_samples=1, ensure_min_features=1,
warn_on_dtype=False, estimator=None):
Expand All @@ -266,11 +281,12 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
array : object
Input object to check / convert.

accept_sparse : string, list of string or None (default=None)
accept_sparse : string, boolean or list/tuple of strings (default=False)
String[s] representing allowed sparse matrix formats, such as 'csc',
'csr', etc. None means that sparse matrix input will raise an error.
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.
'csr', etc. If the input is sparse but not in the allowed format,
it will be converted to the first listed format. True allows the input
to be any format. False means that a sparse matrix input will
raise an error.

dtype : string, type, list of types or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
Expand Down Expand Up @@ -321,8 +337,14 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
X_converted : object
The converted and validated X.
"""
if isinstance(accept_sparse, str):
accept_sparse = [accept_sparse]
# accept_sparse 'None' deprecation check
if accept_sparse is None:
warnings.warn(
"Passing 'None' to parameter 'accept_sparse' in methods "
"check_array and check_X_y is deprecated in version 0.19 "
"and will be removed in 0.21. Use 'accept_sparse=False' "
" instead.", DeprecationWarning)
accept_sparse = False

# store whether originally we wanted numeric dtype
dtype_numeric = dtype == "numeric"
Expand Down Expand Up @@ -406,7 +428,7 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
return array


def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None,
def check_X_y(X, y, accept_sparse=False, dtype="numeric", order=None,
copy=False, force_all_finite=True, ensure_2d=True,
allow_nd=False, multi_output=False, ensure_min_samples=1,
ensure_min_features=1, y_numeric=False,
Expand All @@ -427,11 +449,12 @@ def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None,
y : nd-array, list or sparse matrix
Labels.

accept_sparse : string, list of string or None (default=None)
accept_sparse : string, boolean or list/tuple of strings (default=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you need to deprecate accept_sparse=None here too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean raise a warning? But check_array is called...

Copy link
Member

@lesteve lesteve Dec 13, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah sorry I missed that, maybe the deprecation message in check_array can be changed to mention both check_array and check_X_y then.

String[s] representing allowed sparse matrix formats, such as 'csc',
'csr', etc. None means that sparse matrix input will raise an error.
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.
'csr', etc. If the input is sparse but not in the allowed format,
it will be converted to the first listed format. True allows the input
to be any format. False means that a sparse matrix input will
raise an error.

dtype : string, type, list of types or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
Expand Down