Skip to content

[MRG+1] MAINT dissociate nan and inf in check_array #10459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 51 additions & 11 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from tempfile import NamedTemporaryFile
from itertools import product

import pytest
import numpy as np
from numpy.testing import assert_array_equal
import scipy.sparse as sp

from sklearn.utils.testing import assert_true, assert_false, assert_equal
Expand All @@ -18,6 +18,8 @@
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_allclose_dense_sparse
from sklearn.utils import as_float_array, check_array, check_symmetric
from sklearn.utils import check_X_y
from sklearn.utils.mocking import MockDataFrame
Expand Down Expand Up @@ -88,6 +90,17 @@ def test_as_float_array():
assert_false(np.isnan(M).any())


@pytest.mark.parametrize(
    "X",
    [np.random.random((10, 2)), sp.rand(10, 2).tocsr()],
)
def test_as_float_array_nan(X):
    """Check that as_float_array keeps NaN entries with 'allow-nan'.

    Runs once on a dense array and once on a CSR matrix: two entries are
    set to NaN and the converted output must compare equal to the input.
    """
    # Inject a NaN in each of the two columns.
    for row, col in ((5, 0), (6, 1)):
        X[row, col] = np.nan
    converted = as_float_array(X, force_all_finite='allow-nan')
    assert_allclose_dense_sparse(converted, X)


def test_np_matrix():
# Confirm that input validation code does not return np.matrix
X = np.arange(12).reshape(3, 4)
Expand Down Expand Up @@ -132,6 +145,43 @@ def test_ordering():
assert_false(X.data.flags['C_CONTIGUOUS'])


@pytest.mark.parametrize(
    "value, force_all_finite",
    [(np.inf, False), (np.nan, 'allow-nan'), (np.nan, False)]
)
@pytest.mark.parametrize(
    "retype",
    [np.asarray, sp.csr_matrix]
)
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    """Check that check_array accepts inf/NaN when force_all_finite allows it.

    Each (value, force_all_finite) pair is tried on a dense array and on a
    CSR matrix; check_array must not raise and must return data equal to
    the input.
    """
    # np.float64 instead of the ``np.float`` alias, which recent NumPy
    # releases no longer provide.
    X = retype(np.arange(4).reshape(2, 2).astype(np.float64))
    X[0, 0] = value
    X_checked = check_array(X, force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)


@pytest.mark.parametrize(
    "value, force_all_finite, match_msg",
    [(np.inf, True, 'Input contains NaN, infinity'),
     (np.inf, 'allow-nan', 'Input contains infinity'),
     (np.nan, True, 'Input contains NaN, infinity'),
     (np.nan, 'allow-inf', 'force_all_finite should be a bool or "allow-nan"'),
     # 1 == True, so it passes check_array's membership test against
     # (True, False, 'allow-nan') and is handled like True: the error
     # raised is the NaN/infinity one, not the parameter-validation one.
     (np.nan, 1, 'Input contains NaN, infinity')]
)
@pytest.mark.parametrize(
    "retype",
    [np.asarray, sp.csr_matrix]
)
def test_check_array_force_all_finiteinvalid(value, force_all_finite,
                                             match_msg, retype):
    """Check that check_array raises ValueError with the expected message.

    Each (value, force_all_finite, match_msg) triple is tried on a dense
    array and on a CSR matrix. ``match_msg`` is searched as a regular
    expression in the text of the raised ValueError.
    """
    # np.float64 instead of the ``np.float`` alias, which recent NumPy
    # releases no longer provide.
    X = retype(np.arange(4).reshape(2, 2).astype(np.float64))
    X[0, 0] = value
    # ``match`` (not ``message``) is the argument that checks the error text;
    # ``message`` only customised the failure output and verified nothing.
    with pytest.raises(ValueError, match=match_msg):
        check_array(X, force_all_finite=force_all_finite,
                    accept_sparse=True)


@ignore_warnings
def test_check_array():
# accept_sparse == None
Expand All @@ -153,16 +203,6 @@ def test_check_array():
X_ndim = np.arange(8).reshape(2, 2, 2)
assert_raises(ValueError, check_array, X_ndim)
check_array(X_ndim, allow_nd=True) # doesn't raise
# force_all_finite
X_inf = np.arange(4).reshape(2, 2).astype(np.float)
X_inf[0, 0] = np.inf
assert_raises(ValueError, check_array, X_inf)
check_array(X_inf, force_all_finite=False) # no raise
# nan check
X_nan = np.arange(4).reshape(2, 2).astype(np.float)
X_nan[0, 0] = np.nan
assert_raises(ValueError, check_array, X_nan)
check_array(X_inf, force_all_finite=False) # no raise

# dtype and order enforcement.
X_C = np.arange(4).reshape(2, 2).copy("C")
Expand Down
86 changes: 68 additions & 18 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,28 +31,35 @@
warnings.simplefilter('ignore', NonBLASDotWarning)


def _assert_all_finite(X, allow_nan=False):
    """Like assert_all_finite, but only for ndarray.

    Parameters
    ----------
    X : array-like
        Values to check; converted with ``np.asanyarray``.

    allow_nan : bool, (default=False)
        If True, NaN values are accepted and only infinite values raise.
        If False, both NaN and infinite values raise.

    Raises
    ------
    ValueError
        If X has a float or complex dtype and contains a disallowed value.
    """
    if _get_config()['assume_finite']:
        return
    X = np.asanyarray(X)
    # Only float ('f') and complex ('c') dtypes are checked.
    if X.dtype.kind not in 'fc':
        return
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in sum method.
    if np.isfinite(X.sum()):
        return
    if allow_nan:
        # NaN is tolerated: only infinite entries are an error.
        has_invalid = np.isinf(X).any()
        type_err = 'infinity'
    else:
        has_invalid = not np.isfinite(X).all()
        type_err = 'NaN, infinity'
    if has_invalid:
        msg_err = "Input contains {} or a value too large for {!r}."
        raise ValueError(msg_err.format(type_err, X.dtype))


def assert_all_finite(X, allow_nan=False):
    """Throw a ValueError if X contains NaN or infinity.

    Parameters
    ----------
    X : array or sparse matrix

    allow_nan : bool, (default=False)
        If True, NaN values are accepted and only infinite values raise.
    """
    # For sparse input only the explicitly stored values (``X.data``) are
    # checked.
    _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)


def as_float_array(X, copy=True, force_all_finite=True):
Expand All @@ -70,8 +77,17 @@ def as_float_array(X, copy=True, force_all_finite=True):
If True, a copy of X will be created. If False, a copy may still be
returned if X's dtype is not a floating point type.

force_all_finite : boolean (default=True)
Whether to raise an error on np.inf and np.nan in X.
force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf and np.nan in X. The possibilities
are:

- True: Force all values of X to be finite.
- False: accept both np.inf and np.nan in X.
- 'allow-nan': accept only np.nan values in X. Values cannot be
infinite.

.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.

Returns
-------
Expand Down Expand Up @@ -256,8 +272,17 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.

force_all_finite : boolean
Whether to raise an error on np.inf and np.nan in X.
force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf and np.nan in X. The possibilities
are:

- True: Force all values of X to be finite.
- False: accept both np.inf and np.nan in X.
- 'allow-nan': accept only np.nan values in X. Values cannot be
infinite.

.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.

Returns
-------
Expand Down Expand Up @@ -304,7 +329,9 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
warnings.warn("Can't check %s sparse matrix for nan or inf."
% spmatrix.format)
else:
_assert_all_finite(spmatrix.data)
_assert_all_finite(spmatrix.data,
allow_nan=force_all_finite == 'allow-nan')

return spmatrix


Expand Down Expand Up @@ -359,8 +386,17 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None,
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.

force_all_finite : boolean (default=True)
Whether to raise an error on np.inf and np.nan in X.
force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf and np.nan in X. The possibilities
are:

- True: Force all values of X to be finite.
- False: accept both np.inf and np.nan in X.
- 'allow-nan': accept only np.nan values in X. Values cannot be
infinite.

.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.

ensure_2d : boolean (default=True)
Whether to raise a value error if X is not 2d.
Expand Down Expand Up @@ -425,6 +461,10 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None,
# list of accepted types.
dtype = dtype[0]

if force_all_finite not in (True, False, 'allow-nan'):
raise ValueError('force_all_finite should be a bool or "allow-nan"'
'. Got {!r} instead'.format(force_all_finite))

if estimator is not None:
if isinstance(estimator, six.string_types):
estimator_name = estimator
Expand Down Expand Up @@ -483,7 +523,8 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None,
raise ValueError("Found array with dim %d. %s expected <= 2."
% (array.ndim, estimator_name))
if force_all_finite:
_assert_all_finite(array)
_assert_all_finite(array,
allow_nan=force_all_finite == 'allow-nan')

shape_repr = _shape_repr(array.shape)
if ensure_min_samples > 0:
Expand Down Expand Up @@ -555,9 +596,18 @@ def check_X_y(X, y, accept_sparse=False, dtype="numeric", order=None,
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.

force_all_finite : boolean (default=True)
force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf and np.nan in X. This parameter
does not influence whether y can have np.inf or np.nan values.
The possibilities are:

- True: Force all values of X to be finite.
- False: accept both np.inf and np.nan in X.
- 'allow-nan': accept only np.nan values in X. Values cannot be
infinite.

.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.

ensure_2d : boolean (default=True)
Whether to make X at least 2d.
Expand Down