Skip to content

MNT Correctly errors in check_array with dtype=numeric and string/bytes #18496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
56 changes: 42 additions & 14 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from sklearn.utils import check_X_y
from sklearn.utils import deprecated
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils.fixes import np_version, parse_version
from sklearn.utils.estimator_checks import _NotAnArray
from sklearn.random_projection import _sparse_random_matrix
from sklearn.linear_model import ARDRegression
Expand Down Expand Up @@ -336,18 +337,41 @@ def test_check_array():
result = check_array(X_no_array)
assert isinstance(result, np.ndarray)

# deprecation warning if string-like array with dtype="numeric"
expected_warn_regex = r"converted to decimal numbers if dtype='numeric'"
X_str = [['11', '12'], ['13', 'xx']]
for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]:
with pytest.warns(FutureWarning, match=expected_warn_regex):
check_array(X, dtype="numeric")

# deprecation warning if byte-like array with dtype="numeric"
X_bytes = [[b'a', b'b'], [b'c', b'd']]
for X in [X_bytes, np.array(X_bytes, dtype='V1')]:
with pytest.warns(FutureWarning, match=expected_warn_regex):
check_array(X, dtype="numeric")
# TODO: Check for error in 0.26 when implicit conversion is removed
@pytest.mark.parametrize(
    "X",
    [
        [['1', '2'], ['3', '4']],
        np.array([['1', '2'], ['3', '4']], dtype='U'),
        np.array([['1', '2'], ['3', '4']], dtype='S'),
        [[b'1', b'2'], [b'3', b'4']],
        np.array([[b'1', b'2'], [b'3', b'4']], dtype='V1'),
    ],
)
def test_check_array_numeric_warns(X):
    """Check the FutureWarning raised for str/bytes input with dtype="numeric".

    Each parametrized ``X`` stores digits as strings or bytes, either as a
    nested list or as a NumPy array with a ``U``, ``S`` or ``V1`` dtype.
    Passing it to ``check_array(X, dtype="numeric")`` must emit a
    ``FutureWarning`` whose message announces the 0.24 deprecation and the
    0.26 removal.
    """
    # ``pytest.warns`` applies ``match`` as a regular-expression search, so
    # the ``.*`` bridges the middle part of the warning message.
    deprecation_pattern = (
        r"Arrays of bytes/strings is being converted to decimal .*"
        r"deprecated in 0.24 and will be removed in 0.26"
    )
    with pytest.warns(FutureWarning, match=deprecation_pattern):
        check_array(X, dtype="numeric")


# TODO: remove in 0.26
@ignore_warnings(category=FutureWarning)
@pytest.mark.parametrize(
    "X",
    [
        [['11', '12'], ['13', 'xx']],
        np.array([['11', '12'], ['13', 'xx']], dtype='U'),
        np.array([['11', '12'], ['13', 'xx']], dtype='S'),
        [[b'a', b'b'], [b'c', b'd']],
        np.array([[b'a', b'b'], [b'c', b'd']], dtype='V1'),
    ],
)
def test_check_array_dtype_numeric_errors(X):
    """Error when a string-like array cannot be converted to numbers.

    Each parametrized ``X`` contains at least one entry that is not a number
    (``'xx'`` or a letter stored as bytes), so
    ``check_array(X, dtype="numeric")`` is expected to raise a ``ValueError``.
    The ``FutureWarning`` category is silenced by the decorator so that only
    the error is under test here.
    """
    # Skip void-dtype arrays on old NumPy; see the skip reason below.
    numpy_is_old = np_version < parse_version("1.14")
    if numpy_is_old and hasattr(X, "dtype") and X.dtype.kind == "V":
        pytest.skip("old numpy would convert V dtype into float silently")

    error_pattern = "Unable to convert array of bytes/strings"
    with pytest.raises(ValueError, match=error_pattern):
        check_array(X, dtype="numeric")


@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"])
Expand Down Expand Up @@ -381,16 +405,20 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
check_array(X, force_all_finite=True)


# TODO: remove this test in 0.26 once the deprecated behavior is removed
def test_check_array_pandas_dtype_object_conversion():
# test that data-frame like objects with dtype object
# get converted
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=object)
X_df = MockDataFrame(X)
assert check_array(X_df).dtype.kind == "f"
assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
with pytest.warns(FutureWarning):
assert check_array(X_df).dtype.kind == "f"
with pytest.warns(FutureWarning):
assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
# smoke-test against dataframes with column named "dtype"
X_df.dtype = "Hans"
assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
with pytest.warns(FutureWarning):
assert check_array(X_df, ensure_2d=False).dtype.kind == "f"


def test_check_array_pandas_dtype_casting():
Expand Down
26 changes: 13 additions & 13 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,20 +632,20 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
"your data has a single feature or array.reshape(1, -1) "
"if it contains a single sample.".format(array))

# in the future np.flexible dtypes will be handled like object dtypes
if dtype_numeric and np.issubdtype(array.dtype, np.flexible):
warnings.warn(
"Beginning in version 0.22, arrays of bytes/strings will be "
"converted to decimal numbers if dtype='numeric'. "
"It is recommended that you convert the array to "
"a float dtype before using it in scikit-learn, "
"for example by using "
"your_array = your_array.astype(np.float64).",
FutureWarning, stacklevel=2)

# make sure we actually converted to numeric:
if dtype_numeric and array.dtype.kind == "O":
array = array.astype(np.float64)
if dtype_numeric and array.dtype.kind in "OUSV":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't mind leaving U out of this here. But not strong feelings either way.

warnings.warn("Arrays of bytes/strings is being converted to "
"decimal numbers if dtype='numeric'. This behavior "
"is deprecated in 0.24 and will be removed in 0.26 "
"Please convert your data to numeric values "
"explicitly instead.",
FutureWarning, stacklevel=2)
try:
array = array.astype(np.float64)
except ValueError as e:
raise ValueError(
"Unable to convert array of bytes/strings "
"into decimal numbers with dtype='numeric'") from e
if not allow_nd and array.ndim >= 3:
raise ValueError("Found array with dim %d. %s expected <= 2."
% (array.ndim, estimator_name))
Expand Down