From cd85bb84591ba7ddda694367e3f4ea61c8f9b930 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Tue, 17 Jul 2018 01:04:29 +0200 Subject: [PATCH] Rework warning in check_array when silent convert string to float --- sklearn/utils/tests/test_validation.py | 39 ++++++-------------------- sklearn/utils/validation.py | 10 ++++--- 2 files changed, 14 insertions(+), 35 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index deec9a50179bf..57f84726791fa 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -291,40 +291,17 @@ def test_check_array(): assert_true(isinstance(result, np.ndarray)) # deprecation warning if string-like array with dtype="numeric" - X_str = [['a', 'b'], ['c', 'd']] - assert_warns_message( - FutureWarning, - "arrays of strings will be interpreted as decimal numbers if " - "parameter 'dtype' is 'numeric'. It is recommended that you convert " - "the array to type np.float64 before passing it to check_array.", - check_array, X_str, "numeric") - assert_warns_message( - FutureWarning, - "arrays of strings will be interpreted as decimal numbers if " - "parameter 'dtype' is 'numeric'. It is recommended that you convert " - "the array to type np.float64 before passing it to check_array.", - check_array, np.array(X_str, dtype='U'), "numeric") - assert_warns_message( - FutureWarning, - "arrays of strings will be interpreted as decimal numbers if " - "parameter 'dtype' is 'numeric'. It is recommended that you convert " - "the array to type np.float64 before passing it to check_array.", - check_array, np.array(X_str, dtype='S'), "numeric") + expected_warn_regex = r"converted to decimal numbers if dtype='numeric'" + X_str = [['11', '12'], ['13', 'xx']] + for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: + with pytest.warns(FutureWarning, match=expected_warn_regex): + check_array(X, dtype="numeric") # deprecation warning if byte-like array with dtype="numeric" X_bytes = [[b'a', b'b'], [b'c', b'd']] - assert_warns_message( - FutureWarning, - "arrays of strings will be interpreted as decimal numbers if " - "parameter 'dtype' is 'numeric'. It is recommended that you convert " - "the array to type np.float64 before passing it to check_array.", - check_array, X_bytes, "numeric") - assert_warns_message( - FutureWarning, - "arrays of strings will be interpreted as decimal numbers if " - "parameter 'dtype' is 'numeric'. It is recommended that you convert " - "the array to type np.float64 before passing it to check_array.", - check_array, np.array(X_bytes, dtype='V1'), "numeric") + for X in [X_bytes, np.array(X_bytes, dtype='V1')]: + with pytest.warns(FutureWarning, match=expected_warn_regex): + check_array(X, dtype="numeric") def test_check_array_pandas_dtype_object_conversion(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a000d935624c6..40ef0e9dc60f7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -546,10 +546,12 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # in the future np.flexible dtypes will be handled like object dtypes if dtype_numeric and np.issubdtype(array.dtype, np.flexible): warnings.warn( - "Beginning in version 0.22, arrays of strings will be " - "interpreted as decimal numbers if parameter 'dtype' is " - "'numeric'. It is recommended that you convert the array to " - "type np.float64 before passing it to check_array.", + "Beginning in version 0.22, arrays of bytes/strings will be " + "converted to decimal numbers if dtype='numeric'. " + "It is recommended that you convert the array to " + "a float dtype before using it in scikit-learn, " + "for example by using " + "your_array = your_array.astype(np.float64).", FutureWarning) # make sure we actually converted to numeric: