From 6b7890e8f2c40bdd616a1e8b1e1856c23d89ceae Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 29 Sep 2020 13:46:52 -0400 Subject: [PATCH 1/8] MNT Correctly errors in check_array with dtype=numeric and string/bytes --- sklearn/utils/tests/test_validation.py | 21 ++++++++++++++++++--- sklearn/utils/validation.py | 20 +++++++------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 9adf431103cfb..9918fd28a0f9f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -336,17 +336,32 @@ def test_check_array(): result = check_array(X_no_array) assert isinstance(result, np.ndarray) + +def test_check_array_numeric(): + # simple test for check_array and dtype='numeric' + X_str = [['1', '2'], ['3', '4']] + for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: + X_out = check_array(X, dtype="numeric") + assert_allclose(X_out, [[1, 2], [3, 4]]) + + X_bytes = [[b'1', b'2'], [b'3', b'4']] + for X in [X_bytes, np.array(X_bytes, dtype='V1')]: + X_out = check_array(X, dtype="numeric") + assert_allclose(X_out, [[1, 2], [3, 4]]) + + +def test_check_array_dtype_numeric_errors(): # deprecation warning if string-like array with dtype="numeric" - expected_warn_regex = r"converted to decimal numbers if dtype='numeric'" + expected_warn_regex = "Unable to convert array of bytes/strings" X_str = [['11', '12'], ['13', 'xx']] for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: - with pytest.warns(FutureWarning, match=expected_warn_regex): + with pytest.raises(ValueError, match=expected_warn_regex): check_array(X, dtype="numeric") # deprecation warning if byte-like array with dtype="numeric" X_bytes = [[b'a', b'b'], [b'c', b'd']] for X in [X_bytes, np.array(X_bytes, dtype='V1')]: - with pytest.warns(FutureWarning, match=expected_warn_regex): + with pytest.raises(ValueError, match=expected_warn_regex): check_array(X, dtype="numeric") diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a0767d1c332d3..2921ec79621ce 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -632,20 +632,14 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, "your data has a single feature or array.reshape(1, -1) " "if it contains a single sample.".format(array)) - # in the future np.flexible dtypes will be handled like object dtypes - if dtype_numeric and np.issubdtype(array.dtype, np.flexible): - warnings.warn( - "Beginning in version 0.22, arrays of bytes/strings will be " - "converted to decimal numbers if dtype='numeric'. " - "It is recommended that you convert the array to " - "a float dtype before using it in scikit-learn, " - "for example by using " - "your_array = your_array.astype(np.float64).", - FutureWarning, stacklevel=2) - # make sure we actually converted to numeric: - if dtype_numeric and array.dtype.kind == "O": - array = array.astype(np.float64) + if dtype_numeric and array.dtype.kind in "OUSV": + try: + array = array.astype(np.float64) + except ValueError as e: + raise ValueError( + "Unable to convert array of bytes/strings " + "into decimal numbers with dtype='numeric'") from e if not allow_nd and array.ndim >= 3: raise ValueError("Found array with dim %d. %s expected <= 2." % (array.ndim, estimator_name)) From 765458cb9ede5e254d32a67e5e72027d5f7ea77d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 6 Nov 2020 12:14:52 -0500 Subject: [PATCH 2/8] ENH Adds warning when there is a converstion --- sklearn/utils/tests/test_validation.py | 51 ++++++++++++++------------ sklearn/utils/validation.py | 3 ++ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 9918fd28a0f9f..17282f08bf3e7 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -46,6 +46,7 @@ FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params from sklearn.utils.fixes import parse_version +from sklearn.exceptions import DataConversionWarning import sklearn @@ -337,32 +338,34 @@ def test_check_array(): assert isinstance(result, np.ndarray) -def test_check_array_numeric(): - # simple test for check_array and dtype='numeric' - X_str = [['1', '2'], ['3', '4']] - for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: +@pytest.mark.parametrize("X", [ + [['1', '2'], ['3', '4']], + np.array([['1', '2'], ['3', '4']], dtype='U'), + np.array([['1', '2'], ['3', '4']], dtype='S'), + [[b'1', b'2'], [b'3', b'4']], + np.array([[b'1', b'2'], [b'3', b'4']], dtype='V1') +]) +def test_check_array_numeric_warns(X): + """Test that check_array warns when it converts a bytes/string into a + float.""" + expected_msg = "Arrays of bytes/strings is being converted to decimal" + with pytest.warns(DataConversionWarning, match=expected_msg): X_out = check_array(X, dtype="numeric") - assert_allclose(X_out, [[1, 2], [3, 4]]) + assert_allclose(X_out, [[1, 2], [3, 4]]) - X_bytes = [[b'1', b'2'], [b'3', b'4']] - for X in [X_bytes, np.array(X_bytes, dtype='V1')]: - X_out = check_array(X, dtype="numeric") - assert_allclose(X_out, [[1, 2], [3, 4]]) - - -def test_check_array_dtype_numeric_errors(): - # deprecation warning if string-like array with dtype="numeric" - expected_warn_regex = "Unable to convert array of bytes/strings" - X_str = [['11', '12'], ['13', 'xx']] - for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: - with pytest.raises(ValueError, match=expected_warn_regex): - check_array(X, dtype="numeric") - - # deprecation warning if byte-like array with dtype="numeric" - X_bytes = [[b'a', b'b'], [b'c', b'd']] - for X in [X_bytes, np.array(X_bytes, dtype='V1')]: - with pytest.raises(ValueError, match=expected_warn_regex): - check_array(X, dtype="numeric") + +@pytest.mark.parametrize("X", [ + [['11', '12'], ['13', 'xx']], + np.array([['11', '12'], ['13', 'xx']], dtype='U'), + np.array([['11', '12'], ['13', 'xx']], dtype='S'), + [[b'a', b'b'], [b'c', b'd']], + np.array([[b'a', b'b'], [b'c', b'd']], dtype='V1') +]) +def test_check_array_dtype_numeric_errors(X): + """Error when string-ike array can not be converted""" + expected_warn_msg = "Unable to convert array of bytes/strings" + with pytest.raises(ValueError, match=expected_warn_msg): + check_array(X, dtype="numeric") @pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2921ec79621ce..9ba5c03a69800 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -634,6 +634,9 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind in "OUSV": + warnings.warn("Arrays of bytes/strings is being converted to " + "decimal numbers if dtype='numeric'", + DataConversionWarning, stacklevel=2) try: array = array.astype(np.float64) except ValueError as e: From 7880b2f344c625804081d3ff698167609c3233fc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 6 Nov 2020 12:57:16 -0500 Subject: [PATCH 3/8] ENH Adjusts to a deprecation warning --- sklearn/utils/tests/test_validation.py | 7 ++++--- sklearn/utils/validation.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 17282f08bf3e7..5897b65c3a262 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -46,7 +46,6 @@ FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params from sklearn.utils.fixes import parse_version -from sklearn.exceptions import DataConversionWarning import sklearn @@ -338,6 +337,7 @@ def test_check_array(): assert isinstance(result, np.ndarray) +# TODO: Check for error in 0.26 when implicit conversation is removed @pytest.mark.parametrize("X", [ [['1', '2'], ['3', '4']], np.array([['1', '2'], ['3', '4']], dtype='U'), @@ -348,8 +348,9 @@ def test_check_array(): def test_check_array_numeric_warns(X): """Test that check_array warns when it converts a bytes/string into a float.""" - expected_msg = "Arrays of bytes/strings is being converted to decimal" - with pytest.warns(DataConversionWarning, match=expected_msg): + expected_msg = (r"Arrays of bytes/strings is being converted to decimal .*" + r"deprecated in 0.24 and will be removed in 0.26") + with pytest.warns(FutureWarning, match=expected_msg): X_out = check_array(X, dtype="numeric") assert_allclose(X_out, [[1, 2], [3, 4]]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9ba5c03a69800..1b8ba8489cac1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -635,8 +635,9 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind in "OUSV": warnings.warn("Arrays of bytes/strings is being converted to " - "decimal numbers if dtype='numeric'", - DataConversionWarning, stacklevel=2) + "decimal numbers if dtype='numeric'. This behavior " + "is deprecated in 0.24 and will be removed in 0.26", + FutureWarning, stacklevel=2) try: array = array.astype(np.float64) except ValueError as e: From ad3de5062bbab229039415b98b16944734192439 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 11 Nov 2020 19:21:27 -0500 Subject: [PATCH 4/8] TST Remove value check --- sklearn/utils/tests/test_validation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5897b65c3a262..21d8ddf552074 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -351,8 +351,7 @@ def test_check_array_numeric_warns(X): expected_msg = (r"Arrays of bytes/strings is being converted to decimal .*" r"deprecated in 0.24 and will be removed in 0.26") with pytest.warns(FutureWarning, match=expected_msg): - X_out = check_array(X, dtype="numeric") - assert_allclose(X_out, [[1, 2], [3, 4]]) + check_array(X, dtype="numeric") @pytest.mark.parametrize("X", [ From 593e123594bb26681de52e6ea98b0658401a1a9f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 18 Nov 2020 14:55:31 +0100 Subject: [PATCH 5/8] Update sklearn/utils/validation.py --- sklearn/utils/validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1b8ba8489cac1..cfeeb7234ee4f 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -637,6 +637,8 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, warnings.warn("Arrays of bytes/strings is being converted to " "decimal numbers if dtype='numeric'. This behavior " "is deprecated in 0.24 and will be removed in 0.26", + "Please convert your data to numeric values", + "explicitly instead." FutureWarning, stacklevel=2) try: array = array.astype(np.float64) From 0802613ada10f6223c8f0140ff50580dc2fb8bf5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 18 Nov 2020 15:14:19 +0100 Subject: [PATCH 6/8] Ignore warning in test_check_array_dtype_numeric_errors --- sklearn/utils/tests/test_validation.py | 2 ++ sklearn/utils/validation.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 21d8ddf552074..dca3972ede912 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -354,6 +354,8 @@ def test_check_array_numeric_warns(X): check_array(X, dtype="numeric") +# TODO: remove in 0.26 +@ignore_warnings(category=FutureWarning) @pytest.mark.parametrize("X", [ [['11', '12'], ['13', 'xx']], np.array([['11', '12'], ['13', 'xx']], dtype='U'), diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index cfeeb7234ee4f..502d8b107c0d5 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -636,9 +636,9 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, if dtype_numeric and array.dtype.kind in "OUSV": warnings.warn("Arrays of bytes/strings is being converted to " "decimal numbers if dtype='numeric'. This behavior " - "is deprecated in 0.24 and will be removed in 0.26", - "Please convert your data to numeric values", - "explicitly instead." + "is deprecated in 0.24 and will be removed in 0.26 " + "Please convert your data to numeric values " + "explicitly instead.", FutureWarning, stacklevel=2) try: array = array.astype(np.float64) From ec8a61e5709138c52c36bee12349b42adc3def2a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 18 Nov 2020 16:27:19 +0100 Subject: [PATCH 7/8] Update test to take deprecation into account --- sklearn/utils/tests/test_validation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index dca3972ede912..01cb8d00bd404 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -401,16 +401,20 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): check_array(X, force_all_finite=True) +# TODO: remove test in 0.26 once this behavior is deprecated def test_check_array_pandas_dtype_object_conversion(): # test that data-frame like objects with dtype object # get converted X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=object) X_df = MockDataFrame(X) - assert check_array(X_df).dtype.kind == "f" - assert check_array(X_df, ensure_2d=False).dtype.kind == "f" + with pytest.warns(FutureWarning): + assert check_array(X_df).dtype.kind == "f" + with pytest.warns(FutureWarning): + assert check_array(X_df, ensure_2d=False).dtype.kind == "f" # smoke-test against dataframes with column named "dtype" X_df.dtype = "Hans" - assert check_array(X_df, ensure_2d=False).dtype.kind == "f" + with pytest.warns(FutureWarning): + assert check_array(X_df, ensure_2d=False).dtype.kind == "f" def test_check_array_pandas_dtype_casting(): From 6789dbff2189f779e46113b6a0204eba790bd611 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 18 Nov 2020 17:34:19 +0100 Subject: [PATCH 8/8] Backward compat for numpy 1.13 --- sklearn/utils/tests/test_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 01cb8d00bd404..14b718dfc8df6 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -22,6 +22,7 @@ from sklearn.utils import check_X_y from sklearn.utils import deprecated from sklearn.utils._mocking import MockDataFrame +from sklearn.utils.fixes import np_version, parse_version from sklearn.utils.estimator_checks import _NotAnArray from sklearn.random_projection import _sparse_random_matrix from sklearn.linear_model import ARDRegression @@ -365,6 +366,9 @@ def test_check_array_numeric_warns(X): ]) def test_check_array_dtype_numeric_errors(X): """Error when string-ike array can not be converted""" + if (np_version < parse_version("1.14") + and hasattr(X, "dtype") and X.dtype.kind == "V"): + pytest.skip("old numpy would convert V dtype into float silently") expected_warn_msg = "Unable to convert array of bytes/strings" with pytest.raises(ValueError, match=expected_warn_msg): check_array(X, dtype="numeric")