diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 7b0c031f9196b..af08b832e9f6f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -2,6 +2,25 @@ .. currentmodule:: sklearn +.. _changes_0_22_1: + +Version 0.22.1 +============== + +**In Development** + +This is a bug-fix release to primarily resolve some packaging issues in version +0.22.0. It also includes minor documentation improvements and some bug fixes. + +Changelog +--------- + +:mod:`sklearn.utils` +.................... + +- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with + boolean columns to floats. :pr:`15797` by `Thomas Fan`_. + .. _changes_0_22: Version 0.22.0 diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 56efb98a8b2d8..bdd31f9c4859f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -826,6 +826,27 @@ def test_check_dataframe_warns_on_dtype(): assert len(record) == 0 +def test_check_dataframe_mixed_float_dtypes(): + # pandas dataframe will coerce a boolean into an object, this is a mismatch + # with np.result_type which will return a float + # check_array needs to explicitly check for bool dtype in a dataframe for + # this situation + # https://github.com/scikit-learn/scikit-learn/issues/15787 + + pd = importorskip("pandas") + df = pd.DataFrame({ + 'int': [1, 2, 3], + 'float': [0, 0.1, 2.1], + 'bool': [True, False, True]}, columns=['int', 'float', 'bool']) + + array = check_array(df, dtype=(np.float64, np.float32, np.float16)) + expected_array = np.array( + [[1.0, 0.0, 1.0], + [2.0, 0.1, 0.0], + [3.0, 2.1, 1.0]], dtype=np.float) + assert_allclose_dense_sparse(array, expected_array) + + class DummyMemory: def cache(self, func): return func diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 424cf4b5180a3..fb34f3b3cccbd 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -454,9 +454,14 @@ def 
check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): - dtypes_orig = np.array(array.dtypes) + dtypes_orig = list(array.dtypes) + # pandas boolean dtype __array__ interface coerces bools to objects + for i, dtype_iter in enumerate(dtypes_orig): + if dtype_iter.kind == 'b': + dtypes_orig[i] = np.object + if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): - dtype_orig = np.result_type(*array.dtypes) + dtype_orig = np.result_type(*dtypes_orig) if dtype_numeric: if dtype_orig is not None and dtype_orig.kind == "O":