diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 2a41f2ca0116a..9583de1f5b871 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -314,8 +314,9 @@ Changelog
 
 - |Enhancement| :func:`utils.safe_indexing` accepts an ``axis`` parameter to
   index array-like across rows and columns. The column indexing can be done on
-  NumPy array, SciPy sparse matrix, and Pandas DataFrame.
-  :pr:`14035` by :user:`Guillaume Lemaitre `.
+  NumPy array, SciPy sparse matrix, and Pandas DataFrame. In addition, the
+  indexing code was refactored into dedicated private helper functions.
+  :pr:`14035` and :pr:`14475` by :user:`Guillaume Lemaitre `.
 
 :mod:`sklearn.neighbors`
 ....................
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index ab5b4f941f4c7..fb46d07d172f6 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -20,7 +20,7 @@
 from ..utils import Bunch
 from ..utils import safe_indexing
 from ..utils import _get_column_indices
-from ..utils import _check_key_type
+from ..utils import _determine_key_type
 from ..utils.metaestimators import _BaseComposition
 from ..utils.validation import check_array, check_is_fitted
 
@@ -309,7 +309,8 @@ def _validate_remainder(self, X):
 
         # Make it possible to check for reordered named columns on transform
         if (hasattr(X, 'columns') and
-                any(_check_key_type(cols, str) for cols in self._columns)):
+                any(_determine_key_type(cols) == 'str'
+                    for cols in self._columns)):
             self._df_columns = X.columns
 
         self._n_features = X.shape[1]
@@ -755,6 +756,6 @@ def _is_negative_indexing(key):
     def is_neg(x): return isinstance(x, numbers.Integral) and x < 0
     if isinstance(key, slice):
         return is_neg(key.start) or is_neg(key.stop)
-    elif _check_key_type(key, int):
+    elif _determine_key_type(key) == 'int':
         return np.any(np.asarray(key) < 0)
     return False
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index f95a0d6cccc57..c26fab41bfc93 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -3,6 +3,7 @@
 """
 from collections.abc import Sequence
 from contextlib import contextmanager
+from itertools import compress
 from itertools import islice
 import numbers
 import platform
@@ -180,199 +181,165 @@ def axis0_safe_slice(X, mask, len_mask):
     return np.zeros(shape=(0, X.shape[1]))
 
 
-def safe_indexing(X, indices, axis=0):
-    """Return rows, items or columns of X using indices.
+def _array_indexing(array, key, key_dtype, axis): + """Index an array or scipy.sparse consistently across NumPy version.""" + if np_version < (1, 12) or issparse(array): + # FIXME: Remove the check for NumPy when using >= 1.12 + # check if we have an boolean array-likes to make the proper indexing + if key_dtype == 'bool': + key = np.asarray(key) + return array[key] if axis == 0 else array[:, key] + + +def _pandas_indexing(X, key, key_dtype, axis): + """Index a pandas dataframe or a series.""" + if hasattr(key, 'shape'): + # Work-around for indexing with read-only key in pandas + # FIXME: solved in pandas 0.25 + key = np.asarray(key) + key = key if key.flags.writeable else key.copy() + # check whether we should index with loc or iloc + indexer = X.iloc if key_dtype == 'int' else X.loc + return indexer[:, key] if axis else indexer[key] + + +def _list_indexing(X, key, key_dtype): + """Index a Python list.""" + if np.isscalar(key) or isinstance(key, slice): + # key is a slice or a scalar + return X[key] + if key_dtype == 'bool': + # key is a boolean array-like + return list(compress(X, key)) + # key is a integer array-like of key + return [X[idx] for idx in key] + + +def _determine_key_type(key): + """Determine the data type of key. Parameters ---------- - X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series - Data from which to sample rows, items or columns. - indices : array-like - - When ``axis=0``, indices need to be an array of integer. - - When ``axis=1``, indices can be one of: - - scalar: output is 1D, unless `X` is sparse. - Supported data types for scalars: - - integer: supported for arrays, sparse matrices and - dataframes. - - string (key-based): only supported for dataframes. - - container: lists, slices, boolean masks: output is 2D. - Supported data types for containers: - - integer or boolean (positional): supported for - arrays, sparse matrices and dataframes - - string (key-based): only supported for dataframes. No keys - other than strings are allowed. - axis : int, default=0 - The axis along which `X` will be subsampled. ``axis=0`` will select - rows while ``axis=1`` will select columns. + key : scalar, slice or array-like + The key from which we want to infer the data type. Returns ------- - subset - Subset of X on axis 0 or 1. - - Notes - ----- - CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are - not supported. + dtype : {'int', 'str', 'bool', None} + Returns the data type of key. """ - if axis == 0: - return _safe_indexing_row(X, indices) - elif axis == 1: - return _safe_indexing_column(X, indices) - else: - raise ValueError( - "'axis' should be either 0 (to index rows) or 1 (to index " - " column). Got {} instead.".format(axis) - ) + err_msg = ("No valid specification of the columns. Only a scalar, list or " + "slice of all integers or all strings, or boolean mask is " + "allowed") + dtype_to_str = {int: 'int', str: 'str', bool: 'bool', np.bool_: 'bool'} + array_dtype_to_str = {'i': 'int', 'u': 'int', 'b': 'bool', 'O': 'str', + 'U': 'str', 'S': 'str'} -def _array_indexing(array, key, axis=0): - """Index an array consistently across NumPy version.""" - if axis not in (0, 1): - raise ValueError( - "'axis' should be either 0 (to index rows) or 1 (to index " - " column). 
Got {} instead.".format(axis) - ) - if np_version < (1, 12) or issparse(array): - # check if we have an boolean array-likes to make the proper indexing - key_array = np.asarray(key) - if np.issubdtype(key_array.dtype, np.bool_): - key = key_array - return array[key] if axis == 0 else array[:, key] - + if key is None: + return None + if isinstance(key, tuple(dtype_to_str.keys())): + try: + return dtype_to_str[type(key)] + except KeyError: + raise ValueError(err_msg) + if isinstance(key, slice): + if key.start is None and key.stop is None: + return None + key_start_type = _determine_key_type(key.start) + key_stop_type = _determine_key_type(key.stop) + if key_start_type is not None and key_stop_type is not None: + if key_start_type != key_stop_type: + raise ValueError(err_msg) + if key_start_type is not None: + return key_start_type + return key_stop_type + if isinstance(key, list): + unique_key = set(key) + key_type = {_determine_key_type(elt) for elt in unique_key} + if not key_type: + return None + if len(key_type) != 1: + raise ValueError(err_msg) + return key_type.pop() + if hasattr(key, 'dtype'): + try: + return array_dtype_to_str[key.dtype.kind] + except KeyError: + raise ValueError(err_msg) + raise ValueError(err_msg) -def _safe_indexing_row(X, indices): - """Return items or rows from X using indices. - Allows simple indexing of lists or arrays. +def safe_indexing(X, indices, axis=0): + """Return rows, items or columns of X using indices. Parameters ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series - Data from which to sample rows or items. - indices : array-like of int - Indices according to which X will be subsampled. + Data from which to sample rows, items or columns. `list` are only + supported when `axis=0`. + indices : bool, int, str, slice, array-like + - If `axis=0`, boolean and integer array-like, integer slice, + and scalar integer are supported. + - If `axis=1`: + - to select a single column, `indices` can be of `int` type for + all `X` types and `str` only for dataframe. The selected subset + will be 1D, unless `X` is a sparse matrix in which case it will + be 2D. + - to select multiples columns, `indices` can be one of the + following: `list`, `array`, `slice`. The type used in + these containers can be one of the following: `int`, 'bool' and + `str`. However, `str` is only supported when `X` is a dataframe. + The selected subset will be 2D. + axis : int, default=0 + The axis along which `X` will be subsampled. `axis=0` will select + rows while `axis=1` will select columns. Returns ------- subset - Subset of X on first axis. + Subset of X on axis 0 or 1. Notes ----- CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - if hasattr(X, "iloc"): - # Work-around for indexing with read-only indices in pandas - indices = np.asarray(indices) - indices = indices if indices.flags.writeable else indices.copy() - # Pandas Dataframes and Series - try: - return X.iloc[indices] - except ValueError: - # Cython typed memoryviews internally used in pandas do not support - # readonly buffers. 
- warnings.warn("Copying input dataframe for slicing.", - DataConversionWarning) - return X.copy().iloc[indices] - elif hasattr(X, "shape"): - if hasattr(X, 'take') and (hasattr(indices, 'dtype') and - indices.dtype.kind == 'i'): - # This is often substantially faster than X[indices] - return X.take(indices, axis=0) - else: - return _array_indexing(X, indices, axis=0) - else: - return [X[idx] for idx in indices] - - -def _check_key_type(key, superclass): - """Check that scalar, list or slice is of a certain type. - - This is only used in _safe_indexing_column and _get_column_indices to check - if the ``key`` (column specification) is fully integer or fully - string-like. + if indices is None: + return X - Parameters - ---------- - key : scalar, list, slice, array-like - The column specification to check. - superclass : int or str - The type for which to check the `key`. - """ - if isinstance(key, superclass): - return True - if isinstance(key, slice): - return (isinstance(key.start, (superclass, type(None))) and - isinstance(key.stop, (superclass, type(None)))) - if isinstance(key, list): - return all(isinstance(x, superclass) for x in key) - if hasattr(key, 'dtype'): - if superclass is int: - return key.dtype.kind == 'i' - elif superclass is bool: - return key.dtype.kind == 'b' - else: - # superclass = str - return key.dtype.kind in ('O', 'U', 'S') - return False - - -def _safe_indexing_column(X, key): - """Get feature column(s) from input data X. + if axis not in (0, 1): + raise ValueError( + "'axis' should be either 0 (to index rows) or 1 (to index " + " column). Got {} instead.".format(axis) + ) - Supported input types (X): numpy arrays, sparse arrays and DataFrames. + indices_dtype = _determine_key_type(indices) - Supported key types (key): - - scalar: output is 1D; - - lists, slices, boolean masks: output is 2D. + if axis == 0 and indices_dtype == 'str': + raise ValueError( + "String indexing is not supported with 'axis=0'" + ) - Supported key data types: - - integer or boolean mask (positional): - - supported for arrays, sparse matrices and dataframes. - - string (key-based): - - only supported for dataframes; - - So no keys other than strings are allowed (while in principle you - can use any hashable object as key). - """ - # check that X is a 2D structure - if X.ndim != 2: + if axis == 1 and X.ndim != 2: raise ValueError( "'X' should be a 2D NumPy array, 2D sparse matrix or pandas " "dataframe when indexing the columns (i.e. 'axis=1'). " "Got {} instead with {} dimension(s).".format(type(X), X.ndim) ) - # check whether we have string column names or integers - if _check_key_type(key, int): - column_names = False - elif _check_key_type(key, str): - column_names = True - elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): - # boolean mask - column_names = False - if hasattr(X, 'loc'): - # pandas boolean masks don't work with iloc, so take loc path - column_names = True - else: - raise ValueError("No valid specification of the columns. 
Only a " - "scalar, list or slice of all integers or all " - "strings, or boolean mask is allowed") - if column_names: - if hasattr(X, 'loc'): - # pandas dataframes - return X.loc[:, key] - else: - raise ValueError("Specifying the columns using strings is only " - "supported for pandas DataFrames") + if axis == 1 and indices_dtype == 'str' and not hasattr(X, 'loc'): + raise ValueError( + "Specifying the columns using strings is only supported for " + "pandas DataFrames" + ) + + if hasattr(X, "iloc"): + return _pandas_indexing(X, indices, indices_dtype, axis=axis) + elif hasattr(X, "shape"): + return _array_indexing(X, indices, indices_dtype, axis=axis) else: - if hasattr(X, 'iloc'): - # pandas dataframes - return X.iloc[:, key] - else: - # numpy arrays, sparse arrays - return _array_indexing(X, key, axis=1) + return _list_indexing(X, indices, indices_dtype) def _get_column_indices(X, key): @@ -383,17 +350,22 @@ def _get_column_indices(X, key): """ n_columns = X.shape[1] - if (_check_key_type(key, int) - or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)): + key_dtype = _determine_key_type(key) + + if isinstance(key, list) and not key: + # we get an empty list + return [] + elif key_dtype in ('bool', 'int'): # Convert key into positive indexes try: idx = safe_indexing(np.arange(n_columns), key) except IndexError as e: raise ValueError( - 'all features must be in [0, %d]' % (n_columns - 1) + 'all features must be in [0, {}] or [-{}, 0]' + .format(n_columns - 1, n_columns) ) from e return np.atleast_1d(idx).tolist() - elif _check_key_type(key, str): + elif key_dtype == 'str': try: all_columns = list(X.columns) except AttributeError: diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 806295f1aae28..363b77a44b5fc 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -1,5 +1,5 @@ -from collections.abc import Iterable -from itertools import chain, product +from copy import copy +from itertools import chain import warnings import string import timeit @@ -13,9 +13,8 @@ assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) -from sklearn.utils import _array_indexing from sklearn.utils import check_random_state -from sklearn.utils import _check_key_type +from sklearn.utils import _determine_key_type from sklearn.utils import deprecated from sklearn.utils import _get_column_indices from sklearn.utils import resample @@ -199,114 +198,232 @@ def test_column_or_1d(): @pytest.mark.parametrize( - "key, clazz, is_expected_type", - [(0, int, True), - ('0', int, False), - ([0, 1, 2], int, True), - (['0', '1', '2'], int, False), - (slice(0, 2), int, True), - (np.array([0, 1, 2], dtype=np.int32), int, True), - (np.array([0, 1, 2], dtype=np.int64), int, True), - (np.array([0, 1, 2], dtype=np.uint8), int, False), - ([True, False], bool, True), - (np.array([True, False]), bool, True), - (np.array([True, False]), int, False), - ('col_0', str, True), - (['col_0', 'col_1', 'col_2'], str, True), - (slice('begin', 'end'), str, True), - (np.array(['col_0', 'col_1', 'col_2']), str, True), - (np.array(['col_0', 'col_1', 'col_2'], dtype=object), str, True)] + "key, dtype", + [(0, 'int'), + ('0', 'str'), + (True, 'bool'), + (np.bool_(True), 'bool'), + ([0, 1, 2], 'int'), + (['0', '1', '2'], 'str'), + (slice(None, None), None), + (slice(0, 2), 'int'), + (np.array([0, 1, 2], dtype=np.int32), 'int'), + (np.array([0, 1, 2], dtype=np.int64), 'int'), + (np.array([0, 1, 2], dtype=np.uint8), 'int'), + ([True, 
False], 'bool'), + (np.array([True, False]), 'bool'), + ('col_0', 'str'), + (['col_0', 'col_1', 'col_2'], 'str'), + (slice('begin', 'end'), 'str'), + (np.array(['col_0', 'col_1', 'col_2']), 'str'), + (np.array(['col_0', 'col_1', 'col_2'], dtype=object), 'str')] ) -def test_check_key_type(key, clazz, is_expected_type): - assert _check_key_type(key, clazz) is is_expected_type - - -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_axis_0(asarray): - X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - inds = np.array([1, 2]) if asarray else [1, 2] - X_inds = safe_indexing(X, inds) - X_arrays = safe_indexing(np.array(X), inds) - assert_array_equal(np.array(X_inds), X_arrays) - assert_array_equal(np.array(X_inds), np.array(X)[inds]) - - -@pytest.mark.parametrize("idx", [0, [0, 1]], ids=['scalar', 'list']) -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_axis_1_sparse(idx, asarray): - if isinstance(idx, Iterable) and asarray: - idx = np.asarray(idx) - X_true = safe_indexing(X_toy, idx, axis=1) - - # scipy matrix will always return a 2D array - if X_true.ndim == 1: - X_true = X_true[:, np.newaxis] - - X_sparse = sp.csc_matrix(X_toy) - assert_array_equal( - safe_indexing(X_sparse, idx, axis=1).toarray(), X_true +def test_determine_key_type(key, dtype): + assert _determine_key_type(key) == dtype + + +def test_determine_key_type_error(): + with pytest.raises(ValueError, match="No valid specification of the"): + _determine_key_type(1.0) + + +def _convert_container(container, constructor_name, columns_name=None): + if constructor_name == 'list': + return list(container) + elif constructor_name == 'array': + return np.asarray(container) + elif constructor_name == 'sparse': + return sp.csr_matrix(container) + elif constructor_name == 'dataframe': + pd = pytest.importorskip('pandas') + return pd.DataFrame(container, columns=columns_name) + elif constructor_name == 'series': + pd = pytest.importorskip('pandas') + return pd.Series(container) + elif constructor_name == 'slice': + return slice(container[0], container[1]) + + +@pytest.mark.parametrize( + "array_type", ["list", "array", "sparse", "dataframe"] +) +@pytest.mark.parametrize("indices_type", ["list", "array", "series", "slice"]) +def test_safe_indexing_2d_container_axis_0(array_type, indices_type): + indices = [1, 2] + if indices_type == 'slice' and isinstance(indices[1], int): + indices[1] += 1 + array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type) + indices = _convert_container(indices, indices_type) + subset = safe_indexing(array, indices, axis=0) + assert_allclose_dense_sparse( + subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type) + ) + + +@pytest.mark.parametrize("array_type", ["list", "array", "series"]) +@pytest.mark.parametrize("indices_type", ["list", "array", "series", "slice"]) +def test_safe_indexing_1d_container(array_type, indices_type): + indices = [1, 2] + if indices_type == 'slice' and isinstance(indices[1], int): + indices[1] += 1 + array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type) + indices = _convert_container(indices, indices_type) + subset = safe_indexing(array, indices, axis=0) + assert_allclose_dense_sparse( + subset, _convert_container([2, 3], array_type) + ) + + +@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) +@pytest.mark.parametrize("indices_type", ["list", "array", "series", "slice"]) +@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]]) 
+def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices): + # validation of the indices + # we make a copy because indices is mutable and shared between tests + indices_converted = copy(indices) + if indices_type == 'slice' and isinstance(indices[1], int): + indices_converted[1] += 1 + + columns_name = ['col_0', 'col_1', 'col_2'] + array = _convert_container( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name ) + indices_converted = _convert_container(indices_converted, indices_type) + if isinstance(indices[0], str) and array_type != 'dataframe': + err_msg = ("Specifying the columns using strings is only supported " + "for pandas DataFrames") + with pytest.raises(ValueError, match=err_msg): + safe_indexing(array, indices_converted, axis=1) + else: + subset = safe_indexing(array, indices_converted, axis=1) + assert_allclose_dense_sparse( + subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type) + ) + +@pytest.mark.parametrize("array_read_only", [True, False]) +@pytest.mark.parametrize("indices_read_only", [True, False]) +@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) +@pytest.mark.parametrize("indices_type", ["array", "series"]) @pytest.mark.parametrize( - "idx_array, idx_df", - [(0, 0), - (0, 'col_0'), - ([0, 1], [0, 1]), - ([0, 1], ['col_0', 'col_1']), - ([0, 1], slice(0, 2)), - ([1, 2], slice(1, None)), - ([0, 1], [True, True, False])], - ids=['scalar-int', 'scalar-str', 'list-int', 'list-str', 'slice', - 'slice-no-stop', 'mask'] + "axis, expected_array", + [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])] ) -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_axis_1_pandas(idx_array, idx_df, asarray): - pd = pytest.importorskip('pandas') - if asarray and isinstance(idx_array, Iterable): - idx_array = np.asarray(idx_array) - if (asarray and (not isinstance(idx_df, str) and - isinstance(idx_df, Iterable))): - idx_df = np.asarray(idx_df) - - X_true = safe_indexing(X_toy, idx_array, axis=1) - X_df = pd.DataFrame(X_toy, columns=['col_{}'.format(i) for i in range(3)]) - assert_array_equal( - safe_indexing(X_df, idx_df, axis=1).values, X_true +def test_safe_indexing_2d_read_only_axis_1(array_read_only, indices_read_only, + array_type, indices_type, axis, + expected_array): + array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + if array_read_only: + array.setflags(write=False) + array = _convert_container(array, array_type) + indices = np.array([1, 2]) + if indices_read_only: + indices.setflags(write=False) + indices = _convert_container(indices, indices_type) + subset = safe_indexing(array, indices, axis=axis) + assert_allclose_dense_sparse( + subset, _convert_container(expected_array, array_type) ) -def test_safe_indexing_pandas(): - pd = pytest.importorskip("pandas") - X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - X_df = pd.DataFrame(X) - inds = np.array([1, 2]) - X_df_indexed = safe_indexing(X_df, inds) - X_indexed = safe_indexing(X_df, inds) - assert_array_equal(np.array(X_df_indexed), X_indexed) - # fun with read-only data in dataframes - # this happens in joblib memmapping - X.setflags(write=False) - X_df_readonly = pd.DataFrame(X) - inds_readonly = inds.copy() - inds_readonly.setflags(write=False) - - for this_df, this_inds in product([X_df, X_df_readonly], - [inds, inds_readonly]): - with warnings.catch_warnings(record=True): - X_df_indexed = safe_indexing(this_df, this_inds) - - assert_array_equal(np.array(X_df_indexed), X_indexed) 
+@pytest.mark.parametrize("array_type", ["list", "array", "series"]) +@pytest.mark.parametrize("indices_type", ["list", "array", "series"]) +def test_safe_indexing_1d_container_mask(array_type, indices_type): + indices = [False] + [True] * 2 + [False] * 6 + array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type) + indices = _convert_container(indices, indices_type) + subset = safe_indexing(array, indices, axis=0) + assert_allclose_dense_sparse( + subset, _convert_container([2, 3], array_type) + ) + + +@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) +@pytest.mark.parametrize("indices_type", ["list", "array", "series"]) +@pytest.mark.parametrize( + "axis, expected_subset", + [(0, [[4, 5, 6], [7, 8, 9]]), + (1, [[2, 3], [5, 6], [8, 9]])] +) +def test_safe_indexing_2d_mask(array_type, indices_type, axis, + expected_subset): + columns_name = ['col_0', 'col_1', 'col_2'] + array = _convert_container( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name + ) + indices = [False, True, True] + indices = _convert_container(indices, indices_type) + + subset = safe_indexing(array, indices, axis=axis) + assert_allclose_dense_sparse( + subset, _convert_container(expected_subset, array_type) + ) + + +@pytest.mark.parametrize( + "array_type, expected_output_type", + [("list", "list"), ("array", "array"), + ("sparse", "sparse"), ("dataframe", "series")] +) +def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type): + array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type) + indices = 2 + subset = safe_indexing(array, indices, axis=0) + expected_array = _convert_container([7, 8, 9], expected_output_type) + assert_allclose_dense_sparse(subset, expected_array) + + +@pytest.mark.parametrize("array_type", ["list", "array", "series"]) +def test_safe_indexing_1d_scalar(array_type): + array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type) + indices = 2 + subset = safe_indexing(array, indices, axis=0) + assert subset == 3 @pytest.mark.parametrize( - "X, key, err_msg", - [(X_toy, 1.0, "No valid specification of the columns."), - (X_toy, ['col_0'], "Specifying the columns using strings is only")] + "array_type, expected_output_type", + [("array", "array"), ("sparse", "sparse"), ("dataframe", "series")] ) -def test_safe_indexing_axis_1_error(X, key, err_msg): +@pytest.mark.parametrize("indices", [2, "col_2"]) +def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, + indices): + columns_name = ['col_0', 'col_1', 'col_2'] + array = _convert_container( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name + ) + + if isinstance(indices, str) and array_type != 'dataframe': + err_msg = ("Specifying the columns using strings is only supported " + "for pandas DataFrames") + with pytest.raises(ValueError, match=err_msg): + safe_indexing(array, indices, axis=1) + else: + subset = safe_indexing(array, indices, axis=1) + expected_output = [3, 6, 9] + if expected_output_type == 'sparse': + # sparse matrix are keeping the 2D shape + expected_output = [[3], [6], [9]] + expected_array = _convert_container( + expected_output, expected_output_type + ) + assert_allclose_dense_sparse(subset, expected_array) + + +@pytest.mark.parametrize("array_type", ["list", "array", "sparse"]) +def test_safe_indexing_None_axis_0(array_type): + X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type) + X_subset = safe_indexing(X, None, axis=0) + assert_allclose_dense_sparse(X_subset, X) + + +def 
test_safe_indexing_pandas_no_matching_cols_error(): + pd = pytest.importorskip('pandas') + err_msg = "No valid specification of the columns." + X = pd.DataFrame(X_toy) with pytest.raises(ValueError, match=err_msg): - safe_indexing(X, key, axis=1) + safe_indexing(X, [1.0], axis=1) @pytest.mark.parametrize("axis", [None, 3]) @@ -331,6 +448,14 @@ def test_safe_indexing_1d_array_error(X_constructor): safe_indexing(X_constructor, [0, 1], axis=1) +def test_safe_indexing_container_axis_0_unsupported_type(): + indices = ["col_1", "col_2"] + array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + err_msg = "String indexing is not supported with 'axis=0'" + with pytest.raises(ValueError, match=err_msg): + safe_indexing(array, indices, axis=0) + + @pytest.mark.parametrize( "key, err_msg", [(10, r"all features must be in \[0, 2\]"), @@ -344,57 +469,6 @@ def test_get_column_indices_error(key, err_msg): _get_column_indices(X_df, key) -@pytest.mark.parametrize( - "idx", - [[0, 1], - [True, True, False]] -) -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_pandas_series(idx, asarray): - pd = pytest.importorskip("pandas") - idx = np.asarray(idx) if asarray else idx - serie = pd.Series(np.arange(3)) - assert_array_equal(safe_indexing(serie, idx).values, [0, 1]) - - -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_mock_pandas(asarray): - X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - X_df = MockDataFrame(X) - inds = np.array([1, 2]) if asarray else [1, 2] - X_df_indexed = safe_indexing(X_df, inds) - X_indexed = safe_indexing(X_df, inds) - assert_array_equal(np.array(X_df_indexed), X_indexed) - - -@pytest.mark.parametrize("array_type", ['array', 'sparse', 'dataframe']) -def test_safe_indexing_mask_axis_1(array_type): - # regression test for #14510 - # check that boolean array-like and boolean array lead to the same indexing - # even in NumPy < 1.12 - if array_type == 'array': - array_constructor = np.asarray - elif array_type == 'sparse': - array_constructor = sp.csr_matrix - elif array_type == 'dataframe': - pd = pytest.importorskip('pandas') - array_constructor = pd.DataFrame - - X = array_constructor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - mask = [True, False, True] - mask_array = np.array(mask) - X_masked = safe_indexing(X, mask, axis=1) - X_masked_array = safe_indexing(X, mask_array, axis=1) - assert_allclose_dense_sparse(X_masked, X_masked_array) - - -def test_array_indexing_array_error(): - X = np.array([[0, 1], [2, 3]]) - mask = [True, False] - with pytest.raises(ValueError, match="'axis' should be either 0"): - _array_indexing(X, mask, axis=3) - - def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A)
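A minimal usage sketch of the behaviour documented in the new `safe_indexing` docstring above, assuming scikit-learn at the state of this diff (where `safe_indexing` is still importable from `sklearn.utils`); the data and column names mirror the new tests, and the commented results are what the refactored helpers are expected to return.

# Sketch only: exercises the axis=0 / axis=1 paths added in this diff.
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.utils import safe_indexing

X_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
X_array = np.asarray(X_list)
X_sparse = sparse.csr_matrix(X_list)
X_df = pd.DataFrame(X_list, columns=['col_0', 'col_1', 'col_2'])

# axis=0 selects rows/items; integer and boolean keys work for every
# container, and plain Python lists are supported as well.
safe_indexing(X_list, [1, 2], axis=0)            # [[4, 5, 6], [7, 8, 9]]
safe_indexing(X_array, [False, True, True])      # same two rows, as an ndarray
safe_indexing(X_df, 2, axis=0)                   # scalar key -> pandas Series

# axis=1 selects columns; string keys are only accepted for dataframes.
safe_indexing(X_array, [0, 2], axis=1)           # columns 0 and 2
safe_indexing(X_sparse, [True, False, True], axis=1)   # stays 2D (sparse)
safe_indexing(X_df, ['col_0', 'col_2'], axis=1)        # dataframe columns

# String keys with axis=0, or string keys on a non-dataframe container,
# raise a ValueError with the messages added in this diff.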
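The key-type inference performed by the new private helper `_determine_key_type` can be summarised with the same cases as the parametrized test above; this is illustrative only, since the helper is private to `sklearn.utils`.

# Illustrative only: expected outputs mirror ``test_determine_key_type``.
import numpy as np

from sklearn.utils import _determine_key_type

assert _determine_key_type(0) == 'int'
assert _determine_key_type('col_0') == 'str'
assert _determine_key_type(True) == 'bool'
assert _determine_key_type([0, 1, 2]) == 'int'
assert _determine_key_type(slice(0, 2)) == 'int'
assert _determine_key_type(slice('begin', 'end')) == 'str'
assert _determine_key_type(slice(None, None)) is None
assert _determine_key_type(np.array([True, False])) == 'bool'
assert _determine_key_type(np.array(['col_0'], dtype=object)) == 'str'

# Anything else (e.g. a float key) is rejected with
# "No valid specification of the columns. ..."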
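A short sketch of the read-only-key work-around in `_pandas_indexing` (solved upstream in pandas 0.25 according to the FIXME): a non-writeable key, such as one produced by joblib memmapping, is copied before indexing instead of going through the old "Copying input dataframe for slicing." warning path.

# Sketch only: indexing a dataframe with a read-only key should now succeed
# silently, as covered by ``test_safe_indexing_2d_read_only_axis_1`` above.
import numpy as np
import pandas as pd

from sklearn.utils import safe_indexing

X_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
indices = np.array([1, 2])
indices.setflags(write=False)          # simulate a memmapped, read-only key
safe_indexing(X_df, indices, axis=0)   # rows 1 and 2, no warning raised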