diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 878e2ea809c01..e0ff0b9509c3b 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -21,8 +21,8 @@ _safe_indexing, check_random_state, column_or_1d, - indices_to_mask, ) +from ..utils._mask import indices_to_mask from ..utils._param_validation import HasMethods, Interval, RealNotInt from ..utils._tags import _safe_tags from ..utils.metadata_routing import ( diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 554f693061116..4c60a2de8cb86 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,7 +7,7 @@ from scipy import optimize from ..base import BaseEstimator, RegressorMixin, _fit_context -from ..utils import axis0_safe_slice +from ..utils._mask import axis0_safe_slice from ..utils._param_validation import Interval from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 9a2481393271a..f44c0ca078777 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -19,6 +19,7 @@ from ._bunch import Bunch from ._chunking import gen_batches, gen_even_slices from ._estimator_html_repr import estimator_html_repr +from ._mask import safe_mask from ._param_validation import Interval, validate_params from .class_weight import compute_class_weight, compute_sample_weight from .deprecation import deprecated @@ -64,7 +65,6 @@ "check_scalar", "indexable", "check_symmetric", - "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", @@ -76,6 +76,7 @@ "Bunch", "metadata_routing", "safe_sqr", + "safe_mask", "gen_batches", "gen_even_slices", ] @@ -85,88 +86,6 @@ _IS_WASM = platform.machine() in ["wasm32", "wasm64"] -@validate_params( - { - "X": ["array-like", "sparse matrix"], - "mask": ["array-like"], - }, - prefer_skip_nested_validation=True, -) -def safe_mask(X, mask): - """Return a mask which is safe to use on X. - - Parameters - ---------- - X : {array-like, sparse matrix} - Data on which to apply mask. - - mask : array-like - Mask to be used on X. - - Returns - ------- - mask : ndarray - Array that is safe to use on X. - - Examples - -------- - >>> from sklearn.utils import safe_mask - >>> from scipy.sparse import csr_matrix - >>> data = csr_matrix([[1], [2], [3], [4], [5]]) - >>> condition = [False, True, True, False, True] - >>> mask = safe_mask(data, condition) - >>> data[mask].toarray() - array([[2], - [3], - [5]]) - """ - mask = np.asarray(mask) - if np.issubdtype(mask.dtype, np.signedinteger): - return mask - - if hasattr(X, "toarray"): - ind = np.arange(mask.shape[0]) - mask = ind[mask] - return mask - - -def axis0_safe_slice(X, mask, len_mask): - """Return a mask which is safer to use on X than safe_mask. - - This mask is safer than safe_mask since it returns an - empty array, when a sparse matrix is sliced with a boolean mask - with all False, instead of raising an unhelpful error in older - versions of SciPy. - - See: https://github.com/scipy/scipy/issues/5361 - - Also note that we can avoid doing the dot product by checking if - the len_mask is not zero in _huber_loss_and_gradient but this - is not going to be the bottleneck, since the number of outliers - and non_outliers are typically non-zero and it makes the code - tougher to follow. - - Parameters - ---------- - X : {array-like, sparse matrix} - Data on which to apply mask. - - mask : ndarray - Mask to be used on X. - - len_mask : int - The length of the mask. - - Returns - ------- - mask : ndarray - Array that is safe to use on X. - """ - if len_mask != 0: - return X[safe_mask(X, mask), :] - return np.zeros(shape=(0, X.shape[1])) - - def _array_indexing(array, key, key_dtype, axis): """Index an array or scipy.sparse consistently across NumPy version.""" if issparse(array) and key_dtype == "bool": @@ -806,38 +725,6 @@ def _to_object_array(sequence): return out -def indices_to_mask(indices, mask_length): - """Convert list of indices to boolean mask. - - Parameters - ---------- - indices : list-like - List of integers treated as indices. - mask_length : int - Length of boolean mask to be generated. - This parameter must be greater than max(indices). - - Returns - ------- - mask : 1d boolean nd-array - Boolean array that is True where indices are present, else False. - - Examples - -------- - >>> from sklearn.utils import indices_to_mask - >>> indices = [1, 2 , 3, 4] - >>> indices_to_mask(indices, 5) - array([False, True, True, True, True]) - """ - if mask_length <= np.max(indices): - raise ValueError("mask_length must be greater than max(indices)") - - mask = np.zeros(mask_length, dtype=bool) - mask[indices] = True - - return mask - - def _message_with_time(source, message, time): """Create one line message for logging purposes. diff --git a/sklearn/utils/_mask.py b/sklearn/utils/_mask.py index 15ea55e7520e1..0a66dc5a20a81 100644 --- a/sklearn/utils/_mask.py +++ b/sklearn/utils/_mask.py @@ -4,6 +4,7 @@ from scipy import sparse as sp from ._missing import is_scalar_nan +from ._param_validation import validate_params from .fixes import _object_dtype_isnan @@ -61,3 +62,117 @@ def _get_mask(X, value_to_mask): ) return Xt_sparse + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "mask": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def safe_mask(X, mask): + """Return a mask which is safe to use on X. + + Parameters + ---------- + X : {array-like, sparse matrix} + Data on which to apply mask. + + mask : array-like + Mask to be used on X. + + Returns + ------- + mask : ndarray + Array that is safe to use on X. + + Examples + -------- + >>> from sklearn.utils import safe_mask + >>> from scipy.sparse import csr_matrix + >>> data = csr_matrix([[1], [2], [3], [4], [5]]) + >>> condition = [False, True, True, False, True] + >>> mask = safe_mask(data, condition) + >>> data[mask].toarray() + array([[2], + [3], + [5]]) + """ + mask = np.asarray(mask) + if np.issubdtype(mask.dtype, np.signedinteger): + return mask + + if hasattr(X, "toarray"): + ind = np.arange(mask.shape[0]) + mask = ind[mask] + return mask + + +def axis0_safe_slice(X, mask, len_mask): + """Return a mask which is safer to use on X than safe_mask. + + This mask is safer than safe_mask since it returns an + empty array, when a sparse matrix is sliced with a boolean mask + with all False, instead of raising an unhelpful error in older + versions of SciPy. + + See: https://github.com/scipy/scipy/issues/5361 + + Also note that we can avoid doing the dot product by checking if + the len_mask is not zero in _huber_loss_and_gradient but this + is not going to be the bottleneck, since the number of outliers + and non_outliers are typically non-zero and it makes the code + tougher to follow. + + Parameters + ---------- + X : {array-like, sparse matrix} + Data on which to apply mask. + + mask : ndarray + Mask to be used on X. + + len_mask : int + The length of the mask. + + Returns + ------- + mask : ndarray + Array that is safe to use on X. + """ + if len_mask != 0: + return X[safe_mask(X, mask), :] + return np.zeros(shape=(0, X.shape[1])) + + +def indices_to_mask(indices, mask_length): + """Convert list of indices to boolean mask. + + Parameters + ---------- + indices : list-like + List of integers treated as indices. + mask_length : int + Length of boolean mask to be generated. + This parameter must be greater than max(indices). + + Returns + ------- + mask : 1d boolean nd-array + Boolean array that is True where indices are present, else False. + + Examples + -------- + >>> from sklearn.utils._mask import indices_to_mask + >>> indices = [1, 2 , 3, 4] + >>> indices_to_mask(indices, 5) + array([False, True, True, True, True]) + """ + if mask_length <= np.max(indices): + raise ValueError("mask_length must be greater than max(indices)") + + mask = np.zeros(mask_length, dtype=bool) + mask[indices] = True + + return mask