From 2f9e614afa16e2c8076b7d3e719d978be0d08285 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 21 Feb 2024 16:26:11 +0100 Subject: [PATCH] wip --- sklearn/ensemble/_bagging.py | 3 +- sklearn/linear_model/_huber.py | 2 +- sklearn/utils/__init__.py | 117 +-------------------------------- sklearn/utils/_mask.py | 115 ++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 117 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index da340ceec6fe4..7728779389511 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -16,7 +16,8 @@ from ..base import ClassifierMixin, RegressorMixin, _fit_context from ..metrics import accuracy_score, r2_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, column_or_1d, indices_to_mask +from ..utils import check_random_state, column_or_1d +from ..utils._mask import indices_to_mask from ..utils._param_validation import HasMethods, Interval, RealNotInt from ..utils._tags import _safe_tags from ..utils.metadata_routing import ( diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 554f693061116..4c60a2de8cb86 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,7 +7,7 @@ from scipy import optimize from ..base import BaseEstimator, RegressorMixin, _fit_context -from ..utils import axis0_safe_slice +from ..utils._mask import axis0_safe_slice from ..utils._param_validation import Interval from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e359a50c63aab..1484c5d6b93ac 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -20,6 +20,7 @@ from . 
import _joblib, metadata_routing from ._bunch import Bunch from ._estimator_html_repr import estimator_html_repr +from ._mask import safe_mask from ._param_validation import Integral, Interval, validate_params from .class_weight import compute_class_weight, compute_sample_weight from .deprecation import deprecated @@ -66,7 +67,6 @@ "check_scalar", "indexable", "check_symmetric", - "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", @@ -78,6 +78,7 @@ "Bunch", "metadata_routing", "safe_sqr", + "safe_mask", ] IS_PYPY = platform.python_implementation() == "PyPy" @@ -118,88 +119,6 @@ def _in_unstable_openblas_configuration(): return False -@validate_params( - { - "X": ["array-like", "sparse matrix"], - "mask": ["array-like"], - }, - prefer_skip_nested_validation=True, -) -def safe_mask(X, mask): - """Return a mask which is safe to use on X. - - Parameters - ---------- - X : {array-like, sparse matrix} - Data on which to apply mask. - - mask : array-like - Mask to be used on X. - - Returns - ------- - mask : ndarray - Array that is safe to use on X. - - Examples - -------- - >>> from sklearn.utils import safe_mask - >>> from scipy.sparse import csr_matrix - >>> data = csr_matrix([[1], [2], [3], [4], [5]]) - >>> condition = [False, True, True, False, True] - >>> mask = safe_mask(data, condition) - >>> data[mask].toarray() - array([[2], - [3], - [5]]) - """ - mask = np.asarray(mask) - if np.issubdtype(mask.dtype, np.signedinteger): - return mask - - if hasattr(X, "toarray"): - ind = np.arange(mask.shape[0]) - mask = ind[mask] - return mask - - -def axis0_safe_slice(X, mask, len_mask): - """Return a mask which is safer to use on X than safe_mask. - - This mask is safer than safe_mask since it returns an - empty array, when a sparse matrix is sliced with a boolean mask - with all False, instead of raising an unhelpful error in older - versions of SciPy. 
- - See: https://github.com/scipy/scipy/issues/5361 - - Also note that we can avoid doing the dot product by checking if - the len_mask is not zero in _huber_loss_and_gradient but this - is not going to be the bottleneck, since the number of outliers - and non_outliers are typically non-zero and it makes the code - tougher to follow. - - Parameters - ---------- - X : {array-like, sparse matrix} - Data on which to apply mask. - - mask : ndarray - Mask to be used on X. - - len_mask : int - The length of the mask. - - Returns - ------- - mask : ndarray - Array that is safe to use on X. - """ - if len_mask != 0: - return X[safe_mask(X, mask), :] - return np.zeros(shape=(0, X.shape[1])) - - def _array_indexing(array, key, key_dtype, axis): """Index an array or scipy.sparse consistently across NumPy version.""" if issparse(array) and key_dtype == "bool": @@ -965,38 +884,6 @@ def _to_object_array(sequence): return out -def indices_to_mask(indices, mask_length): - """Convert list of indices to boolean mask. - - Parameters - ---------- - indices : list-like - List of integers treated as indices. - mask_length : int - Length of boolean mask to be generated. - This parameter must be greater than max(indices). - - Returns - ------- - mask : 1d boolean nd-array - Boolean array that is True where indices are present, else False. - - Examples - -------- - >>> from sklearn.utils import indices_to_mask - >>> indices = [1, 2 , 3, 4] - >>> indices_to_mask(indices, 5) - array([False, True, True, True, True]) - """ - if mask_length <= np.max(indices): - raise ValueError("mask_length must be greater than max(indices)") - - mask = np.zeros(mask_length, dtype=bool) - mask[indices] = True - - return mask - - def _message_with_time(source, message, time): """Create one line message for logging purposes. 
diff --git a/sklearn/utils/_mask.py b/sklearn/utils/_mask.py
index 07332bf1edbd4..f862a147f18fb 100644
--- a/sklearn/utils/_mask.py
+++ b/sklearn/utils/_mask.py
@@ -4,6 +4,7 @@
 from scipy import sparse as sp
 
 from . import is_scalar_nan
+from ._param_validation import validate_params
 from .fixes import _object_dtype_isnan
 
 
@@ -61,3 +62,117 @@ def _get_mask(X, value_to_mask):
     )
 
     return Xt_sparse
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix"],
+        "mask": ["array-like"],
+    },
+    prefer_skip_nested_validation=True,
+)
+def safe_mask(X, mask):
+    """Return a mask which is safe to use on X.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}
+        Data on which to apply mask.
+
+    mask : array-like
+        Mask to be used on X.
+
+    Returns
+    -------
+    mask : ndarray
+        Array that is safe to use on X.
+
+    Examples
+    --------
+    >>> from sklearn.utils import safe_mask
+    >>> from scipy.sparse import csr_matrix
+    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
+    >>> condition = [False, True, True, False, True]
+    >>> mask = safe_mask(data, condition)
+    >>> data[mask].toarray()
+    array([[2],
+           [3],
+           [5]])
+    """
+    mask = np.asarray(mask)
+    if np.issubdtype(mask.dtype, np.signedinteger):
+        return mask
+
+    if hasattr(X, "toarray"):
+        ind = np.arange(mask.shape[0])
+        mask = ind[mask]
+    return mask
+
+
+def axis0_safe_slice(X, mask, len_mask):
+    """Return the rows of X selected by mask, safely for sparse input.
+
+    This is safer than indexing with safe_mask since it returns an
+    empty array, when a sparse matrix is sliced with a boolean mask
+    with all False, instead of raising an unhelpful error in older
+    versions of SciPy.
+
+    See: https://github.com/scipy/scipy/issues/5361
+
+    Also note that we can avoid doing the dot product by checking if
+    the len_mask is not zero in _huber_loss_and_gradient but this
+    is not going to be the bottleneck, since the number of outliers
+    and non_outliers are typically non-zero and it makes the code
+    tougher to follow.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}
+        Data on which to apply mask.
+
+    mask : ndarray
+        Mask to be used on X.
+
+    len_mask : int
+        The length of the mask.
+
+    Returns
+    -------
+    X_subset : {array-like, sparse matrix}
+        Rows of X selected by mask; empty (0, X.shape[1]) array if len_mask == 0.
+    """
+    if len_mask != 0:
+        return X[safe_mask(X, mask), :]
+    return np.zeros(shape=(0, X.shape[1]))
+
+
+def indices_to_mask(indices, mask_length):
+    """Convert list of indices to boolean mask.
+
+    Parameters
+    ----------
+    indices : list-like
+        List of integers treated as indices; may be empty.
+    mask_length : int
+        Length of boolean mask to be generated.
+        This parameter must be greater than max(indices).
+
+    Returns
+    -------
+    mask : 1d boolean nd-array
+        Boolean array that is True where indices are present, else False.
+
+    Examples
+    --------
+    >>> from sklearn.utils._mask import indices_to_mask
+    >>> indices = [1, 2, 3, 4]
+    >>> indices_to_mask(indices, 5)
+    array([False,  True,  True,  True,  True])
+    """
+    if mask_length <= np.max(indices, initial=-1):  # empty indices are allowed
+        raise ValueError("mask_length must be greater than max(indices)")
+
+    mask = np.zeros(mask_length, dtype=bool)
+    mask[indices] = True
+
+    return mask