Skip to content

MAINT cleanup utils.__init__: move masking tools into utils._mask #28515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sklearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
_safe_indexing,
check_random_state,
column_or_1d,
indices_to_mask,
)
from ..utils._mask import indices_to_mask
from ..utils._param_validation import HasMethods, Interval, RealNotInt
from ..utils._tags import _safe_tags
from ..utils.metadata_routing import (
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/_huber.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from scipy import optimize

from ..base import BaseEstimator, RegressorMixin, _fit_context
from ..utils import axis0_safe_slice
from ..utils._mask import axis0_safe_slice
from ..utils._param_validation import Interval
from ..utils.extmath import safe_sparse_dot
from ..utils.optimize import _check_optimize_result
Expand Down
117 changes: 2 additions & 115 deletions sklearn/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ._bunch import Bunch
from ._chunking import gen_batches, gen_even_slices
from ._estimator_html_repr import estimator_html_repr
from ._mask import safe_mask
from ._param_validation import Interval, validate_params
from .class_weight import compute_class_weight, compute_sample_weight
from .deprecation import deprecated
Expand Down Expand Up @@ -64,7 +65,6 @@
"check_scalar",
"indexable",
"check_symmetric",
"indices_to_mask",
"deprecated",
"parallel_backend",
"register_parallel_backend",
Expand All @@ -76,6 +76,7 @@
"Bunch",
"metadata_routing",
"safe_sqr",
"safe_mask",
"gen_batches",
"gen_even_slices",
]
Expand All @@ -85,88 +86,6 @@
# True when platform.machine() reports a WebAssembly architecture
# ("wasm32" or "wasm64"); evaluated once at import time.
_IS_WASM = platform.machine() in ["wasm32", "wasm64"]


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "mask": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
    """Return a mask which is safe to use on X.

    A mask with a signed integer dtype is handed back untouched. Any other
    mask is turned into an array of integer positions when `X` exposes a
    ``toarray`` method (as the sparse matrix in the example below does);
    otherwise it is simply returned as an ndarray.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array-like
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        Array that is safe to use on X.

    Examples
    --------
    >>> from sklearn.utils import safe_mask
    >>> from scipy.sparse import csr_matrix
    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
    >>> condition = [False, True, True, False, True]
    >>> mask = safe_mask(data, condition)
    >>> data[mask].toarray()
    array([[2],
           [3],
           [5]])
    """
    mask_arr = np.asarray(mask)
    already_integer = np.issubdtype(mask_arr.dtype, np.signedinteger)

    if not already_integer and hasattr(X, "toarray"):
        # Translate the mask into explicit positions along the first axis.
        positions = np.arange(mask_arr.shape[0])
        mask_arr = positions[mask_arr]

    return mask_arr


def axis0_safe_slice(X, mask, len_mask):
    """Select rows of X with a mask, tolerating an empty selection.

    Compared with indexing through safe_mask directly, this helper
    short-circuits when `len_mask` is zero and returns an empty array
    instead of slicing. Slicing a sparse matrix with an all-False boolean
    mask raised an unhelpful error in older versions of SciPy.

    See: https://github.com/scipy/scipy/issues/5361

    The dot product in _huber_loss_and_gradient could instead be skipped by
    testing len_mask there, but that is not the bottleneck: the numbers of
    outliers and non-outliers are typically non-zero, and the extra check
    would make that code tougher to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : ndarray
        Mask to be used on X.

    len_mask : int
        The length of the mask.

    Returns
    -------
    X_sliced : {ndarray, sparse matrix}
        ``X[safe_mask(X, mask), :]`` when `len_mask` is non-zero, otherwise
        an empty ndarray of shape ``(0, X.shape[1])``.
    """
    if len_mask == 0:
        # Nothing selected: build the empty result without touching X's data.
        return np.zeros(shape=(0, X.shape[1]))

    row_selector = safe_mask(X, mask)
    return X[row_selector, :]


def _array_indexing(array, key, key_dtype, axis):
"""Index an array or scipy.sparse consistently across NumPy version."""
if issparse(array) and key_dtype == "bool":
Expand Down Expand Up @@ -806,38 +725,6 @@ def _to_object_array(sequence):
return out


def indices_to_mask(indices, mask_length):
    """Convert list of indices to boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices. May be empty, in which case
        the returned mask is all False.
    mask_length : int
        Length of boolean mask to be generated.
        This parameter must be greater than max(indices).

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Raises
    ------
    ValueError
        If `indices` is non-empty and `mask_length` is not greater than
        max(indices).

    Examples
    --------
    >>> from sklearn.utils import indices_to_mask
    >>> indices = [1, 2 , 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
    >>> indices_to_mask([], 3)
    array([False, False, False])
    """
    # np.max raises on zero-size input, so only validate and assign when
    # there is at least one index; an empty selection is an all-False mask.
    has_indices = np.size(indices) > 0

    if has_indices and mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    mask = np.zeros(mask_length, dtype=bool)
    if has_indices:
        mask[indices] = True

    return mask


def _message_with_time(source, message, time):
"""Create one line message for logging purposes.

Expand Down
115 changes: 115 additions & 0 deletions sklearn/utils/_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scipy import sparse as sp

from ._missing import is_scalar_nan
from ._param_validation import validate_params
from .fixes import _object_dtype_isnan


Expand Down Expand Up @@ -61,3 +62,117 @@ def _get_mask(X, value_to_mask):
)

return Xt_sparse


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "mask": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
    """Return a mask which is safe to use on X.

    A mask with a signed integer dtype is handed back untouched. Any other
    mask is turned into an array of integer positions when `X` exposes a
    ``toarray`` method (as the sparse matrix in the example below does);
    otherwise it is simply returned as an ndarray.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array-like
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        Array that is safe to use on X.

    Examples
    --------
    >>> from sklearn.utils import safe_mask
    >>> from scipy.sparse import csr_matrix
    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
    >>> condition = [False, True, True, False, True]
    >>> mask = safe_mask(data, condition)
    >>> data[mask].toarray()
    array([[2],
           [3],
           [5]])
    """
    mask_arr = np.asarray(mask)
    already_integer = np.issubdtype(mask_arr.dtype, np.signedinteger)

    if not already_integer and hasattr(X, "toarray"):
        # Translate the mask into explicit positions along the first axis.
        positions = np.arange(mask_arr.shape[0])
        mask_arr = positions[mask_arr]

    return mask_arr


def axis0_safe_slice(X, mask, len_mask):
    """Select rows of X with a mask, tolerating an empty selection.

    Compared with indexing through safe_mask directly, this helper
    short-circuits when `len_mask` is zero and returns an empty array
    instead of slicing. Slicing a sparse matrix with an all-False boolean
    mask raised an unhelpful error in older versions of SciPy.

    See: https://github.com/scipy/scipy/issues/5361

    The dot product in _huber_loss_and_gradient could instead be skipped by
    testing len_mask there, but that is not the bottleneck: the numbers of
    outliers and non-outliers are typically non-zero, and the extra check
    would make that code tougher to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : ndarray
        Mask to be used on X.

    len_mask : int
        The length of the mask.

    Returns
    -------
    X_sliced : {ndarray, sparse matrix}
        ``X[safe_mask(X, mask), :]`` when `len_mask` is non-zero, otherwise
        an empty ndarray of shape ``(0, X.shape[1])``.
    """
    if len_mask == 0:
        # Nothing selected: build the empty result without touching X's data.
        return np.zeros(shape=(0, X.shape[1]))

    row_selector = safe_mask(X, mask)
    return X[row_selector, :]


def indices_to_mask(indices, mask_length):
    """Convert list of indices to boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices. May be empty, in which case
        the returned mask is all False.
    mask_length : int
        Length of boolean mask to be generated.
        This parameter must be greater than max(indices).

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Raises
    ------
    ValueError
        If `indices` is non-empty and `mask_length` is not greater than
        max(indices).

    Examples
    --------
    >>> from sklearn.utils._mask import indices_to_mask
    >>> indices = [1, 2 , 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
    >>> indices_to_mask([], 3)
    array([False, False, False])
    """
    # np.max raises on zero-size input, so only validate and assign when
    # there is at least one index; an empty selection is an all-False mask.
    has_indices = np.size(indices) > 0

    if has_indices and mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    mask = np.zeros(mask_length, dtype=bool)
    if has_indices:
        mask[indices] = True

    return mask