
MAINT cleanup utils.__init__: move safe_sqr and _approximate_mode into extmath #28481

Merged: 2 commits, Feb 21, 2024
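In short, this PR moves `safe_sqr` and the private helper `_approximate_mode` out of `sklearn/utils/__init__.py` and into `sklearn/utils/extmath.py`, updates the single caller in `sklearn/model_selection/_split.py`, and relocates the corresponding test. `safe_sqr` is re-exported from `sklearn.utils` (and added to `__all__`), so the public import path keeps working. A minimal sketch of the resulting import paths, assuming no other behaviour changes:

```python
import numpy as np

# Canonical location after this PR:
from sklearn.utils.extmath import safe_sqr, _approximate_mode

# Still re-exported from sklearn.utils (see the __init__.py diff below),
# so existing user code keeps working:
from sklearn.utils import safe_sqr as safe_sqr_public

print(safe_sqr([1, 2, 3]))                                    # array([1, 4, 9])
print(_approximate_mode(np.array([4, 2]), n_draws=3, rng=0))  # array([2, 1])
```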
2 changes: 1 addition & 1 deletion sklearn/model_selection/_split.py
@@ -24,13 +24,13 @@
from scipy.special import comb

from ..utils import (
_approximate_mode,
_safe_indexing,
check_random_state,
indexable,
metadata_routing,
)
from ..utils._param_validation import Interval, RealNotInt, validate_params
from ..utils.extmath import _approximate_mode
from ..utils.metadata_routing import _MetadataRequester
from ..utils.multiclass import type_of_target
from ..utils.validation import _num_samples, check_array, column_or_1d
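For context (not part of the diff): the stratified splitters in `_split.py` call `_approximate_mode` to decide how many samples to take from each class, so only the import location changes here. A rough illustration of that behaviour through the public API, assuming default estimator semantics:

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# 8 samples of class 0 and 4 of class 1; a 50% test split should keep the
# 2:1 class ratio, which is the per-class allocation _approximate_mode computes.
X = np.zeros((12, 1))
y = np.array([0] * 8 + [1] * 4)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
train_idx, test_idx = next(sss.split(X, y))
print(np.bincount(y[test_idx]))  # expected: [4 2]
```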
106 changes: 2 additions & 104 deletions sklearn/utils/__init__.py
@@ -24,6 +24,7 @@
from .class_weight import compute_class_weight, compute_sample_weight
from .deprecation import deprecated
from .discovery import all_estimators
from .extmath import _approximate_mode, safe_sqr
from .fixes import parse_version, threadpool_info
from .murmurhash import murmurhash3_32
from .validation import (
@@ -76,6 +77,7 @@
"estimator_html_repr",
"Bunch",
"metadata_routing",
"safe_sqr",
]

IS_PYPY = platform.python_implementation() == "PyPy"
@@ -778,41 +780,6 @@ def shuffle(*arrays, random_state=None, n_samples=None):
)


def safe_sqr(X, *, copy=True):
"""Element wise squaring of array-likes and sparse matrices.

Parameters
----------
X : {array-like, ndarray, sparse matrix}

copy : bool, default=True
Whether to create a copy of X and operate on it or to perform
inplace computation (default behaviour).

Returns
-------
X ** 2 : element wise square
Return the element-wise square of the input.

Examples
--------
>>> from sklearn.utils import safe_sqr
>>> safe_sqr([1, 2, 3])
array([1, 4, 9])
"""
X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
if issparse(X):
if copy:
X = X.copy()
X.data **= 2
else:
if copy:
X = X**2
else:
X **= 2
return X


def _chunk_generator(gen, chunksize):
"""Chunk generator, ``gen`` into lists of length ``chunksize``. The last
chunk may have a length less than ``chunksize``."""
@@ -1183,72 +1150,3 @@ def is_scalar_nan(x):
and isinstance(x, numbers.Real)
and math.isnan(x)
)


def _approximate_mode(class_counts, n_draws, rng):
"""Computes approximate mode of multivariate hypergeometric.

This is an approximation to the mode of the multivariate
hypergeometric given by class_counts and n_draws.
It shouldn't be off by more than one.

It is the most likely outcome of drawing n_draws many
samples from the population given by class_counts.

Parameters
----------
class_counts : ndarray of int
Population per class.
n_draws : int
Number of draws (samples to draw) from the overall population.
rng : random state
Used to break ties.

Returns
-------
sampled_classes : ndarray of int
Number of samples drawn from each class.
np.sum(sampled_classes) == n_draws

Examples
--------
>>> import numpy as np
>>> from sklearn.utils import _approximate_mode
>>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
array([2, 1])
>>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
array([3, 1])
>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
... n_draws=2, rng=0)
array([0, 1, 1, 0])
>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
... n_draws=2, rng=42)
array([1, 1, 0, 0])
"""
rng = check_random_state(rng)
# this computes a bad approximation to the mode of the
# multivariate hypergeometric given by class_counts and n_draws
continuous = class_counts / class_counts.sum() * n_draws
# floored means we don't overshoot n_samples, but probably undershoot
floored = np.floor(continuous)
# we add samples according to how much "left over" probability
# they had, until we arrive at n_samples
need_to_add = int(n_draws - floored.sum())
if need_to_add > 0:
remainder = continuous - floored
values = np.sort(np.unique(remainder))[::-1]
# add according to remainder, but break ties
# randomly to avoid biases
for value in values:
(inds,) = np.where(remainder == value)
# if we need_to_add less than what's in inds
# we draw randomly from them.
# if we need to add more, we add them all and
# go to the next value
add_now = min(len(inds), need_to_add)
inds = rng.choice(inds, size=add_now, replace=False)
floored[inds] += 1
need_to_add -= add_now
if need_to_add == 0:
break
return floored.astype(int)
107 changes: 105 additions & 2 deletions sklearn/utils/extmath.py
@@ -21,10 +21,9 @@

from ..utils import deprecated
from ..utils._param_validation import Interval, StrOptions, validate_params
from . import check_random_state
from ._array_api import _is_numpy_namespace, device, get_namespace
from .sparsefuncs_fast import csr_row_norms
from .validation import check_array
from .validation import check_array, check_random_state


def squared_norm(x):
@@ -1282,3 +1281,107 @@ def _nanaverage(a, weights=None):
except ZeroDivisionError:
# this is when all weights are zero, then ignore them
return np.average(a)


def safe_sqr(X, *, copy=True):
"""Element wise squaring of array-likes and sparse matrices.

Parameters
----------
X : {array-like, ndarray, sparse matrix}

copy : bool, default=True
Whether to create a copy of X and operate on it or to perform
inplace computation (default behaviour).

Returns
-------
X ** 2 : element wise square
Return the element-wise square of the input.

Examples
--------
>>> from sklearn.utils import safe_sqr
>>> safe_sqr([1, 2, 3])
array([1, 4, 9])
"""
X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
if sparse.issparse(X):
if copy:
X = X.copy()
X.data **= 2
else:
if copy:
X = X**2
else:
X **= 2
return X


def _approximate_mode(class_counts, n_draws, rng):
"""Computes approximate mode of multivariate hypergeometric.

This is an approximation to the mode of the multivariate
hypergeometric given by class_counts and n_draws.
It shouldn't be off by more than one.

It is the most likely outcome of drawing n_draws many
samples from the population given by class_counts.

Parameters
----------
class_counts : ndarray of int
Population per class.
n_draws : int
Number of draws (samples to draw) from the overall population.
rng : random state
Used to break ties.

Returns
-------
sampled_classes : ndarray of int
Number of samples drawn from each class.
np.sum(sampled_classes) == n_draws

Examples
--------
>>> import numpy as np
>>> from sklearn.utils.extmath import _approximate_mode
>>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
array([2, 1])
>>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
array([3, 1])
>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
... n_draws=2, rng=0)
array([0, 1, 1, 0])
>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
... n_draws=2, rng=42)
array([1, 1, 0, 0])
"""
rng = check_random_state(rng)
# this computes a bad approximation to the mode of the
# multivariate hypergeometric given by class_counts and n_draws
continuous = class_counts / class_counts.sum() * n_draws
# floored means we don't overshoot n_samples, but probably undershoot
floored = np.floor(continuous)
# we add samples according to how much "left over" probability
# they had, until we arrive at n_samples
need_to_add = int(n_draws - floored.sum())
if need_to_add > 0:
remainder = continuous - floored
values = np.sort(np.unique(remainder))[::-1]
# add according to remainder, but break ties
# randomly to avoid biases
for value in values:
(inds,) = np.where(remainder == value)
# if we need_to_add less than what's in inds
# we draw randomly from them.
# if we need to add more, we add them all and
# go to the next value
add_now = min(len(inds), need_to_add)
inds = rng.choice(inds, size=add_now, replace=False)
floored[inds] += 1
need_to_add -= add_now
if need_to_add == 0:
break
return floored.astype(int)
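As a quick illustration of why the function moved above is called "safe" (this snippet is illustrative and not part of the PR): `safe_sqr` squares only the stored values of a sparse input, keeping the result sparse instead of densifying it:

```python
import numpy as np
from scipy import sparse
from sklearn.utils.extmath import safe_sqr

X_dense = np.array([[1.0, -2.0], [0.0, 3.0]])
X_csr = sparse.csr_matrix(X_dense)

# Dense input: plain element-wise square.
print(safe_sqr(X_dense))        # [[1. 4.] [0. 9.]]

# Sparse input: sparsity is preserved, only the stored entries are squared.
X_sq = safe_sqr(X_csr)
print(sparse.issparse(X_sq))    # True
print(X_sq.toarray())           # [[1. 4.] [0. 9.]]

# copy=False squares the stored .data buffer in place and returns the same object.
safe_sqr(X_csr, copy=False)
print(X_csr.toarray())          # [[1. 4.] [0. 9.]]
```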
18 changes: 18 additions & 0 deletions sklearn/utils/tests/test_extmath.py
@@ -22,6 +22,7 @@
skip_if_32bit,
)
from sklearn.utils.extmath import (
_approximate_mode,
_deterministic_vector_sign_flip,
_incremental_mean_and_var,
_randomized_eigsh,
@@ -1062,3 +1063,20 @@ def test_safe_sparse_dot_dense_output(dense_output):
if dense_output:
expected = expected.toarray()
assert_allclose_dense_sparse(actual, expected)


def test_approximate_mode():
"""Make sure sklearn.utils.extmath._approximate_mode returns valid
results for cases where "class_counts * n_draws" is enough
to overflow 32-bit signed integer.

Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/20774
"""
X = np.array([99000, 1000], dtype=np.int32)
ret = _approximate_mode(class_counts=X, n_draws=25000, rng=0)

# Draws 25% of the total population, so in this case a fair draw means:
# 25% * 99,000 = 24,750
# 25% * 1,000 = 250
assert_array_equal(ret, [24750, 250])
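The magnitudes in this relocated test are chosen so that `class_counts * n_draws` would not fit in a signed 32-bit integer (99000 * 25000 = 2,475,000,000 > 2**31 - 1), while the expected result is exact because a 25% draw divides both counts evenly. A hand check of the arithmetic, not part of the test file:

```python
import numpy as np

class_counts = np.array([99000, 1000], dtype=np.int32)
n_draws = 25000

# The raw product would overflow int32:
print(int(class_counts[0]) * n_draws)   # 2475000000
print(np.iinfo(np.int32).max)           # 2147483647

# The implementation divides first, so the intermediate lives in float64:
continuous = class_counts / class_counts.sum() * n_draws
print(continuous)                       # [24750.   250.]
```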
18 changes: 0 additions & 18 deletions sklearn/utils/tests/test_utils.py
@@ -11,7 +11,6 @@
from sklearn import config_context
from sklearn.externals._packaging.version import parse as parse_version
from sklearn.utils import (
_approximate_mode,
_determine_key_type,
_get_column_indices,
_is_polars_df,
@@ -706,23 +705,6 @@ def test_is_scalar_nan(value, result):
assert isinstance(is_scalar_nan(value), bool)


def test_approximate_mode():
"""Make sure sklearn.utils._approximate_mode returns valid
results for cases where "class_counts * n_draws" is enough
to overflow 32-bit signed integer.

Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/20774
"""
X = np.array([99000, 1000], dtype=np.int32)
ret = _approximate_mode(class_counts=X, n_draws=25000, rng=0)

# Draws 25% of the total population, so in this case a fair draw means:
# 25% * 99,000 = 24,750
# 25% * 1,000 = 250
assert_array_equal(ret, [24750, 250])


def dummy_func():
pass
