diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 93898a5114191..3f88285117d6b 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -24,13 +24,13 @@ from scipy.special import comb from ..utils import ( - _approximate_mode, _safe_indexing, check_random_state, indexable, metadata_routing, ) from ..utils._param_validation import Interval, RealNotInt, validate_params +from ..utils.extmath import _approximate_mode from ..utils.metadata_routing import _MetadataRequester from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples, check_array, column_or_1d diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 53bb376ea93d6..e359a50c63aab 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -24,6 +24,7 @@ from .class_weight import compute_class_weight, compute_sample_weight from .deprecation import deprecated from .discovery import all_estimators +from .extmath import _approximate_mode, safe_sqr from .fixes import parse_version, threadpool_info from .murmurhash import murmurhash3_32 from .validation import ( @@ -76,6 +77,7 @@ "estimator_html_repr", "Bunch", "metadata_routing", + "safe_sqr", ] IS_PYPY = platform.python_implementation() == "PyPy" @@ -778,41 +780,6 @@ def shuffle(*arrays, random_state=None, n_samples=None): ) -def safe_sqr(X, *, copy=True): - """Element wise squaring of array-likes and sparse matrices. - - Parameters - ---------- - X : {array-like, ndarray, sparse matrix} - - copy : bool, default=True - Whether to create a copy of X and operate on it or to perform - inplace computation (default behaviour). - - Returns - ------- - X ** 2 : element wise square - Return the element-wise square of the input. - - Examples - -------- - >>> from sklearn.utils import safe_sqr - >>> safe_sqr([1, 2, 3]) - array([1, 4, 9]) - """ - X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False) - if issparse(X): - if copy: - X = X.copy() - X.data **= 2 - else: - if copy: - X = X**2 - else: - X **= 2 - return X - - def _chunk_generator(gen, chunksize): """Chunk generator, ``gen`` into lists of length ``chunksize``. The last chunk may have a length less than ``chunksize``.""" @@ -1183,72 +1150,3 @@ def is_scalar_nan(x): and isinstance(x, numbers.Real) and math.isnan(x) ) - - -def _approximate_mode(class_counts, n_draws, rng): - """Computes approximate mode of multivariate hypergeometric. - - This is an approximation to the mode of the multivariate - hypergeometric given by class_counts and n_draws. - It shouldn't be off by more than one. - - It is the mostly likely outcome of drawing n_draws many - samples from the population given by class_counts. - - Parameters - ---------- - class_counts : ndarray of int - Population per class. - n_draws : int - Number of draws (samples to draw) from the overall population. - rng : random state - Used to break ties. - - Returns - ------- - sampled_classes : ndarray of int - Number of samples drawn from each class. - np.sum(sampled_classes) == n_draws - - Examples - -------- - >>> import numpy as np - >>> from sklearn.utils import _approximate_mode - >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0) - array([2, 1]) - >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0) - array([3, 1]) - >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), - ... n_draws=2, rng=0) - array([0, 1, 1, 0]) - >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), - ... n_draws=2, rng=42) - array([1, 1, 0, 0]) - """ - rng = check_random_state(rng) - # this computes a bad approximation to the mode of the - # multivariate hypergeometric given by class_counts and n_draws - continuous = class_counts / class_counts.sum() * n_draws - # floored means we don't overshoot n_samples, but probably undershoot - floored = np.floor(continuous) - # we add samples according to how much "left over" probability - # they had, until we arrive at n_samples - need_to_add = int(n_draws - floored.sum()) - if need_to_add > 0: - remainder = continuous - floored - values = np.sort(np.unique(remainder))[::-1] - # add according to remainder, but break ties - # randomly to avoid biases - for value in values: - (inds,) = np.where(remainder == value) - # if we need_to_add less than what's in inds - # we draw randomly from them. - # if we need to add more, we add them all and - # go to the next value - add_now = min(len(inds), need_to_add) - inds = rng.choice(inds, size=add_now, replace=False) - floored[inds] += 1 - need_to_add -= add_now - if need_to_add == 0: - break - return floored.astype(int) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 975b3458692b9..372ebc4f6b8b5 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -21,10 +21,9 @@ from ..utils import deprecated from ..utils._param_validation import Interval, StrOptions, validate_params -from . import check_random_state from ._array_api import _is_numpy_namespace, device, get_namespace from .sparsefuncs_fast import csr_row_norms -from .validation import check_array +from .validation import check_array, check_random_state def squared_norm(x): @@ -1282,3 +1281,107 @@ def _nanaverage(a, weights=None): except ZeroDivisionError: # this is when all weights are zero, then ignore them return np.average(a) + + +def safe_sqr(X, *, copy=True): + """Element wise squaring of array-likes and sparse matrices. + + Parameters + ---------- + X : {array-like, ndarray, sparse matrix} + + copy : bool, default=True + Whether to create a copy of X and operate on it or to perform + inplace computation (default behaviour). + + Returns + ------- + X ** 2 : element wise square + Return the element-wise square of the input. + + Examples + -------- + >>> from sklearn.utils import safe_sqr + >>> safe_sqr([1, 2, 3]) + array([1, 4, 9]) + """ + X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False) + if sparse.issparse(X): + if copy: + X = X.copy() + X.data **= 2 + else: + if copy: + X = X**2 + else: + X **= 2 + return X + + +def _approximate_mode(class_counts, n_draws, rng): + """Computes approximate mode of multivariate hypergeometric. + + This is an approximation to the mode of the multivariate + hypergeometric given by class_counts and n_draws. + It shouldn't be off by more than one. + + It is the mostly likely outcome of drawing n_draws many + samples from the population given by class_counts. + + Parameters + ---------- + class_counts : ndarray of int + Population per class. + n_draws : int + Number of draws (samples to draw) from the overall population. + rng : random state + Used to break ties. + + Returns + ------- + sampled_classes : ndarray of int + Number of samples drawn from each class. + np.sum(sampled_classes) == n_draws + + Examples + -------- + >>> import numpy as np + >>> from sklearn.utils.extmath import _approximate_mode + >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0) + array([2, 1]) + >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0) + array([3, 1]) + >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), + ... n_draws=2, rng=0) + array([0, 1, 1, 0]) + >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), + ... n_draws=2, rng=42) + array([1, 1, 0, 0]) + """ + rng = check_random_state(rng) + # this computes a bad approximation to the mode of the + # multivariate hypergeometric given by class_counts and n_draws + continuous = class_counts / class_counts.sum() * n_draws + # floored means we don't overshoot n_samples, but probably undershoot + floored = np.floor(continuous) + # we add samples according to how much "left over" probability + # they had, until we arrive at n_samples + need_to_add = int(n_draws - floored.sum()) + if need_to_add > 0: + remainder = continuous - floored + values = np.sort(np.unique(remainder))[::-1] + # add according to remainder, but break ties + # randomly to avoid biases + for value in values: + (inds,) = np.where(remainder == value) + # if we need_to_add less than what's in inds + # we draw randomly from them. + # if we need to add more, we add them all and + # go to the next value + add_now = min(len(inds), need_to_add) + inds = rng.choice(inds, size=add_now, replace=False) + floored[inds] += 1 + need_to_add -= add_now + if need_to_add == 0: + break + return floored.astype(int) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index fc2eab70f007b..c167a7e9d8f59 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -22,6 +22,7 @@ skip_if_32bit, ) from sklearn.utils.extmath import ( + _approximate_mode, _deterministic_vector_sign_flip, _incremental_mean_and_var, _randomized_eigsh, @@ -1062,3 +1063,20 @@ def test_safe_sparse_dot_dense_output(dense_output): if dense_output: expected = expected.toarray() assert_allclose_dense_sparse(actual, expected) + + +def test_approximate_mode(): + """Make sure sklearn.utils.extmath._approximate_mode returns valid + results for cases where "class_counts * n_draws" is enough + to overflow 32-bit signed integer. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20774 + """ + X = np.array([99000, 1000], dtype=np.int32) + ret = _approximate_mode(class_counts=X, n_draws=25000, rng=0) + + # Draws 25% of the total population, so in this case a fair draw means: + # 25% * 99.000 = 24.750 + # 25% * 1.000 = 250 + assert_array_equal(ret, [24750, 250]) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 20b3fdc600a8c..f3d9f81b372ec 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -11,7 +11,6 @@ from sklearn import config_context from sklearn.externals._packaging.version import parse as parse_version from sklearn.utils import ( - _approximate_mode, _determine_key_type, _get_column_indices, _is_polars_df, @@ -706,23 +705,6 @@ def test_is_scalar_nan(value, result): assert isinstance(is_scalar_nan(value), bool) -def test_approximate_mode(): - """Make sure sklearn.utils._approximate_mode returns valid - results for cases where "class_counts * n_draws" is enough - to overflow 32-bit signed integer. - - Non-regression test for: - https://github.com/scikit-learn/scikit-learn/issues/20774 - """ - X = np.array([99000, 1000], dtype=np.int32) - ret = _approximate_mode(class_counts=X, n_draws=25000, rng=0) - - # Draws 25% of the total population, so in this case a fair draw means: - # 25% * 99.000 = 24.750 - # 25% * 1.000 = 250 - assert_array_equal(ret, [24750, 250]) - - def dummy_func(): pass