diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 493b7f40389cb..db14ecb22cb89 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -21,7 +21,8 @@ from ..metrics import pairwise_distances from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS from ..neighbors import NearestNeighbors -from ..utils import gen_batches, get_chunk_n_rows +from ..utils import gen_batches +from ..utils._chunking import get_chunk_n_rows from ..utils._param_validation import ( HasMethods, Interval, diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 73613f835ad19..5c427441012d6 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -10,7 +10,7 @@ from ..externals import _arff from ..externals._arff import ArffSparseDataType -from ..utils import _chunk_generator, get_chunk_n_rows +from ..utils._chunking import chunk_generator, get_chunk_n_rows from ..utils._optional_dependencies import check_pandas_support from ..utils.fixes import pd_fillna @@ -192,7 +192,7 @@ def _io_to_generator(gzip_file): # read arff data with chunks columns_to_keep = [col for col in columns_names if col in columns_to_select] dfs = [first_df[columns_to_keep]] - for data in _chunk_generator(arff_container["data"], chunksize): + for data in chunk_generator(arff_container["data"], chunksize): dfs.append( pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] ) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 870bd9cfd9b6c..480d1f2d3e4ef 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -16,8 +16,8 @@ check_array, check_random_state, gen_batches, - get_chunk_n_rows, ) +from ..utils._chunking import get_chunk_n_rows from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils.validation import _num_samples, check_is_fitted from ._bagging import BaseBagging diff --git 
a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4da6529436b94..7b09bfd70a2fd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -24,8 +24,8 @@ check_array, gen_batches, gen_even_slices, - get_chunk_n_rows, ) +from ..utils._chunking import get_chunk_n_rows from ..utils._mask import _get_mask from ..utils._missing import is_scalar_nan from ..utils._param_validation import ( diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a67166cadb9fc..9a2481393271a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -14,12 +14,12 @@ import numpy as np from scipy.sparse import issparse -from .. import get_config from ..exceptions import DataConversionWarning from . import _joblib, metadata_routing from ._bunch import Bunch +from ._chunking import gen_batches, gen_even_slices from ._estimator_html_repr import estimator_html_repr -from ._param_validation import Integral, Interval, validate_params +from ._param_validation import Interval, validate_params from .class_weight import compute_class_weight, compute_sample_weight from .deprecation import deprecated from .discovery import all_estimators @@ -76,6 +76,8 @@ "Bunch", "metadata_routing", "safe_sqr", + "gen_batches", + "gen_even_slices", ] IS_PYPY = platform.python_implementation() == "PyPy" @@ -745,132 +747,6 @@ def shuffle(*arrays, random_state=None, n_samples=None): ) -def _chunk_generator(gen, chunksize): - """Chunk generator, ``gen`` into lists of length ``chunksize``. 
The last - chunk may have a length less than ``chunksize``.""" - while True: - chunk = list(islice(gen, chunksize)) - if chunk: - yield chunk - else: - return - - -@validate_params( - { - "n": [Interval(numbers.Integral, 1, None, closed="left")], - "batch_size": [Interval(numbers.Integral, 1, None, closed="left")], - "min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")], - }, - prefer_skip_nested_validation=True, -) -def gen_batches(n, batch_size, *, min_batch_size=0): - """Generator to create slices containing `batch_size` elements from 0 to `n`. - - The last slice may contain less than `batch_size` elements, when - `batch_size` does not divide `n`. - - Parameters - ---------- - n : int - Size of the sequence. - batch_size : int - Number of elements in each batch. - min_batch_size : int, default=0 - Minimum number of elements in each batch. - - Yields - ------ - slice of `batch_size` elements - - See Also - -------- - gen_even_slices: Generator to create n_packs slices going up to n. 
- - Examples - -------- - >>> from sklearn.utils import gen_batches - >>> list(gen_batches(7, 3)) - [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] - >>> list(gen_batches(6, 3)) - [slice(0, 3, None), slice(3, 6, None)] - >>> list(gen_batches(2, 3)) - [slice(0, 2, None)] - >>> list(gen_batches(7, 3, min_batch_size=0)) - [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] - >>> list(gen_batches(7, 3, min_batch_size=2)) - [slice(0, 3, None), slice(3, 7, None)] - """ - start = 0 - for _ in range(int(n // batch_size)): - end = start + batch_size - if end + min_batch_size > n: - continue - yield slice(start, end) - start = end - if start < n: - yield slice(start, n) - - -@validate_params( - { - "n": [Interval(Integral, 1, None, closed="left")], - "n_packs": [Interval(Integral, 1, None, closed="left")], - "n_samples": [Interval(Integral, 1, None, closed="left"), None], - }, - prefer_skip_nested_validation=True, -) -def gen_even_slices(n, n_packs, *, n_samples=None): - """Generator to create `n_packs` evenly spaced slices going up to `n`. - - If `n_packs` does not divide `n`, except for the first `n % n_packs` - slices, remaining slices may contain fewer elements. - - Parameters - ---------- - n : int - Size of the sequence. - n_packs : int - Number of slices to generate. - n_samples : int, default=None - Number of samples. Pass `n_samples` when the slices are to be used for - sparse matrix indexing; slicing off-the-end raises an exception, while - it works for NumPy arrays. - - Yields - ------ - `slice` representing a set of indices from 0 to n. - - See Also - -------- - gen_batches: Generator to create slices containing batch_size elements - from 0 to n. 
- - Examples - -------- - >>> from sklearn.utils import gen_even_slices - >>> list(gen_even_slices(10, 1)) - [slice(0, 10, None)] - >>> list(gen_even_slices(10, 10)) - [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] - >>> list(gen_even_slices(10, 5)) - [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] - >>> list(gen_even_slices(10, 3)) - [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] - """ - start = 0 - for pack_num in range(n_packs): - this_n = n // n_packs - if pack_num < n % n_packs: - this_n += 1 - if this_n > 0: - end = start + this_n - if n_samples is not None: - end = min(n_samples, end) - yield slice(start, end, None) - start = end - - def tosequence(x): """Cast iterable x to a Sequence, avoiding a copy if possible. @@ -1012,44 +888,3 @@ def _print_elapsed_time(source, message=None): start = timeit.default_timer() yield print(_message_with_time(source, message, timeit.default_timer() - start)) - - -def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): - """Calculate how many rows can be processed within `working_memory`. - - Parameters - ---------- - row_bytes : int - The expected number of bytes of memory that will be consumed - during the processing of each row. - max_n_rows : int, default=None - The maximum return value. - working_memory : int or float, default=None - The number of rows to fit inside this number of MiB will be - returned. When None (default), the value of - ``sklearn.get_config()['working_memory']`` is used. - - Returns - ------- - int - The number of rows which can be processed within `working_memory`. - - Warns - ----- - Issues a UserWarning if `row_bytes exceeds `working_memory` MiB. 
- """ - - if working_memory is None: - working_memory = get_config()["working_memory"] - - chunk_n_rows = int(working_memory * (2**20) // row_bytes) - if max_n_rows is not None: - chunk_n_rows = min(chunk_n_rows, max_n_rows) - if chunk_n_rows < 1: - warnings.warn( - "Could not adhere to working_memory config. " - "Currently %.0fMiB, %.0fMiB required." - % (working_memory, np.ceil(row_bytes * 2**-20)) - ) - chunk_n_rows = 1 - return chunk_n_rows diff --git a/sklearn/utils/_chunking.py b/sklearn/utils/_chunking.py new file mode 100644 index 0000000000000..7bf53d0626c85 --- /dev/null +++ b/sklearn/utils/_chunking.py @@ -0,0 +1,175 @@ +import warnings +from itertools import islice +from numbers import Integral + +import numpy as np + +from .._config import get_config +from ._param_validation import Interval, validate_params + + +def chunk_generator(gen, chunksize): + """Chunk generator, ``gen`` into lists of length ``chunksize``. The last + chunk may have a length less than ``chunksize``.""" + while True: + chunk = list(islice(gen, chunksize)) + if chunk: + yield chunk + else: + return + + +@validate_params( + { + "n": [Interval(Integral, 1, None, closed="left")], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "min_batch_size": [Interval(Integral, 0, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def gen_batches(n, batch_size, *, min_batch_size=0): + """Generator to create slices containing `batch_size` elements from 0 to `n`. + + The last slice may contain less than `batch_size` elements, when + `batch_size` does not divide `n`. + + Parameters + ---------- + n : int + Size of the sequence. + batch_size : int + Number of elements in each batch. + min_batch_size : int, default=0 + Minimum number of elements in each batch. + + Yields + ------ + slice of `batch_size` elements + + See Also + -------- + gen_even_slices: Generator to create n_packs slices going up to n. 
+ + Examples + -------- + >>> from sklearn.utils import gen_batches + >>> list(gen_batches(7, 3)) + [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] + >>> list(gen_batches(6, 3)) + [slice(0, 3, None), slice(3, 6, None)] + >>> list(gen_batches(2, 3)) + [slice(0, 2, None)] + >>> list(gen_batches(7, 3, min_batch_size=0)) + [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] + >>> list(gen_batches(7, 3, min_batch_size=2)) + [slice(0, 3, None), slice(3, 7, None)] + """ + start = 0 + for _ in range(int(n // batch_size)): + end = start + batch_size + if end + min_batch_size > n: + continue + yield slice(start, end) + start = end + if start < n: + yield slice(start, n) + + +@validate_params( + { + "n": [Interval(Integral, 1, None, closed="left")], + "n_packs": [Interval(Integral, 1, None, closed="left")], + "n_samples": [Interval(Integral, 1, None, closed="left"), None], + }, + prefer_skip_nested_validation=True, +) +def gen_even_slices(n, n_packs, *, n_samples=None): + """Generator to create `n_packs` evenly spaced slices going up to `n`. + + If `n_packs` does not divide `n`, except for the first `n % n_packs` + slices, remaining slices may contain fewer elements. + + Parameters + ---------- + n : int + Size of the sequence. + n_packs : int + Number of slices to generate. + n_samples : int, default=None + Number of samples. Pass `n_samples` when the slices are to be used for + sparse matrix indexing; slicing off-the-end raises an exception, while + it works for NumPy arrays. + + Yields + ------ + `slice` representing a set of indices from 0 to n. + + See Also + -------- + gen_batches: Generator to create slices containing batch_size elements + from 0 to n. 
+ + Examples + -------- + >>> from sklearn.utils import gen_even_slices + >>> list(gen_even_slices(10, 1)) + [slice(0, 10, None)] + >>> list(gen_even_slices(10, 10)) + [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] + >>> list(gen_even_slices(10, 5)) + [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] + >>> list(gen_even_slices(10, 3)) + [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] + """ + start = 0 + for pack_num in range(n_packs): + this_n = n // n_packs + if pack_num < n % n_packs: + this_n += 1 + if this_n > 0: + end = start + this_n + if n_samples is not None: + end = min(n_samples, end) + yield slice(start, end, None) + start = end + + +def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): + """Calculate how many rows can be processed within `working_memory`. + + Parameters + ---------- + row_bytes : int + The expected number of bytes of memory that will be consumed + during the processing of each row. + max_n_rows : int, default=None + The maximum return value. + working_memory : int or float, default=None + The number of rows to fit inside this number of MiB will be + returned. When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + Returns + ------- + int + The number of rows which can be processed within `working_memory`. + + Warns + ----- + Issues a UserWarning if `row_bytes` exceeds `working_memory` MiB. + """ + + if working_memory is None: + working_memory = get_config()["working_memory"] + + chunk_n_rows = int(working_memory * (2**20) // row_bytes) + if max_n_rows is not None: + chunk_n_rows = min(chunk_n_rows, max_n_rows) + if chunk_n_rows < 1: + warnings.warn( + "Could not adhere to working_memory config. " + "Currently %.0fMiB, %.0fMiB required." 
+ % (working_memory, np.ceil(row_bytes * 2**-20)) + ) + chunk_n_rows = 1 + return chunk_n_rows diff --git a/sklearn/utils/tests/test_chunking.py b/sklearn/utils/tests/test_chunking.py new file mode 100644 index 0000000000000..10c7ed17a0c2d --- /dev/null +++ b/sklearn/utils/tests/test_chunking.py @@ -0,0 +1,73 @@ +import warnings +from itertools import chain + +import pytest + +from sklearn import config_context +from sklearn.utils._chunking import gen_even_slices, get_chunk_n_rows +from sklearn.utils._testing import assert_array_equal + + +def test_gen_even_slices(): + # check that gen_even_slices contains all samples + some_range = range(10) + joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)])) + assert_array_equal(some_range, joined_range) + + +@pytest.mark.parametrize( + ("row_bytes", "max_n_rows", "working_memory", "expected"), + [ + (1024, None, 1, 1024), + (1024, None, 0.99999999, 1023), + (1023, None, 1, 1025), + (1025, None, 1, 1023), + (1024, None, 2, 2048), + (1024, 7, 1, 7), + (1024 * 1024, None, 1, 1), + ], +) +def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected): + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + actual = get_chunk_n_rows( + row_bytes=row_bytes, + max_n_rows=max_n_rows, + working_memory=working_memory, + ) + + assert actual == expected + assert type(actual) is type(expected) + with config_context(working_memory=working_memory): + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows) + assert actual == expected + assert type(actual) is type(expected) + + +def test_get_chunk_n_rows_warns(): + """Check that warning is raised when working_memory is too low.""" + row_bytes = 1024 * 1024 + 1 + max_n_rows = None + working_memory = 1 + expected = 1 + + warn_msg = ( + "Could not adhere to working_memory config. Currently 1MiB, 2MiB required." 
+ ) + with pytest.warns(UserWarning, match=warn_msg): + actual = get_chunk_n_rows( + row_bytes=row_bytes, + max_n_rows=max_n_rows, + working_memory=working_memory, + ) + + assert actual == expected + assert type(actual) is type(expected) + + with config_context(working_memory=working_memory): + with pytest.warns(UserWarning, match=warn_msg): + actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows) + assert actual == expected + assert type(actual) is type(expected) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 0a19d2a11b144..d87a4a57c6ac2 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -2,13 +2,11 @@ import timeit import warnings from copy import copy -from itertools import chain from unittest import SkipTest import numpy as np import pytest -from sklearn import config_context from sklearn.externals._packaging.version import parse as parse_version from sklearn.utils import ( _determine_key_type, @@ -22,8 +20,6 @@ check_random_state, column_or_1d, deprecated, - gen_even_slices, - get_chunk_n_rows, resample, safe_mask, shuffle, @@ -558,71 +554,6 @@ def test_shuffle_dont_convert_to_array(csc_container): assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]])) -def test_gen_even_slices(): - # check that gen_even_slices contains all samples - some_range = range(10) - joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)])) - assert_array_equal(some_range, joined_range) - - -@pytest.mark.parametrize( - ("row_bytes", "max_n_rows", "working_memory", "expected"), - [ - (1024, None, 1, 1024), - (1024, None, 0.99999999, 1023), - (1023, None, 1, 1025), - (1025, None, 1, 1023), - (1024, None, 2, 2048), - (1024, 7, 1, 7), - (1024 * 1024, None, 1, 1), - ], -) -def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected): - with warnings.catch_warnings(): - warnings.simplefilter("error", UserWarning) - actual = get_chunk_n_rows( 
- row_bytes=row_bytes, - max_n_rows=max_n_rows, - working_memory=working_memory, - ) - - assert actual == expected - assert type(actual) is type(expected) - with config_context(working_memory=working_memory): - with warnings.catch_warnings(): - warnings.simplefilter("error", UserWarning) - actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows) - assert actual == expected - assert type(actual) is type(expected) - - -def test_get_chunk_n_rows_warns(): - """Check that warning is raised when working_memory is too low.""" - row_bytes = 1024 * 1024 + 1 - max_n_rows = None - working_memory = 1 - expected = 1 - - warn_msg = ( - "Could not adhere to working_memory config. Currently 1MiB, 2MiB required." - ) - with pytest.warns(UserWarning, match=warn_msg): - actual = get_chunk_n_rows( - row_bytes=row_bytes, - max_n_rows=max_n_rows, - working_memory=working_memory, - ) - - assert actual == expected - assert type(actual) is type(expected) - - with config_context(working_memory=working_memory): - with pytest.warns(UserWarning, match=warn_msg): - actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows) - assert actual == expected - assert type(actual) is type(expected) - - @pytest.mark.parametrize( ["source", "message", "is_long"], [