From 8ee67d4347703873edd7358f9a41ddfa6c5e13fd Mon Sep 17 00:00:00 2001 From: rmorotti Date: Mon, 10 Mar 2025 18:54:21 +0000 Subject: [PATCH 1/2] gh-117151: optimize algorithm to grow the buffer size for readall() on files --- Modules/_io/fileio.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 54e5270f8161d6..88ada6e786e2cd 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -43,17 +43,8 @@ # include #endif -#if BUFSIZ < (8*1024) -# define SMALLCHUNK (8*1024) -#elif (BUFSIZ >= (2 << 25)) -# error "unreasonable BUFSIZ > 64 MiB defined" -#else -# define SMALLCHUNK BUFSIZ -#endif - -/* Size at which a buffer is considered "large" and behavior should change to - avoid excessive memory allocation */ -#define LARGE_BUFFER_CUTOFF_SIZE 65536 +#define LARGE_BUFFER_CUTOFF_SIZE (4096*1024) +#define SMALL_BUFFER_SIZE (128*1024) /*[clinic input] module _io @@ -709,16 +700,20 @@ new_buffersize(fileio *self, size_t currentsize) size_t addend; /* Expand the buffer by an amount proportional to the current size, - giving us amortized linear-time behavior. For bigger sizes, use a - less-than-double growth factor to avoid excessive allocation. */ + giving us amortized linear-time behavior. This heuristic is only used + when the file size was unknown or changed since the file was opened. + For smaller sizes, use exponential growth to avoid many small reads. + For bigger sizes, use a less-than-double growth factor to avoid + excessive allocation. + */ assert(currentsize <= PY_SSIZE_T_MAX); if (currentsize > LARGE_BUFFER_CUTOFF_SIZE) addend = currentsize >> 3; else - addend = 256 + currentsize; - if (addend < SMALLCHUNK) + addend = 3 * currentsize; + if (addend < SMALL_BUFFER_SIZE) /* Avoid tiny read() calls. 
*/ - addend = SMALLCHUNK; + addend = SMALL_BUFFER_SIZE; return addend + currentsize; } @@ -743,7 +738,6 @@ _io_FileIO_readall_impl(fileio *self) Py_ssize_t bytes_read = 0; Py_ssize_t n; size_t bufsize; - if (self->fd < 0) { return err_closed(); } @@ -756,7 +750,7 @@ _io_FileIO_readall_impl(fileio *self) } if (end <= 0) { /* Use a default size and resize as needed. */ - bufsize = SMALLCHUNK; + bufsize = SMALL_BUFFER_SIZE; } else { /* This is probably a real file. */ @@ -777,7 +771,7 @@ _io_FileIO_readall_impl(fileio *self) then calls readall() to get the rest, which would result in allocating more than required. Guard against that for larger files where we expect the I/O time to dominate anyways while keeping small files fast. */ - if (bufsize > LARGE_BUFFER_CUTOFF_SIZE) { + if (bufsize > SMALL_BUFFER_SIZE) { Py_BEGIN_ALLOW_THREADS _Py_BEGIN_SUPPRESS_IPH #ifdef MS_WINDOWS From 25d3cb7edc8078d47ec238b1b3babf6bed746002 Mon Sep 17 00:00:00 2001 From: rmorotti Date: Tue, 11 Mar 2025 10:57:58 +0000 Subject: [PATCH 2/2] gh-117151: add news item --- .../Library/2025-03-11-10-57-36.gh-issue-131052.QuKA1H.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-131052.QuKA1H.rst diff --git a/Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-131052.QuKA1H.rst b/Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-131052.QuKA1H.rst new file mode 100644 index 00000000000000..88eafc2ef679e8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-131052.QuKA1H.rst @@ -0,0 +1,4 @@ +Optimize the algorithm that grows the buffer when reading a whole file whose +size was unknown or has changed since the file was opened. The buffer now +grows exponentially, faster than before, and in steps no smaller than +128 KiB. This should improve I/O performance.