From 8ee67d4347703873edd7358f9a41ddfa6c5e13fd Mon Sep 17 00:00:00 2001
From: rmorotti <romain.morotti@man.com>
Date: Mon, 10 Mar 2025 18:54:21 +0000
Subject: [PATCH 1/2] gh-117151: optimize algorithm to grow the buffer size for
 readall() on files

---
 Modules/_io/fileio.c | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c
index 54e5270f8161d6..88ada6e786e2cd 100644
--- a/Modules/_io/fileio.c
+++ b/Modules/_io/fileio.c
@@ -43,17 +43,8 @@
 #  include <windows.h>
 #endif
 
-#if BUFSIZ < (8*1024)
-#  define SMALLCHUNK (8*1024)
-#elif (BUFSIZ >= (2 << 25))
-#  error "unreasonable BUFSIZ > 64 MiB defined"
-#else
-#  define SMALLCHUNK BUFSIZ
-#endif
-
-/* Size at which a buffer is considered "large" and behavior should change to
-   avoid excessive memory allocation */
-#define LARGE_BUFFER_CUTOFF_SIZE 65536
+#define LARGE_BUFFER_CUTOFF_SIZE (4096*1024)
+#define SMALL_BUFFER_SIZE (128*1024)
 
 /*[clinic input]
 module _io
@@ -709,16 +700,20 @@ new_buffersize(fileio *self, size_t currentsize)
     size_t addend;
 
     /* Expand the buffer by an amount proportional to the current size,
-       giving us amortized linear-time behavior.  For bigger sizes, use a
-       less-than-double growth factor to avoid excessive allocation. */
+       giving us amortized linear-time behavior. This heuristic is only used
+       when the file size is unknown or has changed since the file was opened.
+       Up to LARGE_BUFFER_CUTOFF_SIZE, quadruple the buffer to avoid many
+       small reads. Beyond that, grow by one eighth to avoid excessive
+       allocation.
+    */
     assert(currentsize <= PY_SSIZE_T_MAX);
     if (currentsize > LARGE_BUFFER_CUTOFF_SIZE)
         addend = currentsize >> 3;
     else
-        addend = 256 + currentsize;
-    if (addend < SMALLCHUNK)
+        addend = 3 * currentsize;
+    if (addend < SMALL_BUFFER_SIZE)
         /* Avoid tiny read() calls. */
-        addend = SMALLCHUNK;
+        addend = SMALL_BUFFER_SIZE;
     return addend + currentsize;
 }
 
@@ -743,7 +738,6 @@ _io_FileIO_readall_impl(fileio *self)
     Py_ssize_t bytes_read = 0;
     Py_ssize_t n;
     size_t bufsize;
-
     if (self->fd < 0) {
         return err_closed();
     }
@@ -756,7 +750,7 @@ _io_FileIO_readall_impl(fileio *self)
     }
     if (end <= 0) {
         /* Use a default size and resize as needed. */
-        bufsize = SMALLCHUNK;
+        bufsize = SMALL_BUFFER_SIZE;
     }
     else {
         /* This is probably a real file. */
@@ -777,7 +771,7 @@ _io_FileIO_readall_impl(fileio *self)
            then calls readall() to get the rest, which would result in allocating
            more than required. Guard against that for larger files where we expect
            the I/O time to dominate anyways while keeping small files fast. */
-        if (bufsize > LARGE_BUFFER_CUTOFF_SIZE) {
+        if (bufsize > SMALL_BUFFER_SIZE) {
             Py_BEGIN_ALLOW_THREADS
             _Py_BEGIN_SUPPRESS_IPH
 #ifdef MS_WINDOWS

From 25d3cb7edc8078d47ec238b1b3babf6bed746002 Mon Sep 17 00:00:00 2001
From: rmorotti <romain.morotti@man.com>
Date: Tue, 11 Mar 2025 10:57:58 +0000
Subject: [PATCH 2/2] gh-117151: add news item

---
 .../Library/2025-03-11-10-57-36.gh-issue-117151.QuKA1H.rst    | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-117151.QuKA1H.rst

diff --git a/Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-117151.QuKA1H.rst b/Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-117151.QuKA1H.rst
new file mode 100644
index 00000000000000..88eafc2ef679e8
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-03-11-10-57-36.gh-issue-117151.QuKA1H.rst
@@ -0,0 +1,4 @@
+Optimize the algorithm that grows the buffer when reading a full file whose
+size is unknown or has changed since the file was opened. The buffer now
+grows exponentially, and faster than before, in steps no smaller than
+128 KiB. This should improve I/O performance.