From a8ffed5b3ae8baf0d1641c80dba64c5c67e44c7d Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 10:07:14 -0700 Subject: [PATCH 01/16] Add set_pledged_input_size to ZstdCompressor --- Modules/_zstd/_zstdmodule.c | 3 ++ Modules/_zstd/_zstdmodule.h | 1 + Modules/_zstd/clinic/compressor.c.h | 36 +++++++++++++- Modules/_zstd/compressor.c | 75 +++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/Modules/_zstd/_zstdmodule.c b/Modules/_zstd/_zstdmodule.c index 986b3579479f0f..9ac08eb2a33c44 100644 --- a/Modules/_zstd/_zstdmodule.c +++ b/Modules/_zstd/_zstdmodule.c @@ -35,6 +35,9 @@ set_zstd_error(const _zstd_state* const state, case ERR_COMPRESS: msg = "Unable to compress Zstandard data: %s"; break; + case ERR_SET_PLEDGED_INPUT_SIZE: + msg = "Unable to set pledged uncompressed content size: %s"; + break; case ERR_LOAD_D_DICT: msg = "Unable to load Zstandard dictionary or prefix for " diff --git a/Modules/_zstd/_zstdmodule.h b/Modules/_zstd/_zstdmodule.h index 1f4160f474f0b0..0daba0dd0ca240 100644 --- a/Modules/_zstd/_zstdmodule.h +++ b/Modules/_zstd/_zstdmodule.h @@ -25,6 +25,7 @@ typedef struct { typedef enum { ERR_DECOMPRESS, ERR_COMPRESS, + ERR_SET_PLEDGED_INPUT_SIZE, ERR_LOAD_D_DICT, ERR_LOAD_C_DICT, diff --git a/Modules/_zstd/clinic/compressor.c.h b/Modules/_zstd/clinic/compressor.c.h index f69161b590e5b7..58e73e09f0f390 100644 --- a/Modules/_zstd/clinic/compressor.c.h +++ b/Modules/_zstd/clinic/compressor.c.h @@ -252,4 +252,38 @@ _zstd_ZstdCompressor_flush(PyObject *self, PyObject *const *args, Py_ssize_t nar exit: return return_value; } -/*[clinic end generated code: output=ee2d1dc298de790c input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_zstd_ZstdCompressor_set_pledged_input_size__doc__, +"set_pledged_input_size($self, size, /)\n" +"--\n" +"\n" +"Set the uncompressed content size to be written into the frame header.\n" +"\n" +" size\n" +" The size of the uncompressed data to be provided to the compressor.\n" +"\n" +"This method can be used to ensure the header of the frame about to be written\n" +"includes the size of the data, unless CompressionParameter.content_size_flag is\n" +"set to False. If .last_mode != .FLUSH_FRAME, then a RuntimeError is raised.\n" +"\n" +"It is important to ensure that the pledged data size matches the actual data\n" +"size. If they do not match the compressed output data may be corrupted and the\n" +"final chunk written may be lost."); + +#define _ZSTD_ZSTDCOMPRESSOR_SET_PLEDGED_INPUT_SIZE_METHODDEF \ + {"set_pledged_input_size", (PyCFunction)_zstd_ZstdCompressor_set_pledged_input_size, METH_O, _zstd_ZstdCompressor_set_pledged_input_size__doc__}, + +static PyObject * +_zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, + PyObject *size); + +static PyObject * +_zstd_ZstdCompressor_set_pledged_input_size(PyObject *self, PyObject *size) +{ + PyObject *return_value = NULL; + + return_value = _zstd_ZstdCompressor_set_pledged_input_size_impl((ZstdCompressor *)self, size); + + return return_value; +} +/*[clinic end generated code: output=6ffee5a8c9b54742 input=a9049054013a1b77]*/ diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index 8ff2a3aadc1cd6..fdd235226500d8 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -690,9 +690,84 @@ _zstd_ZstdCompressor_flush_impl(ZstdCompressor *self, int mode) return ret; } + +/*[clinic input] +_zstd.ZstdCompressor.set_pledged_input_size + + size: object + The size of the uncompressed data to be provided to the compressor. + / + +Set the uncompressed content size to be written into the frame header. + +This method can be used to ensure the header of the frame about to be written +includes the size of the data, unless CompressionParameter.content_size_flag is +set to False. If .last_mode != .FLUSH_FRAME, then a RuntimeError is raised. + +It is important to ensure that the pledged data size matches the actual data +size. If they do not match the compressed output data may be corrupted and the +final chunk written may be lost. +[clinic start generated code]*/ + +static PyObject * +_zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, + PyObject *size) +/*[clinic end generated code: output=2fb57610ee586d42 input=272a7881a3526dd1]*/ +{ + uint64_t pledged_size; + size_t zstd_ret; + PyObject *ret; + + /* Get size value */ + if (size == Py_None) { + pledged_size = ZSTD_CONTENTSIZE_UNKNOWN; + } else { + pledged_size = PyLong_AsUnsignedLongLong(size); + if (pledged_size == (uint64_t)-1 && PyErr_Occurred()) { + if (PyErr_ExceptionMatches(PyExc_OverflowError)) { + PyErr_Format(PyExc_ValueError, + "size argument should be a positive int less " + "than %ull", ZSTD_CONTENTSIZE_ERROR); + } + return NULL; + } + } + + /* Thread-safe code */ + PyMutex_Lock(&self->lock); + + /* Check the current mode */ + if (self->last_mode != ZSTD_e_end) { + PyErr_SetString(PyExc_RuntimeError, + "._set_pledged_input_size() method must be called " + "when (.last_mode == .FLUSH_FRAME)."); + goto error; + } + + /* Set pledged content size */ + zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, pledged_size); + if (ZSTD_isError(zstd_ret)) { + _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); + set_zstd_error(mod_state, ERR_SET_PLEDGED_INPUT_SIZE, zstd_ret); + goto error; + } + + /* Return None */ + ret = Py_None; + Py_INCREF(ret); + goto success; + +error: + ret = NULL; +success: + PyMutex_Unlock(&self->lock); + return ret; +} + static PyMethodDef ZstdCompressor_methods[] = { _ZSTD_ZSTDCOMPRESSOR_COMPRESS_METHODDEF _ZSTD_ZSTDCOMPRESSOR_FLUSH_METHODDEF + _ZSTD_ZSTDCOMPRESSOR_SET_PLEDGED_INPUT_SIZE_METHODDEF {NULL, NULL} }; From de534abc67206b5a6a32dc487d6993ff48c5939f Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 10:07:39 -0700 Subject: [PATCH 02/16] Add tests for set_pledged_input_size --- Lib/test/test_zstd.py | 84 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index 014634e450e449..02069321f8a6e8 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -395,6 +395,90 @@ def test_compress_empty(self): c = ZstdCompressor() self.assertNotEqual(c.compress(b'', c.FLUSH_FRAME), b'') + def test_set_pledged_input_size(self): + DAT = DECOMPRESSED_100_PLUS_32KB + CHUNK_SIZE = len(DAT) // 3 + + # wrong value + c = ZstdCompressor() + with self.assertRaisesRegex(ValueError, + r'should be a positive int less than \d+'): + c.set_pledged_input_size(-300) + + # wrong mode + c = ZstdCompressor(level=1) + c.compress(b'123456') + self.assertEqual(c.last_mode, c.CONTINUE) + with self.assertRaisesRegex(RuntimeError, + r'\.last_mode == \.FLUSH_FRAME'): + c.set_pledged_input_size(300) + + # None value + c = ZstdCompressor(level=1) + c.set_pledged_input_size(None) + dat = c.compress(DAT) + c.flush() + + ret = get_frame_info(dat) + self.assertEqual(ret.decompressed_size, None) + + # correct value + c = ZstdCompressor(level=1) + c.set_pledged_input_size(len(DAT)) + + chunks = [] + posi = 0 + while posi < len(DAT): + dat = c.compress(DAT[posi:posi+CHUNK_SIZE]) + posi += CHUNK_SIZE + chunks.append(dat) + + dat = c.flush() + chunks.append(dat) + chunks = b''.join(chunks) + + ret = get_frame_info(chunks) + self.assertEqual(ret.decompressed_size, len(DAT)) + self.assertEqual(decompress(chunks), DAT) + + c.set_pledged_input_size(len(DAT)) # the second frame + dat = c.compress(DAT) + c.flush() + + ret = get_frame_info(dat) + self.assertEqual(ret.decompressed_size, len(DAT)) + self.assertEqual(decompress(dat), DAT) + + # not enough data + c = ZstdCompressor(level=1) + c.set_pledged_input_size(len(DAT)+1) + + for start in range(0, len(DAT), CHUNK_SIZE): + end = min(start+CHUNK_SIZE, len(DAT)) + _dat = c.compress(DAT[start:end]) + + with self.assertRaises(ZstdError): + c.flush() + + # too much data + c = ZstdCompressor(level=1) + c.set_pledged_input_size(len(DAT)) + + for start in range(0, len(DAT), CHUNK_SIZE): + end = min(start+CHUNK_SIZE, len(DAT)) + _dat = c.compress(DAT[start:end]) + + with self.assertRaises(ZstdError): + c.compress(b'extra', ZstdCompressor.FLUSH_FRAME) + + # content size not set if content_size_flag == 0 + c = ZstdCompressor(options={CompressionParameter.content_size_flag: 0}) + c.set_pledged_input_size(10) + dat1 = c.compress(b"hello") + dat2 = c.compress(b"world") + dat3 = c.flush() + frame_data = get_frame_info(dat1 + dat2 + dat3) + self.assertIsNone(frame_data.decompressed_size) + + class DecompressorTestCase(unittest.TestCase): def test_simple_decompress_bad_args(self): From ee3c6c7b44f3bf0e71748296596eed7c506fc76a Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 10:12:01 -0700 Subject: [PATCH 03/16] Add documentation for set_pledged_input_size --- Doc/library/compression.zstd.rst | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index 35bcbc2bfd8eac..d949fde27e2c37 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -247,6 +247,25 @@ Compressing and decompressing data in memory The *mode* argument is a :class:`ZstdCompressor` attribute, either :attr:`~.FLUSH_BLOCK`, or :attr:`~.FLUSH_FRAME`. + .. method:: set_pledged_input_size(size) + + Specify the amount of uncompressed data *size* that will be provided for + the next frame. *size* will be written into the frame header of the next + frame unless :attr:`CompressionParameter.content_size_flag` is ``False`` + or ``0``. A size of ``0`` means that the frame is empty. If *size* is + ``None``, the frame header will record the size as unknown. + + If :attr:`~.last_mode` is not :attr:`~.FLUSH_FRAME`, a + :exc:`RuntimeError` is raised as the compressor is not at the start of + a frame. If the pledged size does not match the actual size of data + provided to :meth:`~.compress`, future calls to :meth:`~.compress` or + :meth:`~.flush` may raise :exc:`ZstdError` and the last chunk of data may + be lost. + + After :meth:`~.flush` or :meth:`~.compress` are called with mode + :attr:`~.FLUSH_FRAME`, the next frame will default to have an unknown + size unless :meth:`!set_pledged_input_size` is called again. + .. attribute:: CONTINUE Collect more data for compression, which may or may not generate output @@ -620,12 +639,17 @@ Advanced parameter control Write the size of the data to be compressed into the Zstandard frame header when known prior to compressing. - This flag only takes effect under the following two scenarios: + This flag only takes effect under the following three scenarios: * Calling :func:`compress` for one-shot compression * Providing all of the data to be compressed in the frame in a single :meth:`ZstdCompressor.compress` call, with the :attr:`ZstdCompressor.FLUSH_FRAME` mode. + * Calling :meth:`ZstdCompressor.set_pledged_input_size` with the exact + amount of data that will be provided to the compressor prior to any + calls to :meth:`ZstdCompressor.compress` for the current frame. + :meth:`!ZstdCompressor.set_pledged_input_size` must be called for each + new frame. All other compression calls may not write the size information into the frame header. From b8c9213aa9cdde45fc049b31f44afe87b8a915d7 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 10:23:02 -0700 Subject: [PATCH 04/16] Document last_mode to be able to reference it --- Doc/library/compression.zstd.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index d949fde27e2c37..89f24883e88223 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -285,6 +285,13 @@ Compressing and decompressing data in memory :meth:`~.compress` will be written into a new frame and *cannot* reference past data. + .. attribute:: last_mode + + The last mode passed to either :meth:`~.compress` or :meth:`~.flush`. + The value can be one of :attr:`~.CONTINUE`, :attr:`~.FLUSH_BLOCK`, or + :attr:`~.FLUSH_FRAME`. The initial value is :attr:`~.FLUSH_FRAME`, + signifying that the compressor is at the start of a new frame. + .. class:: ZstdDecompressor(zstd_dict=None, options=None) From c1cf65dac119419fa5532f08515ec799b45a3de7 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 23:23:48 -0700 Subject: [PATCH 05/16] Refactor set_pledged_input_size argument parsing to use CConverter Also fixup punctuation usage and don't incref Py_None. --- Modules/_zstd/clinic/compressor.c.h | 13 ++++-- Modules/_zstd/compressor.c | 71 +++++++++++++++++++---------- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/Modules/_zstd/clinic/compressor.c.h b/Modules/_zstd/clinic/compressor.c.h index 58e73e09f0f390..6d04205ea91619 100644 --- a/Modules/_zstd/clinic/compressor.c.h +++ b/Modules/_zstd/clinic/compressor.c.h @@ -264,7 +264,7 @@ PyDoc_STRVAR(_zstd_ZstdCompressor_set_pledged_input_size__doc__, "\n" "This method can be used to ensure the header of the frame about to be written\n" "includes the size of the data, unless CompressionParameter.content_size_flag is\n" -"set to False. If .last_mode != .FLUSH_FRAME, then a RuntimeError is raised.\n" +"set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised.\n" "\n" "It is important to ensure that the pledged data size matches the actual data\n" "size. If they do not match the compressed output data may be corrupted and the\n" @@ -275,15 +275,20 @@ PyDoc_STRVAR(_zstd_ZstdCompressor_set_pledged_input_size__doc__, static PyObject * _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, - PyObject *size); + unsigned long long size); static PyObject * -_zstd_ZstdCompressor_set_pledged_input_size(PyObject *self, PyObject *size) +_zstd_ZstdCompressor_set_pledged_input_size(PyObject *self, PyObject *arg) { PyObject *return_value = NULL; + unsigned long long size; + if (!zstd_contentsize_converter(arg, &size)) { + goto exit; + } return_value = _zstd_ZstdCompressor_set_pledged_input_size_impl((ZstdCompressor *)self, size); +exit: return return_value; } -/*[clinic end generated code: output=6ffee5a8c9b54742 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=128a94ce706328e0 input=a9049054013a1b77]*/ diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index fdd235226500d8..643b3070a1320c 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -46,6 +46,43 @@ typedef struct { #define ZstdCompressor_CAST(op) ((ZstdCompressor *)op) +/*[python input] + +class zstd_contentsize_converter(CConverter): + type = 'unsigned long long' + converter = 'zstd_contentsize_converter' + +[python start generated code]*/ +/*[python end generated code: output=da39a3ee5e6b4b0d input=0932c350d633c7de]*/ + +static int +zstd_contentsize_converter(PyObject *size, unsigned long long *p) +{ + // None means the user indicates the size is unknown. + if (size == Py_None) { + *p = ZSTD_CONTENTSIZE_UNKNOWN; + } + else { + /* ZSTD_CONTENTSIZE_UNKNOWN is 0ULL - 1 + ZSTD_CONTENTSIZE_ERROR is 0ULL - 2 + Users should only pass values <= ZSTD_CONTENTSIZE_ERROR + If they wish to pass ZSTD_CONTENTSIZE_UNKNOWN, they need to pass + None (see above). */ + unsigned long long pledged_size = PyLong_AsUnsignedLongLong(size); + if (pledged_size == (unsigned long long)-1 && PyErr_Occurred()) { + *p = ZSTD_CONTENTSIZE_ERROR; + if (PyErr_ExceptionMatches(PyExc_OverflowError)) { + PyErr_Format(PyExc_ValueError, + "size argument should be a positive int less " + "than %ull", ZSTD_CONTENTSIZE_ERROR); + } + return 0; + } + *p = pledged_size; + } + return 1; +} + #include "clinic/compressor.c.h" static int @@ -694,7 +731,7 @@ _zstd_ZstdCompressor_flush_impl(ZstdCompressor *self, int mode) /*[clinic input] _zstd.ZstdCompressor.set_pledged_input_size - size: object + size: zstd_contentsize The size of the uncompressed data to be provided to the compressor. / @@ -702,7 +739,7 @@ Set the uncompressed content size to be written into the frame header. This method can be used to ensure the header of the frame about to be written includes the size of the data, unless CompressionParameter.content_size_flag is -set to False. If .last_mode != .FLUSH_FRAME, then a RuntimeError is raised. +set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised. It is important to ensure that the pledged data size matches the actual data size. If they do not match the compressed output data may be corrupted and the @@ -711,27 +748,14 @@ final chunk written may be lost. static PyObject * _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, - PyObject *size) -/*[clinic end generated code: output=2fb57610ee586d42 input=272a7881a3526dd1]*/ + unsigned long long size) +/*[clinic end generated code: output=3a09e55cc0e3b4f9 input=563b9a1ddd4facc3]*/ { - uint64_t pledged_size; size_t zstd_ret; PyObject *ret; - /* Get size value */ - if (size == Py_None) { - pledged_size = ZSTD_CONTENTSIZE_UNKNOWN; - } else { - pledged_size = PyLong_AsUnsignedLongLong(size); - if (pledged_size == (uint64_t)-1 && PyErr_Occurred()) { - if (PyErr_ExceptionMatches(PyExc_OverflowError)) { - PyErr_Format(PyExc_ValueError, - "size argument should be a positive int less " - "than %ull", ZSTD_CONTENTSIZE_ERROR); - } - return NULL; - } - } + // Error occured while converting argument, should be unreachable + assert(size != ZSTD_CONTENTSIZE_ERROR); /* Thread-safe code */ PyMutex_Lock(&self->lock); @@ -739,24 +763,21 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, /* Check the current mode */ if (self->last_mode != ZSTD_e_end) { PyErr_SetString(PyExc_RuntimeError, - "._set_pledged_input_size() method must be called " - "when (.last_mode == .FLUSH_FRAME)."); + "set_pledged_input_size() method must be called " + "when (last_mode == FLUSH_FRAME)"); goto error; } /* Set pledged content size */ - zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, pledged_size); + zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, size); if (ZSTD_isError(zstd_ret)) { _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); set_zstd_error(mod_state, ERR_SET_PLEDGED_INPUT_SIZE, zstd_ret); goto error; } - /* Return None */ ret = Py_None; - Py_INCREF(ret); goto success; - error: ret = NULL; success: From 450c5049375915e5fec9e85e7012bc307d444679 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 23:30:08 -0700 Subject: [PATCH 06/16] Remove parens from error message --- Modules/_zstd/compressor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index 643b3070a1320c..22183b192ee861 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -764,7 +764,7 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, if (self->last_mode != ZSTD_e_end) { PyErr_SetString(PyExc_RuntimeError, "set_pledged_input_size() method must be called " - "when (last_mode == FLUSH_FRAME)"); + "when last_mode == FLUSH_FRAME"); goto error; } From 0dafae5fd2c5ef7e9611e850cd03109ac32967f1 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 1 Jun 2025 23:39:20 -0700 Subject: [PATCH 07/16] Fix tests --- Lib/test/test_zstd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index 02069321f8a6e8..ee2f46b842bfda 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -410,7 +410,7 @@ def test_set_pledged_input_size(self): c.compress(b'123456') self.assertEqual(c.last_mode, c.CONTINUE) with self.assertRaisesRegex(RuntimeError, - r'\.last_mode == \.FLUSH_FRAME'): + r'last_mode == FLUSH_FRAME'): c.set_pledged_input_size(300) # None value From d4b322919c97a428c4a941d4d571da96e92f1f1f Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Mon, 2 Jun 2025 18:15:30 -0700 Subject: [PATCH 08/16] Refactor zstd_contentsize_converter to handle all values --- Lib/test/test_zstd.py | 25 +++++++++++++++++++++++++ Modules/_zstd/compressor.c | 38 ++++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index ee2f46b842bfda..4a6f2164c25151 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -404,6 +404,31 @@ def test_set_pledged_input_size(self): with self.assertRaisesRegex(ValueError, r'should be a positive int less than \d+'): c.set_pledged_input_size(-300) + # overflow + with self.assertRaisesRegex(ValueError, + r'should be a positive int less than \d+'): + c.set_pledged_input_size(2**64) + # ZSTD_CONTENTSIZE_ERROR is invalid + with self.assertRaisesRegex(ValueError, + r'should be a positive int less than \d+'): + c.set_pledged_input_size(2**64-2) + # ZSTD_CONTENTSIZE_UNKNOWN should use None + with self.assertRaisesRegex(ValueError, + r'should be a positive int less than \d+'): + c.set_pledged_input_size(2**64-1) + + # check valid values are settable + c.set_pledged_input_size(2**63) + c.set_pledged_input_size(2**64-3) + + # check that zero means empty frame + c = ZstdCompressor(level=1) + c.set_pledged_input_size(0) + c.compress(b'') + dat = c.flush() + ret = get_frame_info(dat) + self.assertEqual(ret.decompressed_size, 0) + # wrong mode c = ZstdCompressor(level=1) diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index 22183b192ee861..9536a38735826b 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -55,6 +55,15 @@ class zstd_contentsize_converter(CConverter): [python start generated code]*/ /*[python end generated code: output=da39a3ee5e6b4b0d input=0932c350d633c7de]*/ +static inline int +raise_wrong_contentsize(void) +{ + PyErr_Format(PyExc_ValueError, + "size argument should be a positive int less " + "than %ull", ZSTD_CONTENTSIZE_ERROR); + return 0; +} + static int zstd_contentsize_converter(PyObject *size, unsigned long long *p) { @@ -65,19 +74,21 @@ zstd_contentsize_converter(PyObject *size, unsigned long long *p) else { /* ZSTD_CONTENTSIZE_UNKNOWN is 0ULL - 1 ZSTD_CONTENTSIZE_ERROR is 0ULL - 2 - Users should only pass values <= ZSTD_CONTENTSIZE_ERROR - If they wish to pass ZSTD_CONTENTSIZE_UNKNOWN, they need to pass - None (see above). */ + Users should only pass values < ZSTD_CONTENTSIZE_ERROR */ unsigned long long pledged_size = PyLong_AsUnsignedLongLong(size); + /* Here we check for (unsigned long long)-1 as a sign of an error in + PyLong_AsUnsignedLongLong */ if (pledged_size == (unsigned long long)-1 && PyErr_Occurred()) { *p = ZSTD_CONTENTSIZE_ERROR; if (PyErr_ExceptionMatches(PyExc_OverflowError)) { - PyErr_Format(PyExc_ValueError, - "size argument should be a positive int less " - "than %ull", ZSTD_CONTENTSIZE_ERROR); + return raise_wrong_contentsize(); } return 0; } + if (pledged_size >= ZSTD_CONTENTSIZE_ERROR) { + *p = ZSTD_CONTENTSIZE_ERROR; + return raise_wrong_contentsize(); + } *p = pledged_size; } return 1; @@ -752,7 +763,6 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, /*[clinic end generated code: output=3a09e55cc0e3b4f9 input=563b9a1ddd4facc3]*/ { size_t zstd_ret; - PyObject *ret; // Error occured while converting argument, should be unreachable assert(size != ZSTD_CONTENTSIZE_ERROR); @@ -765,24 +775,20 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, PyErr_SetString(PyExc_RuntimeError, "set_pledged_input_size() method must be called " "when last_mode == FLUSH_FRAME"); - goto error; + PyMutex_Unlock(&self->lock); + return NULL; } /* Set pledged content size */ zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, size); + PyMutex_Unlock(&self->lock); if (ZSTD_isError(zstd_ret)) { _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); set_zstd_error(mod_state, ERR_SET_PLEDGED_INPUT_SIZE, zstd_ret); - goto error; + return NULL; } - ret = Py_None; - goto success; -error: - ret = NULL; -success: - PyMutex_Unlock(&self->lock); - return ret; + Py_RETURN_NONE; } static PyMethodDef ZstdCompressor_methods[] = { From f59718eb2ab91c295756012c6ca95d76d4a4f63c Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Mon, 2 Jun 2025 18:20:50 -0700 Subject: [PATCH 09/16] Correct doc about handling of unknown size and fix refs --- Doc/library/compression.zstd.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index 89f24883e88223..5427318baadcc1 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -253,18 +253,18 @@ Compressing and decompressing data in memory the next frame. *size* will be written into the frame header of the next frame unless :attr:`CompressionParameter.content_size_flag` is ``False`` or ``0``. A size of ``0`` means that the frame is empty. If *size* is - ``None``, the frame header will record the size as unknown. + ``None``, the frame header will omit the frame size. - If :attr:`~.last_mode` is not :attr:`~.FLUSH_FRAME`, a + If :attr:`last_mode` is not :attr:`FLUSH_FRAME`, a :exc:`RuntimeError` is raised as the compressor is not at the start of a frame. If the pledged size does not match the actual size of data provided to :meth:`~.compress`, future calls to :meth:`~.compress` or - :meth:`~.flush` may raise :exc:`ZstdError` and the last chunk of data may + :meth:`flush` may raise :exc:`ZstdError` and the last chunk of data may be lost. - After :meth:`~.flush` or :meth:`~.compress` are called with mode - :attr:`~.FLUSH_FRAME`, the next frame will default to have an unknown - size unless :meth:`!set_pledged_input_size` is called again. + After :meth:`flush` or :meth:`~.compress` are called with mode + :attr:`FLUSH_FRAME`, the next frame will not include the frame size into + the header unless :meth:`!set_pledged_input_size` is called again. .. attribute:: CONTINUE From 0e40f5517d191288cb2c1627c5726afcaf23f9db Mon Sep 17 00:00:00 2001 From: Emma Smith Date: Tue, 3 Jun 2025 18:29:26 -0700 Subject: [PATCH 10/16] Apply suggestions from code review Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Doc/library/compression.zstd.rst | 2 +- Modules/_zstd/compressor.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index 5427318baadcc1..bf113f210a631c 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -646,7 +646,7 @@ Advanced parameter control Write the size of the data to be compressed into the Zstandard frame header when known prior to compressing. - This flag only takes effect under the following three scenarios: + This flag only takes effect under the following scenarios: * Calling :func:`compress` for one-shot compression * Providing all of the data to be compressed in the frame in a single diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index 9536a38735826b..8b96edfb7eb255 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -749,8 +749,8 @@ _zstd.ZstdCompressor.set_pledged_input_size Set the uncompressed content size to be written into the frame header. This method can be used to ensure the header of the frame about to be written -includes the size of the data, unless CompressionParameter.content_size_flag is -set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised. +includes the size of the data, unless the CompressionParameter.content_size_flag +is set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised. It is important to ensure that the pledged data size matches the actual data size. If they do not match the compressed output data may be corrupted and the @@ -762,8 +762,6 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, unsigned long long size) /*[clinic end generated code: output=3a09e55cc0e3b4f9 input=563b9a1ddd4facc3]*/ { - size_t zstd_ret; - // Error occured while converting argument, should be unreachable assert(size != ZSTD_CONTENTSIZE_ERROR); @@ -780,7 +778,7 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, } /* Set pledged content size */ - zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, size); + size_t zstd_ret = ZSTD_CCtx_setPledgedSrcSize(self->cctx, size); PyMutex_Unlock(&self->lock); if (ZSTD_isError(zstd_ret)) { _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); From db0ace3e54f95e91bef60025ce833049844e4cac Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Tue, 3 Jun 2025 18:32:46 -0700 Subject: [PATCH 11/16] Eliminate function for raising a value error --- Modules/_zstd/compressor.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index 9536a38735826b..a9ce2fb9d9d73a 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -55,14 +55,6 @@ class zstd_contentsize_converter(CConverter): [python start generated code]*/ /*[python end generated code: output=da39a3ee5e6b4b0d input=0932c350d633c7de]*/ -static inline int -raise_wrong_contentsize(void) -{ - PyErr_Format(PyExc_ValueError, - "size argument should be a positive int less " - "than %ull", ZSTD_CONTENTSIZE_ERROR); - return 0; -} static int zstd_contentsize_converter(PyObject *size, unsigned long long *p) @@ -81,13 +73,19 @@ zstd_contentsize_converter(PyObject *size, unsigned long long *p) if (pledged_size == (unsigned long long)-1 && PyErr_Occurred()) { *p = ZSTD_CONTENTSIZE_ERROR; if (PyErr_ExceptionMatches(PyExc_OverflowError)) { - return raise_wrong_contentsize(); + PyErr_Format(PyExc_ValueError, + "size argument should be a positive int less " + "than %ull", ZSTD_CONTENTSIZE_ERROR); + return 0; } return 0; } if (pledged_size >= ZSTD_CONTENTSIZE_ERROR) { *p = ZSTD_CONTENTSIZE_ERROR; - return raise_wrong_contentsize(); + PyErr_Format(PyExc_ValueError, + "size argument should be a positive int less " + "than %ull", ZSTD_CONTENTSIZE_ERROR); + return 0; } *p = pledged_size; } From 3605e11ede393d28df53786102d2744237abd7ff Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Tue, 3 Jun 2025 18:36:15 -0700 Subject: [PATCH 12/16] Switch from RuntimeError to ValueError --- Lib/test/test_zstd.py | 2 +- Modules/_zstd/compressor.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index 4a6f2164c25151..72f04659479251 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -434,7 +434,7 @@ def test_set_pledged_input_size(self): c = ZstdCompressor(level=1) c.compress(b'123456') self.assertEqual(c.last_mode, c.CONTINUE) - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(ValueError, r'last_mode == FLUSH_FRAME'): c.set_pledged_input_size(300) diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index a9ce2fb9d9d73a..b08546393fb714 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -770,7 +770,7 @@ _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, /* Check the current mode */ if (self->last_mode != ZSTD_e_end) { - PyErr_SetString(PyExc_RuntimeError, + PyErr_SetString(PyExc_ValueError, "set_pledged_input_size() method must be called " "when last_mode == FLUSH_FRAME"); PyMutex_Unlock(&self->lock); From 4d3c00543fd11a19d9314ff9431f7518a6f0088b Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Tue, 3 Jun 2025 18:43:59 -0700 Subject: [PATCH 13/16] Add motivation for pledging size --- Doc/library/compression.zstd.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index 5427318baadcc1..09b7a57d99dffd 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -253,7 +253,9 @@ Compressing and decompressing data in memory the next frame. *size* will be written into the frame header of the next frame unless :attr:`CompressionParameter.content_size_flag` is ``False`` or ``0``. A size of ``0`` means that the frame is empty. If *size* is - ``None``, the frame header will omit the frame size. + ``None``, the frame header will omit the frame size. Frames that include + the uncompressed data size require less memory to decompress, especially + at higher compression levels. If :attr:`last_mode` is not :attr:`FLUSH_FRAME`, a :exc:`RuntimeError` is raised as the compressor is not at the start of From 6b52d4576b6bcafbe5d31d6ab03ee4c585c801ae Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Tue, 3 Jun 2025 18:44:49 -0700 Subject: [PATCH 14/16] s/RuntimeError/ValueError/ --- Doc/library/compression.zstd.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index 09b7a57d99dffd..a310a21270361e 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -258,7 +258,7 @@ Compressing and decompressing data in memory at higher compression levels. If :attr:`last_mode` is not :attr:`FLUSH_FRAME`, a - :exc:`RuntimeError` is raised as the compressor is not at the start of + :exc:`ValueError` is raised as the compressor is not at the start of a frame. If the pledged size does not match the actual size of data provided to :meth:`~.compress`, future calls to :meth:`~.compress` or :meth:`flush` may raise :exc:`ZstdError` and the last chunk of data may From fe78bac21bb6fb09f751152fbdd599ad43db39aa Mon Sep 17 00:00:00 2001 From: Emma Smith Date: Tue, 3 Jun 2025 18:48:09 -0700 Subject: [PATCH 15/16] Apply suggestions from code review Co-authored-by: Serhiy Storchaka --- Doc/library/compression.zstd.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index bf113f210a631c..4c30cf02d6975b 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -258,11 +258,11 @@ Compressing and decompressing data in memory If :attr:`last_mode` is not :attr:`FLUSH_FRAME`, a :exc:`RuntimeError` is raised as the compressor is not at the start of a frame. If the pledged size does not match the actual size of data - provided to :meth:`~.compress`, future calls to :meth:`~.compress` or + provided to :meth:`.compress`, future calls to :meth:`!compress` or :meth:`flush` may raise :exc:`ZstdError` and the last chunk of data may be lost. - After :meth:`flush` or :meth:`~.compress` are called with mode + After :meth:`flush` or :meth:`.compress` are called with mode :attr:`FLUSH_FRAME`, the next frame will not include the frame size into the header unless :meth:`!set_pledged_input_size` is called again. From 6c5c0877190f9c235bc323ef3280509fdad347ee Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Tue, 3 Jun 2025 18:50:28 -0700 Subject: [PATCH 16/16] Update clinic files --- Modules/_zstd/clinic/compressor.c.h | 6 +++--- Modules/_zstd/compressor.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Modules/_zstd/clinic/compressor.c.h b/Modules/_zstd/clinic/compressor.c.h index 6d04205ea91619..4f8d93fd9e867c 100644 --- a/Modules/_zstd/clinic/compressor.c.h +++ b/Modules/_zstd/clinic/compressor.c.h @@ -263,8 +263,8 @@ PyDoc_STRVAR(_zstd_ZstdCompressor_set_pledged_input_size__doc__, " The size of the uncompressed data to be provided to the compressor.\n" "\n" "This method can be used to ensure the header of the frame about to be written\n" -"includes the size of the data, unless CompressionParameter.content_size_flag is\n" -"set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised.\n" +"includes the size of the data, unless the CompressionParameter.content_size_flag\n" +"is set to False. If last_mode != FLUSH_FRAME, then a RuntimeError is raised.\n" "\n" "It is important to ensure that the pledged data size matches the actual data\n" "size. If they do not match the compressed output data may be corrupted and the\n" @@ -291,4 +291,4 @@ _zstd_ZstdCompressor_set_pledged_input_size(PyObject *self, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=128a94ce706328e0 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=c1d5c2cf06a8becd input=a9049054013a1b77]*/ diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index ba9a61dc7960fa..4a3da9f4278244 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -758,7 +758,7 @@ final chunk written may be lost. static PyObject * _zstd_ZstdCompressor_set_pledged_input_size_impl(ZstdCompressor *self, unsigned long long size) -/*[clinic end generated code: output=3a09e55cc0e3b4f9 input=563b9a1ddd4facc3]*/ +/*[clinic end generated code: output=3a09e55cc0e3b4f9 input=afd8a7d78cff2eb5]*/ { // Error occured while converting argument, should be unreachable assert(size != ZSTD_CONTENTSIZE_ERROR);