From e96dd968643cb9264ab3302b7b6fe1e4d8c80b34 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Wed, 15 Mar 2023 18:57:42 +0900 Subject: [PATCH 1/2] Refactor zlib and add wbits to zlib.compress() --- extra_tests/snippets/stdlib_zlib.py | 5 +- stdlib/src/zlib.rs | 186 +++++++++++++++++----------- 2 files changed, 115 insertions(+), 76 deletions(-) diff --git a/extra_tests/snippets/stdlib_zlib.py b/extra_tests/snippets/stdlib_zlib.py index 42847704a6..308a8d23bf 100644 --- a/extra_tests/snippets/stdlib_zlib.py +++ b/extra_tests/snippets/stdlib_zlib.py @@ -48,8 +48,9 @@ b"x\xda\xf3\xc9/J\xcdU\xc8,(.\xcdUH\xc9\xcf\xc9/R(\xce,QH\xccM-\x01\x00\x83\xd5\t\xc5", ] -for level, text in enumerate(compressed_lorem_list): - assert zlib.compress(lorem, level) == text +for level, expected in enumerate(compressed_lorem_list): + actual = zlib.compress(lorem, level) + assert actual == expected # default level assert zlib.compress(lorem) == zlib.compress(lorem, -1) == zlib.compress(lorem, 6) diff --git a/stdlib/src/zlib.rs b/stdlib/src/zlib.rs index 80e2a71edd..2f41ffa4e9 100644 --- a/stdlib/src/zlib.rs +++ b/stdlib/src/zlib.rs @@ -5,7 +5,7 @@ mod zlib { use crate::vm::{ builtins::{PyBaseExceptionRef, PyBytes, PyBytesRef, PyIntRef, PyTypeRef}, common::lock::PyMutex, - function::{ArgBytesLike, ArgPrimitiveIndex, ArgSize, OptionalArg, OptionalOption}, + function::{ArgBytesLike, ArgPrimitiveIndex, ArgSize, OptionalArg}, PyPayload, PyResult, VirtualMachine, }; use adler32::RollingAdler32 as Adler32; @@ -47,7 +47,7 @@ mod zlib { // copied from zlibmodule.c (commit 530f506ac91338) #[pyattr] - const MAX_WBITS: u8 = 15; + const MAX_WBITS: i8 = 15; #[pyattr] const DEF_BUF_SIZE: usize = 16 * 1024; #[pyattr] @@ -78,8 +78,9 @@ mod zlib { crate::binascii::crc32(data, begin_state) } - fn compression_from_int(level: Option) -> Option { - match level.unwrap_or(Z_DEFAULT_COMPRESSION) { + // TODO: rewrite with TryFromBorrowedObject + fn compression_from_int(level: i32) -> Option { + match level { Z_DEFAULT_COMPRESSION => Some(Compression::default()), valid_level @ Z_NO_COMPRESSION..=Z_BEST_COMPRESSION => { Some(Compression::new(valid_level as u32)) @@ -92,23 +93,33 @@ mod zlib { struct PyFuncCompressArgs { #[pyarg(positional)] data: ArgBytesLike, - #[pyarg(any, optional)] - level: OptionalOption, + #[pyarg(any, default = "Z_DEFAULT_COMPRESSION")] + level: i32, + #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")] + wbits: ArgPrimitiveIndex, } /// Returns a bytes object containing compressed data. #[pyfunction] fn compress(args: PyFuncCompressArgs, vm: &VirtualMachine) -> PyResult { - let data = args.data; - let level = args.level; + let PyFuncCompressArgs { + data, + level, + ref wbits, + } = args; - let compression = compression_from_int(level.flatten()) + let level = compression_from_int(level) .ok_or_else(|| new_zlib_error("Bad compression level", vm))?; - let mut encoder = ZlibEncoder::new(Vec::new(), compression); - data.with_ref(|input_bytes| encoder.write_all(input_bytes).unwrap()); - let encoded_bytes = encoder.finish().unwrap(); - + let encoded_bytes = if args.wbits.value == MAX_WBITS { + let mut encoder = ZlibEncoder::new(Vec::new(), level); + data.with_ref(|input_bytes| encoder.write_all(input_bytes).unwrap()); + encoder.finish().unwrap() + } else { + let mut inner = CompressInner::new(InitOptions::new(wbits.value, vm)?.compress(level)); + data.with_ref(|input_bytes| inner.compress(input_bytes, vm))?; + inner.flush(vm)? + }; Ok(vm.ctx.new_bytes(encoded_bytes)) } @@ -125,6 +136,21 @@ mod zlib { } impl InitOptions { + fn new(wbits: i8, vm: &VirtualMachine) -> PyResult { + let header = wbits > 0; + let wbits = wbits.unsigned_abs(); + match wbits { + 9..=15 => Ok(InitOptions::Standard { + header, + #[cfg(feature = "zlib")] + wbits, + }), + #[cfg(feature = "zlib")] + 25..=31 => Ok(InitOptions::Gzip { wbits: wbits - 16 }), + _ => Err(vm.new_value_error("Invalid initialization option".to_owned())), + } + } + fn decompress(self) -> Decompress { match self { #[cfg(not(feature = "zlib"))] @@ -149,22 +175,6 @@ mod zlib { } } - fn header_from_wbits(wbits: OptionalArg, vm: &VirtualMachine) -> PyResult { - let wbits = wbits.unwrap_or(MAX_WBITS as i8); - let header = wbits > 0; - let wbits = wbits.unsigned_abs(); - match wbits { - 9..=15 => Ok(InitOptions::Standard { - header, - #[cfg(feature = "zlib")] - wbits, - }), - #[cfg(feature = "zlib")] - 25..=31 => Ok(InitOptions::Gzip { wbits: wbits - 16 }), - _ => Err(vm.new_value_error("Invalid initialization option".to_owned())), - } - } - fn _decompress( mut data: &[u8], d: &mut Decompress, @@ -232,43 +242,55 @@ mod zlib { struct PyFuncDecompressArgs { #[pyarg(positional)] data: ArgBytesLike, - #[pyarg(any, optional)] - wbits: OptionalArg>, - #[pyarg(any, optional)] - bufsize: OptionalArg>, + #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")] + wbits: ArgPrimitiveIndex, + #[pyarg(any, default = "ArgPrimitiveIndex { value: DEF_BUF_SIZE }")] + bufsize: ArgPrimitiveIndex, } /// Returns a bytes object containing the uncompressed data. #[pyfunction] - fn decompress(arg: PyFuncDecompressArgs, vm: &VirtualMachine) -> PyResult> { - let data = arg.data; - let wbits = arg.wbits; - let bufsize = arg.bufsize; + fn decompress(args: PyFuncDecompressArgs, vm: &VirtualMachine) -> PyResult> { + let PyFuncDecompressArgs { + data, + wbits, + bufsize, + } = args; data.with_ref(|data| { - let bufsize = bufsize.into_primitive().unwrap_or(DEF_BUF_SIZE); - - let mut d = header_from_wbits(wbits.into_primitive(), vm)?.decompress(); + let mut d = InitOptions::new(wbits.value, vm)?.decompress(); - _decompress(data, &mut d, bufsize, None, false, vm).and_then(|(buf, stream_end)| { - if stream_end { - Ok(buf) - } else { - Err(new_zlib_error( - "Error -5 while decompressing data: incomplete or truncated stream", - vm, - )) - } - }) + _decompress(data, &mut d, bufsize.value, None, false, vm).and_then( + |(buf, stream_end)| { + if stream_end { + Ok(buf) + } else { + Err(new_zlib_error( + "Error -5 while decompressing data: incomplete or truncated stream", + vm, + )) + } + }, + ) }) } + #[derive(FromArgs)] + struct DecompressobjArgs { + #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")] + wbits: ArgPrimitiveIndex, + #[cfg(feature = "zlib")] + #[pyarg(any, optional)] + _zdict: OptionalArg, + } + #[pyfunction] fn decompressobj(args: DecompressobjArgs, vm: &VirtualMachine) -> PyResult { #[allow(unused_mut)] - let mut decompress = header_from_wbits(args.wbits.into_primitive(), vm)?.decompress(); + let mut decompress = InitOptions::new(args.wbits.value, vm)?.decompress(); #[cfg(feature = "zlib")] - if let OptionalArg::Present(dict) = args.zdict { - dict.with_ref(|d| decompress.set_dictionary(d).unwrap()); + if let OptionalArg::Present(_dict) = args._zdict { + // FIXME: always fails + // dict.with_ref(|d| decompress.set_dictionary(d)); } Ok(PyDecompress { decompress: PyMutex::new(decompress), @@ -407,34 +429,44 @@ mod zlib { } #[derive(FromArgs)] - struct DecompressobjArgs { - #[pyarg(any, optional)] - wbits: OptionalArg>, + #[allow(dead_code)] // FIXME: use args + struct CompressobjArgs { + #[pyarg(any, default = "Z_DEFAULT_COMPRESSION")] + level: i32, + // only DEFLATED is valid right now, it's w/e + #[pyarg(any, default = "DEFLATED")] + _method: i32, + #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")] + wbits: ArgPrimitiveIndex, + #[pyarg(any, name = "_memLevel", default = "DEF_MEM_LEVEL")] + _mem_level: u8, + #[cfg(feature = "zlib")] + #[pyarg(any, default = "Z_DEFAULT_STRATEGY")] + _strategy: i32, #[cfg(feature = "zlib")] #[pyarg(any, optional)] - zdict: OptionalArg, + zdict: Option, } #[pyfunction] - fn compressobj( - level: OptionalArg, - // only DEFLATED is valid right now, it's w/e - _method: OptionalArg, - wbits: OptionalArg>, - // these aren't used. - _mem_level: OptionalArg, // this is memLevel in CPython - _strategy: OptionalArg, - _zdict: OptionalArg, - vm: &VirtualMachine, - ) -> PyResult { - let level = compression_from_int(level.into_option()) + fn compressobj(args: CompressobjArgs, vm: &VirtualMachine) -> PyResult { + let CompressobjArgs { + level, + wbits, + #[cfg(feature = "zlib")] + zdict, + .. + } = args; + let level = compression_from_int(level) .ok_or_else(|| vm.new_value_error("invalid initialization option".to_owned()))?; - let compress = header_from_wbits(wbits.into_primitive(), vm)?.compress(level); + #[allow(unused_mut)] + let mut compress = InitOptions::new(wbits.value, vm)?.compress(level); + #[cfg(feature = "zlib")] + if let Some(zdict) = zdict { + zdict.with_ref(|zdict| compress.set_dictionary(zdict).unwrap()); + } Ok(PyCompress { - inner: PyMutex::new(CompressInner { - compress, - unconsumed: Vec::new(), - }), + inner: PyMutex::new(CompressInner::new(compress)), }) } @@ -477,6 +509,12 @@ mod zlib { const CHUNKSIZE: usize = u32::MAX as usize; impl CompressInner { + fn new(compress: Compress) -> Self { + Self { + compress, + unconsumed: Vec::new(), + } + } fn compress(&mut self, data: &[u8], vm: &VirtualMachine) -> PyResult> { let orig_in = self.compress.total_in() as usize; let mut cur_in = 0; From 7e0863ef8125f298eaa839f4a2691c526688421b Mon Sep 17 00:00:00 2001 From: tdub0 <43589631+tdub0@users.noreply.github.com> Date: Mon, 13 Mar 2023 09:34:20 -0700 Subject: [PATCH 2/2] Update _compression, gzip from CPython v3.11.2 --- Lib/_compression.py | 12 ++- Lib/gzip.py | 165 ++++++++++++++++++++++++++++-------------- Lib/test/test_gzip.py | 21 +++++- 3 files changed, 143 insertions(+), 55 deletions(-) diff --git a/Lib/_compression.py b/Lib/_compression.py index b00f31b400..e8b70aa0a3 100644 --- a/Lib/_compression.py +++ b/Lib/_compression.py @@ -1,7 +1,7 @@ """Internal classes used by the gzip, lzma and bz2 modules""" import io - +import sys BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size @@ -110,6 +110,16 @@ def read(self, size=-1): self._pos += len(data) return data + def readall(self): + chunks = [] + # sys.maxsize means the max length of output buffer is unlimited, + # so that the whole input buffer can be decompressed within one + # .decompress() call. + while data := self.read(sys.maxsize): + chunks.append(data) + + return b"".join(chunks) + # Rewind the file to the beginning of the data stream. def _rewind(self): self._fp.seek(0) diff --git a/Lib/gzip.py b/Lib/gzip.py index 475ec326c0..5b20e5ba69 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -399,6 +399,59 @@ def readline(self, size=-1): return self._buffer.readline(size) +def _read_exact(fp, n): + '''Read exactly *n* bytes from `fp` + + This method is required because fp may be unbuffered, + i.e. return short reads. + ''' + data = fp.read(n) + while len(data) < n: + b = fp.read(n - len(data)) + if not b: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + data += b + return data + + +def _read_gzip_header(fp): + '''Read a gzip header from `fp` and progress to the end of the header. + + Returns last mtime if header was present or None otherwise. + ''' + magic = fp.read(2) + if magic == b'': + return None + + if magic != b'\037\213': + raise BadGzipFile('Not a gzipped file (%r)' % magic) + + (method, flag, last_mtime) = struct.unpack(" bytes: + """ + Write a simple gzip header with no extra fields. + :param compresslevel: Compresslevel used to determine the xfl bytes. + :param mtime: The mtime (must support conversion to a 32-bit integer). + :return: A bytes object representing the gzip header. + """ + if mtime is None: + mtime = time.time() + if compresslevel == _COMPRESS_LEVEL_BEST: + xfl = 2 + elif compresslevel == _COMPRESS_LEVEL_FAST: + xfl = 4 + else: + xfl = 0 + # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra + # fields added to header), mtime, xfl and os (255 for unknown OS). + return struct.pack("