From 617e06403d65b743ff7f2ffe83bd8cf9bb0accb8 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 18 Oct 2021 13:47:27 +0200 Subject: [PATCH 1/3] Check gzip headers for corrupted fields --- Lib/gzip.py | 53 +++++++++++++++---- Lib/test/test_gzip.py | 30 +++++++++++ .../2021-10-18-13-46-55.bpo-45509.Upwb60.rst | 1 + 3 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-10-18-13-46-55.bpo-45509.Upwb60.rst diff --git a/Lib/gzip.py b/Lib/gzip.py index 6773ea3eef0971..693ac425d8d5f3 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -426,29 +426,60 @@ def _read_gzip_header(fp): if magic != b'\037\213': raise BadGzipFile('Not a gzipped file (%r)' % magic) - - (method, flag, last_mtime) = struct.unpack(" Date: Wed, 24 Nov 2021 10:48:00 +0100 Subject: [PATCH 2/3] Minor performance tweaks to _read_gzip_header Call the bool method and cache the result for faster truth checking. Do not test for empty bytes but use "not magic" instead for faster truth checking. --- Lib/gzip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 693ac425d8d5f3..132719a7c6959a 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -421,7 +421,7 @@ def _read_gzip_header(fp): Returns last mtime if header was present or None otherwise. ''' magic = fp.read(2) - if magic == b'': + if not magic: return None if magic != b'\037\213': @@ -432,7 +432,7 @@ def _read_gzip_header(fp): raise BadGzipFile('Unknown compression method') # FHCRC will be checked often. So save the result of the check. - fhcrc = flag & FHCRC + fhcrc = bool(flag & FHCRC) # Only create and append to a list of header parts when FHCRC is set. # In the most common use cases FHCRC is not set. So we optimize for those # cases. From e68e76ede3cad99c094531ccf0596ad70bf1c883 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 24 Nov 2021 11:42:40 +0100 Subject: [PATCH 3/3] Optimize _read_gzip_header for the most common code paths Those are: + Only FNAME set. (Created by gzip and python's GzipFile) + No flags set. (Created by gzip.compress and zlib.compress with wbits=31) --- Lib/gzip.py | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 132719a7c6959a..171b8ba77c0d1a 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -431,48 +431,58 @@ def _read_gzip_header(fp): if method != 8: raise BadGzipFile('Unknown compression method') - # FHCRC will be checked often. So save the result of the check. - fhcrc = bool(flag & FHCRC) - # Only create and append to a list of header parts when FHCRC is set. - # In the most common use cases FHCRC is not set. So we optimize for those - # cases. - if fhcrc: - header_parts = [magic, base_header] + # No flags. No need for further parsing. These headers are returned by + # gzip.compress or zlib.compress(..., wbits=31) + if not flag: + return last_mtime + # Most gzip files will have only FNAME set. For example: produced by gzip + # command line application or python's GzipFile. + if flag == FNAME: + while True: + s = fp.read(1) + if not s: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + if s == b'\000': + break + return last_mtime + + # Processing for more complex flags. + + # Save header parts for FHCRC checking + header_parts = [magic, base_header] if flag & FEXTRA: - # Read the extra field, if present, save the fields if FHCRC is set. + # Read the extra field, if present, save the fields for FHCRC checking. extra_len_bytes = _read_exact(fp, 2) extra_len, = struct.unpack("