Check gzip headers for corrupted fields

rhpvorderman · rhpvorderman · commit 617e06403d65 · 2021-11-23T07:28:28.000+01:00
diff --git a/Lib/gzip.py b/Lib/gzip.py
@@ -426,29 +426,60 @@ def _read_gzip_header(fp):
 
     if magic != b'\037\213':
         raise BadGzipFile('Not a gzipped file (%r)' % magic)
-
-    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
+    base_header = _read_exact(fp, 8)
+    (method, flag, last_mtime) = struct.unpack("<BBIxx", base_header)
     if method != 8:
         raise BadGzipFile('Unknown compression method')
 
+    # FHCRC will be checked often. So save the result of the check.
+    fhcrc = flag & FHCRC
+    # Only create and append to a list of header parts when FHCRC is set.
+    # In the most common use cases FHCRC is not set. So we optimize for those
+    # cases.
+    if fhcrc:
+        header_parts = [magic, base_header]
+
     if flag & FEXTRA:
-        # Read & discard the extra field, if present
-        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
-        _read_exact(fp, extra_len)
+        # Read the extra field, if present, save the fields if FHCRC is set.
+        extra_len_bytes = _read_exact(fp, 2)
+        extra_len, = struct.unpack("<H", extra_len_bytes)
+        extra = _read_exact(fp, extra_len)
+        if fhcrc:
+            header_parts.extend([extra_len_bytes, extra])
+
     if flag & FNAME:
-        # Read and discard a null-terminated string containing the filename
+        # Read a null-terminated string containing the filename. Save it
+        # if FHCRC is set.
         while True:
             s = fp.read(1)
-            if not s or s==b'\000':
+            if not s:
+                raise EOFError("Compressed file ended before the "
+                               "end-of-stream marker was reached")
+            if fhcrc:
+                header_parts.append(s)
+            if s == b'\000':
                 break
     if flag & FCOMMENT:
-        # Read and discard a null-terminated string containing a comment
+        # Read a null-terminated string containing a comment. Save it
+        # if FHCRC is set.
         while True:
             s = fp.read(1)
-            if not s or s==b'\000':
+            if not s:
+                raise EOFError("Compressed file ended before the "
+                               "end-of-stream marker was reached")
+            if fhcrc:
+                header_parts.append(s)
+            if s == b'\000':
                 break
-    if flag & FHCRC:
-        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
+
+    if fhcrc:
+        # Read the 16-bit header CRC and check it against the header.
+        header_crc, = struct.unpack("<H", _read_exact(fp, 2))
+        header = b"".join(header_parts)
+        true_crc = zlib.crc32(header) & 0xFFFF
+        if header_crc != true_crc:
+            raise BadGzipFile(f"Corrupted gzip header. Checksums do not "
+                               f"match: {true_crc:04x} != {header_crc:04x}")
     return last_mtime
 
 
diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py
@@ -9,6 +9,7 @@
 import struct
 import sys
 import unittest
+import zlib
 from subprocess import PIPE, Popen
 from test.support import import_helper
 from test.support import os_helper
@@ -570,6 +571,35 @@ def test_decompress_missing_trailer(self):
         compressed_data = gzip.compress(data1)
         self.assertRaises(EOFError, gzip.decompress, compressed_data[:-8])
 
+    def test_truncated_header(self):
+        truncated_headers = [
+            b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x00",             # Missing OS byte
+            b"\x1f\x8b\x08\x02\x00\x00\x00\x00\x00\xff",         # FHCRC, but no checksum
+            b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff",         # FEXTRA, but no xlen
+            b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\xaa\x00", # FEXTRA, xlen, but no data
+            b"\x1f\x8b\x08\x08\x00\x00\x00\x00\x00\xff",         # FNAME but no fname
+            b"\x1f\x8b\x08\x10\x00\x00\x00\x00\x00\xff",         # FCOMMENT, but no fcomment
+        ]
+        for header in truncated_headers:
+            with self.subTest(header=header):
+                with self.assertRaises(EOFError):
+                    gzip.decompress(header)
+
+    def test_corrupted_gzip_header(self):
+        header = (b"\x1f\x8b\x08\x1f\x00\x00\x00\x00\x00\xff"  # All flags set
+                  b"\x05\x00"  # Xlen = 5
+                  b"extra"
+                  b"name\x00"
+                  b"comment\x00")
+        true_crc = zlib.crc32(header) & 0xFFFF
+        corrupted_crc = true_crc ^ 0xFFFF
+        corrupted_header = header + corrupted_crc.to_bytes(2, "little")
+        with self.assertRaises(gzip.BadGzipFile) as err:
+            gzip.decompress(corrupted_header)
+        self.assertEqual(str(err.exception),
+                         f"Corrupted gzip header. Checksums do not "
+                         f"match: {true_crc:04x} != {corrupted_crc:04x}")
+
     def test_read_truncated(self):
         data = data1*50
         # Drop the CRC (4 bytes) and file size (4 bytes).
diff --git a/Misc/NEWS.d/next/Library/2021-10-18-13-46-55.bpo-45509.Upwb60.rst b/Misc/NEWS.d/next/Library/2021-10-18-13-46-55.bpo-45509.Upwb60.rst
@@ -0,0 +1 @@
+Gzip headers are now checked for corrupted NAME, COMMENT and HCRC fields.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Gzip headers are now checked for corrupted NAME, COMMENT and HCRC fields.`