Skip to content

Commit 407f3f5

Browse files
tdub0 authored and youknowone committed
Update _compression, gzip from CPython v3.11.2
1 parent 5cb17be commit 407f3f5

File tree

3 files changed

+143
-55
lines changed

3 files changed

+143
-55
lines changed

Lib/_compression.py

+11-1
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
"""Internal classes used by the gzip, lzma and bz2 modules"""
22

33
import io
4-
4+
import sys
55

66
BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size
77

@@ -110,6 +110,16 @@ def read(self, size=-1):
110110
self._pos += len(data)
111111
return data
112112

113+
def readall(self):
    """Read and return all remaining data as a single bytes object.

    Calls self.read() repeatedly until it returns an empty result and
    joins the pieces.  Returns b"" if nothing is left to read.
    """
    chunks = []
    # sys.maxsize means the max length of output buffer is unlimited,
    # so that the whole input buffer can be decompressed within one
    # .decompress() call.
    while data := self.read(sys.maxsize):
        chunks.append(data)

    return b"".join(chunks)
122+
113123
# Rewind the file to the beginning of the data stream.
114124
def _rewind(self):
115125
self._fp.seek(0)

Lib/gzip.py

+112-53
Original file line number | Diff line number | Diff line change
@@ -399,6 +399,59 @@ def readline(self, size=-1):
399399
return self._buffer.readline(size)
400400

401401

402+
def _read_exact(fp, n):
    '''Read exactly *n* bytes from `fp`

    This function is required because fp may be unbuffered,
    i.e. return short reads.

    Raises EOFError if `fp` runs out of data before *n* bytes
    have been collected.
    '''
    data = fp.read(n)
    if len(data) >= n:
        # Common case: a single read satisfied the request.
        return data

    # Short read: gather the remaining pieces in a list and join once,
    # instead of re-copying the accumulated bytes on every iteration.
    chunks = [data]
    remaining = n - len(data)
    while remaining > 0:
        b = fp.read(remaining)
        if not b:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        chunks.append(b)
        remaining -= len(b)
    return b"".join(chunks)
416+
417+
418+
def _read_gzip_header(fp):
    '''Consume one gzip member header from `fp`.

    On return `fp` is positioned just past the header.  Returns the
    mtime stored in the header, or None if `fp` was already at EOF
    (no header present).  Raises BadGzipFile for a wrong magic number
    or an unsupported compression method.
    '''
    signature = fp.read(2)
    if signature == b'':
        return None

    if signature != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % signature)

    # Fixed part after the magic: method, flags, mtime; the trailing two
    # bytes are skipped by the "xx" pad codes.
    fixed_fields = _read_exact(fp, 8)
    method, flag, last_mtime = struct.unpack("<BBIxx", fixed_fields)
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Length-prefixed extra field: read the length, then discard it.
        (extra_len,) = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)

    # Filename first, then comment: both are null-terminated strings that
    # are read one byte at a time and discarded.  Running out of input
    # ends the scan the same way a NUL byte does.
    for text_flag in (FNAME, FCOMMENT):
        if flag & text_flag:
            byte = fp.read(1)
            while byte and byte != b'\000':
                byte = fp.read(1)

    if flag & FHCRC:
        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
    return last_mtime
453+
454+
402455
class _GzipReader(_compression.DecompressReader):
403456
def __init__(self, fp):
404457
super().__init__(_PaddedFile(fp), zlib.decompressobj,
@@ -411,53 +464,11 @@ def _init_read(self):
411464
self._crc = zlib.crc32(b"")
412465
self._stream_size = 0 # Decompressed size of unconcatenated stream
413466

414-
def _read_exact(self, n):
415-
'''Read exactly *n* bytes from `self._fp`
416-
417-
This method is required because self._fp may be unbuffered,
418-
i.e. return short reads.
419-
'''
420-
421-
data = self._fp.read(n)
422-
while len(data) < n:
423-
b = self._fp.read(n - len(data))
424-
if not b:
425-
raise EOFError("Compressed file ended before the "
426-
"end-of-stream marker was reached")
427-
data += b
428-
return data
429-
430467
def _read_gzip_header(self):
431-
magic = self._fp.read(2)
432-
if magic == b'':
468+
last_mtime = _read_gzip_header(self._fp)
469+
if last_mtime is None:
433470
return False
434-
435-
if magic != b'\037\213':
436-
raise BadGzipFile('Not a gzipped file (%r)' % magic)
437-
438-
(method, flag,
439-
self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
440-
if method != 8:
441-
raise BadGzipFile('Unknown compression method')
442-
443-
if flag & FEXTRA:
444-
# Read & discard the extra field, if present
445-
extra_len, = struct.unpack("<H", self._read_exact(2))
446-
self._read_exact(extra_len)
447-
if flag & FNAME:
448-
# Read and discard a null-terminated string containing the filename
449-
while True:
450-
s = self._fp.read(1)
451-
if not s or s==b'\000':
452-
break
453-
if flag & FCOMMENT:
454-
# Read and discard a null-terminated string containing a comment
455-
while True:
456-
s = self._fp.read(1)
457-
if not s or s==b'\000':
458-
break
459-
if flag & FHCRC:
460-
self._read_exact(2) # Read & discard the 16-bit header CRC
471+
self._last_mtime = last_mtime
461472
return True
462473

463474
def read(self, size=-1):
@@ -520,7 +531,7 @@ def _read_eof(self):
520531
# We check that the computed CRC and size of the
521532
# uncompressed data matches the stored values. Note that the size
522533
# stored is the true file size mod 2**32.
523-
crc32, isize = struct.unpack("<II", self._read_exact(8))
534+
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
524535
if crc32 != self._crc:
525536
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
526537
hex(self._crc)))
@@ -540,21 +551,69 @@ def _rewind(self):
540551
super()._rewind()
541552
self._new_member = True
542553

554+
555+
def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """
    Build and return a simple gzip header with no extra fields.
    :param compresslevel: Compresslevel used to determine the xfl byte.
    :param mtime: The mtime (must support conversion to a 32-bit integer).
                  When None, the current time is used.
    :return: A bytes object representing the gzip header.
    """
    if mtime is None:
        mtime = time.time()

    # xfl: 2 for the "best" level, 4 for the "fast" level, 0 otherwise.
    xfl = (2 if compresslevel == _COMPRESS_LEVEL_BEST
           else 4 if compresslevel == _COMPRESS_LEVEL_FAST
           else 0)

    id1, id2 = 0x1f, 0x8b   # gzip magic bytes
    deflate = 8             # compression method
    no_flags = 0            # no optional header fields follow
    unknown_os = 255
    return struct.pack("<BBBBLBB", id1, id2, deflate, no_flags,
                       int(mtime), xfl, unknown_os)
574+
575+
543576
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    compresslevel sets the compression level in range of 0-9.
    mtime sets the modification time stored in the header; when it is
    None the current time is used.
    """
    if mtime == 0:
        # zlib's own gzip container (wbits=31) already carries a zero
        # mtime, so the whole member can be produced in a single call.
        return zlib.compress(data, level=compresslevel, wbits=31)

    header = _create_simple_gzip_header(compresslevel, mtime)
    # Trailer: CRC-32 of the input followed by its length mod 2**32.
    trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
    # wbits=-15 yields a raw deflate stream without any container.
    body = zlib.compress(data, level=compresslevel, wbits=-15)
    return b"".join((header, body, trailer))
592+
551593

552594
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.

    Every concatenated member is decompressed and checked against the
    CRC and length stored in its trailer.  Raises EOFError if a member
    is cut short and BadGzipFile if a check fails.
    """
    members = []
    while True:
        stream = io.BytesIO(data)
        if _read_gzip_header(stream) is None:
            # No further header: all members have been processed.
            return b"".join(members)

        # Raw deflate decompressor; the gzip header was consumed above.
        inflater = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        # Feed everything after the header in one go.
        payload = inflater.decompress(data[stream.tell():])

        # Whatever follows the deflate stream: 8-byte trailer, then the rest.
        leftover = inflater.unused_data
        if not inflater.eof or len(leftover) < 8:
            raise EOFError("Compressed file ended before the end-of-stream "
                           "marker was reached")

        stored_crc, stored_length = struct.unpack("<II", leftover[:8])
        if stored_crc != zlib.crc32(payload):
            raise BadGzipFile("CRC check failed")
        if stored_length != (len(payload) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        members.append(payload)
        # Drop zero padding before looking for the next member.
        data = leftover[8:].lstrip(b"\x00")
558617

559618

560619
def main():

Lib/test/test_gzip.py

+20-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from subprocess import PIPE, Popen
1313
from test.support import import_helper
1414
from test.support import os_helper
15-
from test.support import _4G, bigmemtest
15+
from test.support import _4G, bigmemtest, requires_subprocess
1616
from test.support.script_helper import assert_python_ok, assert_python_failure
1717

1818
gzip = import_helper.import_module('gzip')
@@ -552,6 +552,15 @@ def test_compress_mtime(self):
552552
f.read(1) # to set mtime attribute
553553
self.assertEqual(f.mtime, mtime)
554554

555+
def test_compress_correct_level(self):
    # gzip.compress branches on mtime == 0, so exercise both branches.
    for mtime in (0, 42):
        with self.subTest(mtime=mtime):
            level0 = gzip.compress(data1, compresslevel=0, mtime=mtime)
            level1 = gzip.compress(data1, compresslevel=1, mtime=mtime)
            # The input must appear verbatim in the level-0 output and
            # must not appear verbatim in the level-1 output.
            self.assertIn(data1, level0)
            self.assertNotIn(data1, level1)
563+
555564
def test_decompress(self):
556565
for data in (data1, data2):
557566
buf = io.BytesIO()
@@ -562,6 +571,14 @@ def test_decompress(self):
562571
datac = gzip.compress(data)
563572
self.assertEqual(gzip.decompress(datac), data)
564573

574+
def test_decompress_truncated_trailer(self):
    # Cutting off the last 4 bytes leaves only part of the 8-byte trailer.
    blob = gzip.compress(data1)
    with self.assertRaises(EOFError):
        gzip.decompress(blob[:-4])
577+
578+
def test_decompress_missing_trailer(self):
    # Cutting off the last 8 bytes removes the trailer entirely.
    blob = gzip.compress(data1)
    with self.assertRaises(EOFError):
        gzip.decompress(blob[:-8])
581+
565582
def test_read_truncated(self):
566583
data = data1*50
567584
# Drop the CRC (4 bytes) and file size (4 bytes).
@@ -756,6 +773,7 @@ def wrapper(*args, **kwargs):
756773
class TestCommandLine(unittest.TestCase):
757774
data = b'This is a simple test with gzip'
758775

776+
@requires_subprocess()
759777
def test_decompress_stdin_stdout(self):
760778
with io.BytesIO() as bytes_io:
761779
with gzip.GzipFile(fileobj=bytes_io, mode='wb') as gzip_file:
@@ -791,6 +809,7 @@ def test_decompress_infile_outfile_error(self):
791809
self.assertEqual(rc, 1)
792810
self.assertEqual(out, b'')
793811

812+
@requires_subprocess()
794813
@create_and_remove_directory(TEMPDIR)
795814
def test_compress_stdin_outfile(self):
796815
args = sys.executable, '-m', 'gzip'

0 commit comments

Comments
 (0)