Skip to content

Update _compression, gzip, and test_gzip for CPython v3.11.2 #4688

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion Lib/_compression.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Internal classes used by the gzip, lzma and bz2 modules"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the header, bz2 also seems to depend on this module. Could you also update it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have no bz2 rust library, unfortunately.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I hit the SkipTest: No module named '_bz2' when I copied over and ran test_bz2.py, as there were no changes in Lib/bz2.py. Would this be connected to the bz2.rs in stdlib/src/?


import io

import sys

BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size

Expand Down Expand Up @@ -110,6 +110,16 @@ def read(self, size=-1):
self._pos += len(data)
return data

def readall(self):
chunks = []
# sys.maxsize means the max length of output buffer is unlimited,
# so that the whole input buffer can be decompressed within one
# .decompress() call.
while data := self.read(sys.maxsize):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we don't support this operator yet

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did see a reference on L84 of compiler/ast/src/unparse.rs but it might not be handling it as the assignment operator described in PEP 572.

chunks.append(data)

return b"".join(chunks)

# Rewind the file to the beginning of the data stream.
def _rewind(self):
self._fp.seek(0)
Expand Down
165 changes: 112 additions & 53 deletions Lib/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,59 @@ def readline(self, size=-1):
return self._buffer.readline(size)


def _read_exact(fp, n):
    '''Read exactly *n* bytes from `fp`.

    This function is required because fp may be unbuffered,
    i.e. return short reads.

    Returns the *n* bytes read.  Raises EOFError if `fp` is exhausted
    before *n* bytes could be collected.
    '''
    data = fp.read(n)
    while len(data) < n:
        # Ask only for what is still missing; an empty result means the
        # stream ended early.
        b = fp.read(n - len(data))
        if not b:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        data += b
    return data


def _read_gzip_header(fp):
    '''Read a gzip header from `fp` and progress to the end of the header.

    Returns last mtime if header was present or None otherwise.

    Raises BadGzipFile if the magic bytes or the compression method are
    not the expected ones.  Fixed-size fields are read with _read_exact.
    '''
    magic = fp.read(2)
    if magic == b'':
        # Nothing left to read: there is no header here.
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    # Method, flags and mtime; the two trailing bytes ("xx") are skipped.
    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Read & discard the extra field, if present
        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)
    if flag & FNAME:
        # Read and discard a null-terminated string containing the filename.
        # The loop also stops if the input ends before the terminator.
        while True:
            s = fp.read(1)
            if not s or s==b'\000':
                break
    if flag & FCOMMENT:
        # Read and discard a null-terminated string containing a comment.
        # The loop also stops if the input ends before the terminator.
        while True:
            s = fp.read(1)
            if not s or s==b'\000':
                break
    if flag & FHCRC:
        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
    return last_mtime


class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
Expand All @@ -411,53 +464,11 @@ def _init_read(self):
self._crc = zlib.crc32(b"")
self._stream_size = 0 # Decompressed size of unconcatenated stream

def _read_exact(self, n):
'''Read exactly *n* bytes from `self._fp`

This method is required because self._fp may be unbuffered,
i.e. return short reads.
'''

data = self._fp.read(n)
while len(data) < n:
b = self._fp.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data

def _read_gzip_header(self):
magic = self._fp.read(2)
if magic == b'':
last_mtime = _read_gzip_header(self._fp)
if last_mtime is None:
return False

if magic != b'\037\213':
raise BadGzipFile('Not a gzipped file (%r)' % magic)

(method, flag,
self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
if method != 8:
raise BadGzipFile('Unknown compression method')

if flag & FEXTRA:
# Read & discard the extra field, if present
extra_len, = struct.unpack("<H", self._read_exact(2))
self._read_exact(extra_len)
if flag & FNAME:
# Read and discard a null-terminated string containing the filename
while True:
s = self._fp.read(1)
if not s or s==b'\000':
break
if flag & FCOMMENT:
# Read and discard a null-terminated string containing a comment
while True:
s = self._fp.read(1)
if not s or s==b'\000':
break
if flag & FHCRC:
self._read_exact(2) # Read & discard the 16-bit header CRC
self._last_mtime = last_mtime
return True

def read(self, size=-1):
Expand Down Expand Up @@ -520,7 +531,7 @@ def _read_eof(self):
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32, isize = struct.unpack("<II", self._read_exact(8))
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
if crc32 != self._crc:
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
hex(self._crc)))
Expand All @@ -540,21 +551,69 @@ def _rewind(self):
super()._rewind()
self._new_member = True


def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """
    Build a simple gzip header with no extra fields.
    :param compresslevel: Compresslevel used to determine the xfl bytes.
    :param mtime: The mtime (must support conversion to a 32-bit integer).
                  Defaults to the current time when None.
    :return: A bytes object representing the gzip header.
    """
    if mtime is None:
        mtime = time.time()
    if compresslevel == _COMPRESS_LEVEL_BEST:
        xfl = 2
    elif compresslevel == _COMPRESS_LEVEL_FAST:
        xfl = 4
    else:
        xfl = 0
    # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
    # fields added to header), mtime, xfl and os (255 for unknown OS).
    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)


def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
"""Compress data in one shot and return the compressed string.
Optional argument is the compression level, in range of 0-9.

compresslevel sets the compression level in range of 0-9.
mtime can be used to set the modification time. The modification time is
set to the current time by default.
"""
buf = io.BytesIO()
with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
f.write(data)
return buf.getvalue()
if mtime == 0:
# Use zlib as it creates the header with 0 mtime by default.
# This is faster and with less overhead.
return zlib.compress(data, level=compresslevel, wbits=31)
header = _create_simple_gzip_header(compresslevel, mtime)
trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
# Wbits=-15 creates a raw deflate block.
return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
trailer)


def decompress(data):
"""Decompress a gzip compressed string in one shot.
Return the decompressed string.
"""
with GzipFile(fileobj=io.BytesIO(data)) as f:
return f.read()
decompressed_members = []
while True:
fp = io.BytesIO(data)
if _read_gzip_header(fp) is None:
return b"".join(decompressed_members)
# Use a zlib raw deflate compressor
do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
# Read all the data except the header
decompressed = do.decompress(data[fp.tell():])
if not do.eof or len(do.unused_data) < 8:
raise EOFError("Compressed file ended before the end-of-stream "
"marker was reached")
crc, length = struct.unpack("<II", do.unused_data[:8])
if crc != zlib.crc32(decompressed):
raise BadGzipFile("CRC check failed")
if length != (len(decompressed) & 0xffffffff):
raise BadGzipFile("Incorrect length of data produced")
decompressed_members.append(decompressed)
data = do.unused_data[8:].lstrip(b"\x00")


def main():
Expand Down
21 changes: 20 additions & 1 deletion Lib/test/test_gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from subprocess import PIPE, Popen
from test.support import import_helper
from test.support import os_helper
from test.support import _4G, bigmemtest
from test.support import _4G, bigmemtest, requires_subprocess
from test.support.script_helper import assert_python_ok, assert_python_failure

gzip = import_helper.import_module('gzip')
Expand Down Expand Up @@ -552,6 +552,15 @@ def test_compress_mtime(self):
f.read(1) # to set mtime attribute
self.assertEqual(f.mtime, mtime)

def test_compress_correct_level(self):
# gzip.compress calls with mtime == 0 take a different code path.
for mtime in (0, 42):
with self.subTest(mtime=mtime):
nocompress = gzip.compress(data1, compresslevel=0, mtime=mtime)
yescompress = gzip.compress(data1, compresslevel=1, mtime=mtime)
self.assertIn(data1, nocompress)
self.assertNotIn(data1, yescompress)

def test_decompress(self):
for data in (data1, data2):
buf = io.BytesIO()
Expand All @@ -562,6 +571,14 @@ def test_decompress(self):
datac = gzip.compress(data)
self.assertEqual(gzip.decompress(datac), data)

def test_decompress_truncated_trailer(self):
compressed_data = gzip.compress(data1)
self.assertRaises(EOFError, gzip.decompress, compressed_data[:-4])

def test_decompress_missing_trailer(self):
compressed_data = gzip.compress(data1)
self.assertRaises(EOFError, gzip.decompress, compressed_data[:-8])

def test_read_truncated(self):
data = data1*50
# Drop the CRC (4 bytes) and file size (4 bytes).
Expand Down Expand Up @@ -756,6 +773,7 @@ def wrapper(*args, **kwargs):
class TestCommandLine(unittest.TestCase):
data = b'This is a simple test with gzip'

@requires_subprocess()
def test_decompress_stdin_stdout(self):
with io.BytesIO() as bytes_io:
with gzip.GzipFile(fileobj=bytes_io, mode='wb') as gzip_file:
Expand Down Expand Up @@ -791,6 +809,7 @@ def test_decompress_infile_outfile_error(self):
self.assertEqual(rc, 1)
self.assertEqual(out, b'')

@requires_subprocess()
@create_and_remove_directory(TEMPDIR)
def test_compress_stdin_outfile(self):
args = sys.executable, '-m', 'gzip'
Expand Down
5 changes: 3 additions & 2 deletions extra_tests/snippets/stdlib_zlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@
b"x\xda\xf3\xc9/J\xcdU\xc8,(.\xcdUH\xc9\xcf\xc9/R(\xce,QH\xccM-\x01\x00\x83\xd5\t\xc5",
]

for level, text in enumerate(compressed_lorem_list):
assert zlib.compress(lorem, level) == text
for level, expected in enumerate(compressed_lorem_list):
actual = zlib.compress(lorem, level)
assert actual == expected

# default level
assert zlib.compress(lorem) == zlib.compress(lorem, -1) == zlib.compress(lorem, 6)
Expand Down
Loading