Skip to content

gh-118107: Fix zipimporter ZIP64 handling. #118108

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions Lib/test/test_zipimport.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import sys
import os
import marshal
import glob
import importlib
import importlib.util
import re
import struct
import time
import unittest
Expand Down Expand Up @@ -54,6 +56,7 @@ def module_path_to_dotted_name(path):
TESTPACK2 = "ziptestpackage2"
TEMP_DIR = os.path.abspath("junk95142")
TEMP_ZIP = os.path.abspath("junk95142.zip")
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "zipimport_data")

pyc_file = importlib.util.cache_from_source(TESTMOD + '.py')
pyc_ext = '.pyc'
Expand Down Expand Up @@ -134,7 +137,9 @@ def getZip64Files(self):

def doTest(self, expected_ext, files, *modules, **kw):
self.makeZip(files, **kw)
self.doTestWithPreBuiltZip(expected_ext, *modules, **kw)

def doTestWithPreBuiltZip(self, expected_ext, *modules, **kw):
sys.path.insert(0, TEMP_ZIP)

mod = importlib.import_module(".".join(modules))
Expand Down Expand Up @@ -810,6 +815,122 @@ def testZip64CruftAndComment(self):
files = self.getZip64Files()
self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))

def testZip64LargeFile(self):
support.requires(
Copy link
Contributor Author

@jsirois jsirois Apr 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had introduced the decorator from Lib/test/test_largefile.py and then stumbled upon the resources support (-u) that seems more widely used; so I went with it instead. The tests only run (and pass) with -ulargefile:

./python -m test test_zipimport -ulargefile -v -m testZip64LargeFile
== CPython 3.13.0a6+ (heads/fix-issue-118107:8d55e2f226, Apr 19 2024, 20:12:18) [GCC 11.4.0]
== Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 little-endian
== Python build: debug
== cwd: /home/jsirois/dev/python/cpython/build/test_python_worker_191158æ
== CPU count: 16
== encodings: locale=UTF-8 FS=utf-8
== resources (1): largefile

Using random seed: 3331549713
0:00:00 load avg: 3.80 Run 1 test sequentially
0:00:00 load avg: 3.80 [1/1] test_zipimport
testZip64LargeFile (test.test_zipimport.CompressedZipImportTestCase.testZip64LargeFile) ... ok
testZip64LargeFile (test.test_zipimport.UncompressedZipImportTestCase.testZip64LargeFile) ... ok

----------------------------------------------------------------------
Ran 2 tests in 42.905s

OK
test_zipimport passed in 43.0 sec

== Tests result: SUCCESS ==

1 test OK.

Total duration: 43.0 sec
Total tests: run=2 (filtered)
Total test files: run=1/1 (filtered)
Result: SUCCESS

"largefile",
f"test generates files >{0xFFFFFFFF} bytes and takes a long time "
"to run"
)

# N.B.: We do alot of gymnastics below in the ZIP_STORED case to save
# and reconstruct a sparse zip on systems that support sparse files.
# Instead of creating a ~8GB zip file mainly consisting of null bytes
# for every run of the test, we create the zip once and save off the
# non-null portions of the resulting file as data blobs with offsets
# that allow re-creating the zip file sparsely. This drops disk space
# usage to ~9KB for the ZIP_STORED case and drops that test time by ~2
# orders of magnitude. For the ZIP_DEFLATED case, however, we bite the
# bullet. The resulting zip file is ~8MB of non-null data; so the sparse
# trick doesn't work and would result in that full ~8MB zip data file
# being checked in to source control.
parts_glob = f"sparse-zip64-c{self.compression:d}-0x*.part"
full_parts_glob = os.path.join(TEST_DATA_DIR, parts_glob)
pre_built_zip_parts = glob.glob(full_parts_glob)

self.addCleanup(os_helper.unlink, TEMP_ZIP)
if not pre_built_zip_parts:
if self.compression != ZIP_STORED:
support.requires(
"cpu",
"test requires a lot of CPU for compression."
)
self.addCleanup(os_helper.unlink, os_helper.TESTFN)
with open(os_helper.TESTFN, "wb") as f:
f.write(b"data")
f.write(os.linesep.encode())
f.seek(0xffff_ffff, os.SEEK_CUR)
f.write(os.linesep.encode())
os.utime(os_helper.TESTFN, (0.0, 0.0))
with ZipFile(
TEMP_ZIP,
"w",
compression=self.compression,
strict_timestamps=False
) as z:
z.write(os_helper.TESTFN, "data1")
z.writestr(
ZipInfo("module.py", (1980, 1, 1, 0, 0, 0)), test_src
)
z.write(os_helper.TESTFN, "data2")

# This "works" but relies on the zip format having a non-empty
# final page due to the trailing central directory to wind up with
# the correct length file.
def make_sparse_zip_parts(name):
empty_page = b"\0" * 4096
with open(name, "rb") as f:
part = None
try:
while True:
offset = f.tell()
data = f.read(len(empty_page))
if not data:
break
if data != empty_page:
if not part:
part_fullname = os.path.join(
TEST_DATA_DIR,
f"sparse-zip64-c{self.compression:d}-"
f"{offset:#011x}.part",
)
os.makedirs(
os.path.dirname(part_fullname),
exist_ok=True
)
part = open(part_fullname, "wb")
print("Created", part_fullname)
part.write(data)
else:
if part:
part.close()
part = None
finally:
if part:
part.close()

if self.compression == ZIP_STORED:
print(f"Creating sparse parts to check in into {TEST_DATA_DIR}:")
make_sparse_zip_parts(TEMP_ZIP)

else:
def extract_offset(name):
if m := re.search(r"-(0x[0-9a-f]{9})\.part$", name):
return int(m.group(1), base=16)
raise ValueError(f"{name=} does not fit expected pattern.")
offset_parts = [(extract_offset(n), n) for n in pre_built_zip_parts]
with open(TEMP_ZIP, "wb") as f:
for offset, part_fn in sorted(offset_parts):
with open(part_fn, "rb") as part:
f.seek(offset, os.SEEK_SET)
f.write(part.read())
# Confirm that the reconstructed zip file works and looks right.
with ZipFile(TEMP_ZIP, "r") as z:
self.assertEqual(
z.getinfo("module.py").date_time, (1980, 1, 1, 0, 0, 0)
)
self.assertEqual(
z.read("module.py"), test_src.encode(),
msg=f"Recreate {full_parts_glob}, unexpected contents."
)
def assertDataEntry(name):
zinfo = z.getinfo(name)
self.assertEqual(zinfo.date_time, (1980, 1, 1, 0, 0, 0))
self.assertGreater(zinfo.file_size, 0xffff_ffff)
assertDataEntry("data1")
assertDataEntry("data2")

self.doTestWithPreBuiltZip(".py", "module")


@support.requires_zlib()
class CompressedZipImportTestCase(UncompressedZipImportTestCase):
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
5 changes: 3 additions & 2 deletions Lib/zipimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,8 +517,9 @@ def _read_directory(archive):
num_extra_values = (len(extra_data) - 4) // 8
if num_extra_values > 3:
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
extra_data, offset=4)
import struct
values = list(struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new test caught the need for the list here. Prior to the fix you'd see:

ERROR: testZip64LargeFile (test.test_zipimport.UncompressedZipImportTestCase.testZip64LargeFile)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<frozen importlib._bootstrap_external>", line 1510, in _path_importer_cache
KeyError: '/home/jsirois/dev/python/cpython/build/test_python_worker_140690æ/junk95142.zip'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jsirois/dev/python/cpython/Lib/test/test_zipimport.py", line 89, in wrapper
    return fun(*args, **kwargs)
           ~~~^^^^^^^^^^^^^^^^^
  File "/home/jsirois/dev/python/cpython/Lib/test/test_zipimport.py", line 89, in wrapper
    return fun(*args, **kwargs)
           ~~~^^^^^^^^^^^^^^^^^
  File "/home/jsirois/dev/python/cpython/Lib/test/test_zipimport.py", line 849, in testZip64LargeFile
    self.doTestWithPreBuiltZip(".py", "large_file")
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "/home/jsirois/dev/python/cpython/Lib/test/test_zipimport.py", line 157, in doTestWithPreBuiltZip
    mod = importlib.import_module(".".join(modules))
          ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
  File "/home/jsirois/dev/python/cpython/Lib/importlib/__init__.py", line 88, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1322, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 1262, in _find_spec
  File "<frozen importlib._bootstrap_external>", line 1553, in find_spec
  File "<frozen importlib._bootstrap_external>", line 1525, in _get_spec
  File "<frozen importlib._bootstrap_external>", line 1512, in _path_importer_cache
  File "<frozen importlib._bootstrap_external>", line 1488, in _path_hooks
  File "<frozen zipimport>", line 98, in __init__
  File "<frozen zipimport>", line 528, in _read_directory
AttributeError: 'tuple' object has no attribute 'pop'

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another good catch! glad you added the test case!

extra_data, offset=4))

# N.b. Here be dragons: the ordering of these is different than
# the header fields, and it's really easy to get it wrong since
Expand Down
3 changes: 2 additions & 1 deletion Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -2483,7 +2483,8 @@ TESTSUBDIRS= idlelib/idle_test \
test/typinganndata \
test/wheeldata \
test/xmltestdata \
test/xmltestdata/c14n-20
test/xmltestdata/c14n-20 \
test/zipimport_data

COMPILEALL_OPTS=-j0

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix :mod:`zipimport` reading of ZIP64 files with file entries that are too big or
offset too far.