diff --git a/Lib/test/archiver_tests.py b/Lib/test/archiver_tests.py new file mode 100644 index 0000000000..1a4bbb9e57 --- /dev/null +++ b/Lib/test/archiver_tests.py @@ -0,0 +1,155 @@ +"""Tests common to tarfile and zipfile.""" + +import os +import sys + +from test.support import os_helper + +class OverwriteTests: + + def setUp(self): + os.makedirs(self.testdir) + self.addCleanup(os_helper.rmtree, self.testdir) + + def create_file(self, path, content=b''): + with open(path, 'wb') as f: + f.write(content) + + def open(self, path): + raise NotImplementedError + + def extractall(self, ar): + raise NotImplementedError + + + def test_overwrite_file_as_file(self): + target = os.path.join(self.testdir, 'test') + self.create_file(target, b'content') + with self.open(self.ar_with_file) as ar: + self.extractall(ar) + self.assertTrue(os.path.isfile(target)) + with open(target, 'rb') as f: + self.assertEqual(f.read(), b'newcontent') + + def test_overwrite_dir_as_dir(self): + target = os.path.join(self.testdir, 'test') + os.mkdir(target) + with self.open(self.ar_with_dir) as ar: + self.extractall(ar) + self.assertTrue(os.path.isdir(target)) + + def test_overwrite_dir_as_implicit_dir(self): + target = os.path.join(self.testdir, 'test') + os.mkdir(target) + with self.open(self.ar_with_implicit_dir) as ar: + self.extractall(ar) + self.assertTrue(os.path.isdir(target)) + self.assertTrue(os.path.isfile(os.path.join(target, 'file'))) + with open(os.path.join(target, 'file'), 'rb') as f: + self.assertEqual(f.read(), b'newcontent') + + def test_overwrite_dir_as_file(self): + target = os.path.join(self.testdir, 'test') + os.mkdir(target) + with self.open(self.ar_with_file) as ar: + with self.assertRaises(PermissionError if sys.platform == 'win32' + else IsADirectoryError): + self.extractall(ar) + self.assertTrue(os.path.isdir(target)) + + def test_overwrite_file_as_dir(self): + target = os.path.join(self.testdir, 'test') + self.create_file(target, b'content') + with self.open(self.ar_with_dir) as ar: + with self.assertRaises(FileExistsError): + self.extractall(ar) + self.assertTrue(os.path.isfile(target)) + with open(target, 'rb') as f: + self.assertEqual(f.read(), b'content') + + def test_overwrite_file_as_implicit_dir(self): + target = os.path.join(self.testdir, 'test') + self.create_file(target, b'content') + with self.open(self.ar_with_implicit_dir) as ar: + with self.assertRaises(FileNotFoundError if sys.platform == 'win32' + else NotADirectoryError): + self.extractall(ar) + self.assertTrue(os.path.isfile(target)) + with open(target, 'rb') as f: + self.assertEqual(f.read(), b'content') + + @os_helper.skip_unless_symlink + def test_overwrite_file_symlink_as_file(self): + # XXX: It is potential security vulnerability. + target = os.path.join(self.testdir, 'test') + target2 = os.path.join(self.testdir, 'test2') + self.create_file(target2, b'content') + os.symlink('test2', target) + with self.open(self.ar_with_file) as ar: + self.extractall(ar) + self.assertTrue(os.path.islink(target)) + self.assertTrue(os.path.isfile(target2)) + with open(target2, 'rb') as f: + self.assertEqual(f.read(), b'newcontent') + + @os_helper.skip_unless_symlink + def test_overwrite_broken_file_symlink_as_file(self): + # XXX: It is potential security vulnerability. + target = os.path.join(self.testdir, 'test') + target2 = os.path.join(self.testdir, 'test2') + os.symlink('test2', target) + with self.open(self.ar_with_file) as ar: + self.extractall(ar) + self.assertTrue(os.path.islink(target)) + self.assertTrue(os.path.isfile(target2)) + with open(target2, 'rb') as f: + self.assertEqual(f.read(), b'newcontent') + + @os_helper.skip_unless_symlink + def test_overwrite_dir_symlink_as_dir(self): + # XXX: It is potential security vulnerability. + target = os.path.join(self.testdir, 'test') + target2 = os.path.join(self.testdir, 'test2') + os.mkdir(target2) + os.symlink('test2', target, target_is_directory=True) + with self.open(self.ar_with_dir) as ar: + self.extractall(ar) + self.assertTrue(os.path.islink(target)) + self.assertTrue(os.path.isdir(target2)) + + @os_helper.skip_unless_symlink + def test_overwrite_dir_symlink_as_implicit_dir(self): + # XXX: It is potential security vulnerability. + target = os.path.join(self.testdir, 'test') + target2 = os.path.join(self.testdir, 'test2') + os.mkdir(target2) + os.symlink('test2', target, target_is_directory=True) + with self.open(self.ar_with_implicit_dir) as ar: + self.extractall(ar) + self.assertTrue(os.path.islink(target)) + self.assertTrue(os.path.isdir(target2)) + self.assertTrue(os.path.isfile(os.path.join(target2, 'file'))) + with open(os.path.join(target2, 'file'), 'rb') as f: + self.assertEqual(f.read(), b'newcontent') + + @os_helper.skip_unless_symlink + def test_overwrite_broken_dir_symlink_as_dir(self): + target = os.path.join(self.testdir, 'test') + target2 = os.path.join(self.testdir, 'test2') + os.symlink('test2', target, target_is_directory=True) + with self.open(self.ar_with_dir) as ar: + with self.assertRaises(FileExistsError): + self.extractall(ar) + self.assertTrue(os.path.islink(target)) + self.assertFalse(os.path.exists(target2)) + + @os_helper.skip_unless_symlink + def test_overwrite_broken_dir_symlink_as_implicit_dir(self): + target = os.path.join(self.testdir, 'test') + target2 = os.path.join(self.testdir, 'test2') + os.symlink('test2', target, target_is_directory=True) + with self.open(self.ar_with_implicit_dir) as ar: + with self.assertRaises(FileExistsError): + self.extractall(ar) + self.assertTrue(os.path.islink(target)) + self.assertFalse(os.path.exists(target2)) diff --git a/Lib/test/test_zipfile/__init__.py b/Lib/test/test_zipfile/__init__.py new file mode 100644 index 0000000000..4b16ecc311 --- /dev/null +++ b/Lib/test/test_zipfile/__init__.py @@ -0,0 +1,5 @@ +import os +from test.support import load_package_tests + +def load_tests(*args): + return load_package_tests(os.path.dirname(__file__), *args) diff --git a/Lib/test/test_zipfile/__main__.py b/Lib/test/test_zipfile/__main__.py new file mode 100644 index 0000000000..e25ac946ed --- /dev/null +++ b/Lib/test/test_zipfile/__main__.py @@ -0,0 +1,7 @@ +import unittest + +from . import load_tests # noqa: F401 + + +if __name__ == "__main__": + unittest.main() diff --git a/Lib/test/test_zipfile/_path/__init__.py b/Lib/test/test_zipfile/_path/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Lib/test/test_zipfile/_path/_functools.py b/Lib/test/test_zipfile/_path/_functools.py new file mode 100644 index 0000000000..75f2b20e06 --- /dev/null +++ b/Lib/test/test_zipfile/_path/_functools.py @@ -0,0 +1,9 @@ +import functools + + +# from jaraco.functools 3.5.2 +def compose(*funcs): + def compose_two(f1, f2): + return lambda *args, **kwargs: f1(f2(*args, **kwargs)) + + return functools.reduce(compose_two, funcs) diff --git a/Lib/test/test_zipfile/_path/_itertools.py b/Lib/test/test_zipfile/_path/_itertools.py new file mode 100644 index 0000000000..f735dd2173 --- /dev/null +++ b/Lib/test/test_zipfile/_path/_itertools.py @@ -0,0 +1,79 @@ +import itertools +from collections import deque +from itertools import islice + + +# from jaraco.itertools 6.3.0 +class Counter: + """ + Wrap an iterable in an object that stores the count of items + that pass through it. + + >>> items = Counter(range(20)) + >>> items.count + 0 + >>> values = list(items) + >>> items.count + 20 + """ + + def __init__(self, i): + self.count = 0 + self.iter = zip(itertools.count(1), i) + + def __iter__(self): + return self + + def __next__(self): + self.count, result = next(self.iter) + return result + + +# from more_itertools v8.13.0 +def always_iterable(obj, base_type=(str, bytes)): + if obj is None: + return iter(()) + + if (base_type is not None) and isinstance(obj, base_type): + return iter((obj,)) + + try: + return iter(obj) + except TypeError: + return iter((obj,)) + + +# from more_itertools v9.0.0 +def consume(iterator, n=None): + """Advance *iterable* by *n* steps. If *n* is ``None``, consume it + entirely. + Efficiently exhausts an iterator without returning values. Defaults to + consuming the whole iterator, but an optional second argument may be + provided to limit consumption. + >>> i = (x for x in range(10)) + >>> next(i) + 0 + >>> consume(i, 3) + >>> next(i) + 4 + >>> consume(i) + >>> next(i) + Traceback (most recent call last): + File "", line 1, in + StopIteration + If the iterator has fewer items remaining than the provided limit, the + whole iterator will be consumed. + >>> i = (x for x in range(3)) + >>> consume(i, 5) + >>> next(i) + Traceback (most recent call last): + File "", line 1, in + StopIteration + """ + # Use functions that consume iterators at C speed. + if n is None: + # feed the entire iterator into a zero-length deque + deque(iterator, maxlen=0) + else: + # advance to the empty slice starting at position n + next(islice(iterator, n, n), None) diff --git a/Lib/test/test_zipfile/_path/_support.py b/Lib/test/test_zipfile/_path/_support.py new file mode 100644 index 0000000000..1afdf3b3a7 --- /dev/null +++ b/Lib/test/test_zipfile/_path/_support.py @@ -0,0 +1,9 @@ +import importlib +import unittest + + +def import_or_skip(name): + try: + return importlib.import_module(name) + except ImportError: # pragma: no cover + raise unittest.SkipTest(f'Unable to import {name}') diff --git a/Lib/test/test_zipfile/_path/_test_params.py b/Lib/test/test_zipfile/_path/_test_params.py new file mode 100644 index 0000000000..bc95b4ebf4 --- /dev/null +++ b/Lib/test/test_zipfile/_path/_test_params.py @@ -0,0 +1,39 @@ +import types +import functools + +from ._itertools import always_iterable + + +def parameterize(names, value_groups): + """ + Decorate a test method to run it as a set of subtests. + + Modeled after pytest.parametrize. + """ + + def decorator(func): + @functools.wraps(func) + def wrapped(self): + for values in value_groups: + resolved = map(Invoked.eval, always_iterable(values)) + params = dict(zip(always_iterable(names), resolved)) + with self.subTest(**params): + func(self, **params) + + return wrapped + + return decorator + + +class Invoked(types.SimpleNamespace): + """ + Wrap a function to be invoked for each usage. + """ + + @classmethod + def wrap(cls, func): + return cls(func=func) + + @classmethod + def eval(cls, cand): + return cand.func() if isinstance(cand, cls) else cand diff --git a/Lib/test/test_zipfile/_path/test_complexity.py b/Lib/test/test_zipfile/_path/test_complexity.py new file mode 100644 index 0000000000..7050937738 --- /dev/null +++ b/Lib/test/test_zipfile/_path/test_complexity.py @@ -0,0 +1,103 @@ +import io +import itertools +import math +import re +import string +import unittest +import zipfile + +from ._functools import compose +from ._itertools import consume + +from ._support import import_or_skip + + +big_o = import_or_skip('big_o') +pytest = import_or_skip('pytest') + + +class TestComplexity(unittest.TestCase): + @pytest.mark.flaky + def test_implied_dirs_performance(self): + best, others = big_o.big_o( + compose(consume, zipfile.CompleteDirs._implied_dirs), + lambda size: [ + '/'.join(string.ascii_lowercase + str(n)) for n in range(size) + ], + max_n=1000, + min_n=1, + ) + assert best <= big_o.complexities.Linear + + def make_zip_path(self, depth=1, width=1) -> zipfile.Path: + """ + Construct a Path with width files at every level of depth. + """ + zf = zipfile.ZipFile(io.BytesIO(), mode='w') + pairs = itertools.product(self.make_deep_paths(depth), self.make_names(width)) + for path, name in pairs: + zf.writestr(f"{path}{name}.txt", b'') + zf.filename = "big un.zip" + return zipfile.Path(zf) + + @classmethod + def make_names(cls, width, letters=string.ascii_lowercase): + """ + >>> list(TestComplexity.make_names(2)) + ['a', 'b'] + >>> list(TestComplexity.make_names(30)) + ['aa', 'ab', ..., 'bd'] + """ + # determine how many products are needed to produce width + n_products = math.ceil(math.log(width, len(letters))) + inputs = (letters,) * n_products + combinations = itertools.product(*inputs) + names = map(''.join, combinations) + return itertools.islice(names, width) + + @classmethod + def make_deep_paths(cls, depth): + return map(cls.make_deep_path, range(depth)) + + @classmethod + def make_deep_path(cls, depth): + return ''.join(('d/',) * depth) + + def test_baseline_regex_complexity(self): + best, others = big_o.big_o( + lambda path: re.fullmatch(r'[^/]*\\.txt', path), + self.make_deep_path, + max_n=100, + min_n=1, + ) + assert best <= big_o.complexities.Constant + + @pytest.mark.flaky + def test_glob_depth(self): + best, others = big_o.big_o( + lambda path: consume(path.glob('*.txt')), + self.make_zip_path, + max_n=100, + min_n=1, + ) + assert best <= big_o.complexities.Quadratic + + @pytest.mark.flaky + def test_glob_width(self): + best, others = big_o.big_o( + lambda path: consume(path.glob('*.txt')), + lambda size: self.make_zip_path(width=size), + max_n=100, + min_n=1, + ) + assert best <= big_o.complexities.Linear + + @pytest.mark.flaky + def test_glob_width_and_depth(self): + best, others = big_o.big_o( + lambda path: consume(path.glob('*.txt')), + lambda size: self.make_zip_path(depth=size, width=size), + max_n=10, + min_n=1, + ) + assert best <= big_o.complexities.Linear diff --git a/Lib/test/test_zipfile/_path/test_path.py b/Lib/test/test_zipfile/_path/test_path.py new file mode 100644 index 0000000000..30e8a21a6a --- /dev/null +++ b/Lib/test/test_zipfile/_path/test_path.py @@ -0,0 +1,581 @@ +import io +import itertools +import contextlib +import pathlib +import pickle +import sys +import unittest +import zipfile + +from ._functools import compose +from ._itertools import Counter + +from ._test_params import parameterize, Invoked + +from test.support.os_helper import temp_dir + + +class jaraco: + class itertools: + Counter = Counter + + +def add_dirs(zf): + """ + Given a writable zip file zf, inject directory entries for + any directories implied by the presence of children. + """ + for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()): + zf.writestr(name, b"") + return zf + + +def build_alpharep_fixture(): + """ + Create a zip file with this structure: + + . + ├── a.txt + ├── b + │ ├── c.txt + │ ├── d + │ │ └── e.txt + │ └── f.txt + ├── g + │ └── h + │ └── i.txt + └── j + ├── k.bin + ├── l.baz + └── m.bar + + This fixture has the following key characteristics: + + - a file at the root (a) + - a file two levels deep (b/d/e) + - multiple files in a directory (b/c, b/f) + - a directory containing only a directory (g/h) + - a directory with files of different extensions (j/klm) + + "alpha" because it uses alphabet + "rep" because it's a representative example + """ + data = io.BytesIO() + zf = zipfile.ZipFile(data, "w") + zf.writestr("a.txt", b"content of a") + zf.writestr("b/c.txt", b"content of c") + zf.writestr("b/d/e.txt", b"content of e") + zf.writestr("b/f.txt", b"content of f") + zf.writestr("g/h/i.txt", b"content of i") + zf.writestr("j/k.bin", b"content of k") + zf.writestr("j/l.baz", b"content of l") + zf.writestr("j/m.bar", b"content of m") + zf.filename = "alpharep.zip" + return zf + + +alpharep_generators = [ + Invoked.wrap(build_alpharep_fixture), + Invoked.wrap(compose(add_dirs, build_alpharep_fixture)), +] + +pass_alpharep = parameterize(['alpharep'], alpharep_generators) + + +class TestPath(unittest.TestCase): + def setUp(self): + self.fixtures = contextlib.ExitStack() + self.addCleanup(self.fixtures.close) + + def zipfile_ondisk(self, alpharep): + tmpdir = pathlib.Path(self.fixtures.enter_context(temp_dir())) + buffer = alpharep.fp + alpharep.close() + path = tmpdir / alpharep.filename + with path.open("wb") as strm: + strm.write(buffer.getvalue()) + return path + + @pass_alpharep + def test_iterdir_and_types(self, alpharep): + root = zipfile.Path(alpharep) + assert root.is_dir() + a, b, g, j = root.iterdir() + assert a.is_file() + assert b.is_dir() + assert g.is_dir() + c, f, d = b.iterdir() + assert c.is_file() and f.is_file() + (e,) = d.iterdir() + assert e.is_file() + (h,) = g.iterdir() + (i,) = h.iterdir() + assert i.is_file() + + @pass_alpharep + def test_is_file_missing(self, alpharep): + root = zipfile.Path(alpharep) + assert not root.joinpath('missing.txt').is_file() + + @pass_alpharep + def test_iterdir_on_file(self, alpharep): + root = zipfile.Path(alpharep) + a, b, g, j = root.iterdir() + with self.assertRaises(ValueError): + a.iterdir() + + @pass_alpharep + def test_subdir_is_dir(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'b').is_dir() + assert (root / 'b/').is_dir() + assert (root / 'g').is_dir() + assert (root / 'g/').is_dir() + + @pass_alpharep + def test_open(self, alpharep): + root = zipfile.Path(alpharep) + a, b, g, j = root.iterdir() + with a.open(encoding="utf-8") as strm: + data = strm.read() + self.assertEqual(data, "content of a") + with a.open('r', "utf-8") as strm: # not a kw, no gh-101144 TypeError + data = strm.read() + self.assertEqual(data, "content of a") + + def test_open_encoding_utf16(self): + in_memory_file = io.BytesIO() + zf = zipfile.ZipFile(in_memory_file, "w") + zf.writestr("path/16.txt", "This was utf-16".encode("utf-16")) + zf.filename = "test_open_utf16.zip" + root = zipfile.Path(zf) + (path,) = root.iterdir() + u16 = path.joinpath("16.txt") + with u16.open('r', "utf-16") as strm: + data = strm.read() + assert data == "This was utf-16" + with u16.open(encoding="utf-16") as strm: + data = strm.read() + assert data == "This was utf-16" + + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_open_encoding_errors(self): + in_memory_file = io.BytesIO() + zf = zipfile.ZipFile(in_memory_file, "w") + zf.writestr("path/bad-utf8.bin", b"invalid utf-8: \xff\xff.") + zf.filename = "test_read_text_encoding_errors.zip" + root = zipfile.Path(zf) + (path,) = root.iterdir() + u16 = path.joinpath("bad-utf8.bin") + + # encoding= as a positional argument for gh-101144. + data = u16.read_text("utf-8", errors="ignore") + assert data == "invalid utf-8: ." + with u16.open("r", "utf-8", errors="surrogateescape") as f: + assert f.read() == "invalid utf-8: \udcff\udcff." + + # encoding= both positional and keyword is an error; gh-101144. + with self.assertRaisesRegex(TypeError, "encoding"): + data = u16.read_text("utf-8", encoding="utf-8") + + # both keyword arguments work. + with u16.open("r", encoding="utf-8", errors="strict") as f: + # error during decoding with wrong codec. + with self.assertRaises(UnicodeDecodeError): + f.read() + + @unittest.skipIf( + not getattr(sys.flags, 'warn_default_encoding', 0), + "Requires warn_default_encoding", + ) + @pass_alpharep + def test_encoding_warnings(self, alpharep): + """EncodingWarning must blame the read_text and open calls.""" + assert sys.flags.warn_default_encoding + root = zipfile.Path(alpharep) + with self.assertWarns(EncodingWarning) as wc: + root.joinpath("a.txt").read_text() + assert __file__ == wc.filename + with self.assertWarns(EncodingWarning) as wc: + root.joinpath("a.txt").open("r").close() + assert __file__ == wc.filename + + def test_open_write(self): + """ + If the zipfile is open for write, it should be possible to + write bytes or text to it. + """ + zf = zipfile.Path(zipfile.ZipFile(io.BytesIO(), mode='w')) + with zf.joinpath('file.bin').open('wb') as strm: + strm.write(b'binary contents') + with zf.joinpath('file.txt').open('w', encoding="utf-8") as strm: + strm.write('text file') + + def test_open_extant_directory(self): + """ + Attempting to open a directory raises IsADirectoryError. + """ + zf = zipfile.Path(add_dirs(build_alpharep_fixture())) + with self.assertRaises(IsADirectoryError): + zf.joinpath('b').open() + + @pass_alpharep + def test_open_binary_invalid_args(self, alpharep): + root = zipfile.Path(alpharep) + with self.assertRaises(ValueError): + root.joinpath('a.txt').open('rb', encoding='utf-8') + with self.assertRaises(ValueError): + root.joinpath('a.txt').open('rb', 'utf-8') + + def test_open_missing_directory(self): + """ + Attempting to open a missing directory raises FileNotFoundError. + """ + zf = zipfile.Path(add_dirs(build_alpharep_fixture())) + with self.assertRaises(FileNotFoundError): + zf.joinpath('z').open() + + @pass_alpharep + def test_read(self, alpharep): + root = zipfile.Path(alpharep) + a, b, g, j = root.iterdir() + assert a.read_text(encoding="utf-8") == "content of a" + # Also check positional encoding arg (gh-101144). + assert a.read_text("utf-8") == "content of a" + assert a.read_bytes() == b"content of a" + + @pass_alpharep + def test_joinpath(self, alpharep): + root = zipfile.Path(alpharep) + a = root.joinpath("a.txt") + assert a.is_file() + e = root.joinpath("b").joinpath("d").joinpath("e.txt") + assert e.read_text(encoding="utf-8") == "content of e" + + @pass_alpharep + def test_joinpath_multiple(self, alpharep): + root = zipfile.Path(alpharep) + e = root.joinpath("b", "d", "e.txt") + assert e.read_text(encoding="utf-8") == "content of e" + + @pass_alpharep + def test_traverse_truediv(self, alpharep): + root = zipfile.Path(alpharep) + a = root / "a.txt" + assert a.is_file() + e = root / "b" / "d" / "e.txt" + assert e.read_text(encoding="utf-8") == "content of e" + + @pass_alpharep + def test_pathlike_construction(self, alpharep): + """ + zipfile.Path should be constructable from a path-like object + """ + zipfile_ondisk = self.zipfile_ondisk(alpharep) + pathlike = pathlib.Path(str(zipfile_ondisk)) + zipfile.Path(pathlike) + + @pass_alpharep + def test_traverse_pathlike(self, alpharep): + root = zipfile.Path(alpharep) + root / pathlib.Path("a") + + @pass_alpharep + def test_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'a').parent.at == '' + assert (root / 'a' / 'b').parent.at == 'a/' + + @pass_alpharep + def test_dir_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'b').parent.at == '' + assert (root / 'b/').parent.at == '' + + @pass_alpharep + def test_missing_dir_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'missing dir/').parent.at == '' + + @pass_alpharep + def test_mutability(self, alpharep): + """ + If the underlying zipfile is changed, the Path object should + reflect that change. + """ + root = zipfile.Path(alpharep) + a, b, g, j = root.iterdir() + alpharep.writestr('foo.txt', 'foo') + alpharep.writestr('bar/baz.txt', 'baz') + assert any(child.name == 'foo.txt' for child in root.iterdir()) + assert (root / 'foo.txt').read_text(encoding="utf-8") == 'foo' + (baz,) = (root / 'bar').iterdir() + assert baz.read_text(encoding="utf-8") == 'baz' + + HUGE_ZIPFILE_NUM_ENTRIES = 2**13 + + def huge_zipfile(self): + """Create a read-only zipfile with a huge number of entries entries.""" + strm = io.BytesIO() + zf = zipfile.ZipFile(strm, "w") + for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): + zf.writestr(entry, entry) + zf.mode = 'r' + return zf + + def test_joinpath_constant_time(self): + """ + Ensure joinpath on items in zipfile is linear time. + """ + root = zipfile.Path(self.huge_zipfile()) + entries = jaraco.itertools.Counter(root.iterdir()) + for entry in entries: + entry.joinpath('suffix') + # Check the file iterated all items + assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES + + @pass_alpharep + def test_read_does_not_close(self, alpharep): + alpharep = self.zipfile_ondisk(alpharep) + with zipfile.ZipFile(alpharep) as file: + for rep in range(2): + zipfile.Path(file, 'a.txt').read_text(encoding="utf-8") + + @pass_alpharep + def test_subclass(self, alpharep): + class Subclass(zipfile.Path): + pass + + root = Subclass(alpharep) + assert isinstance(root / 'b', Subclass) + + @pass_alpharep + def test_filename(self, alpharep): + root = zipfile.Path(alpharep) + assert root.filename == pathlib.Path('alpharep.zip') + + @pass_alpharep + def test_root_name(self, alpharep): + """ + The name of the root should be the name of the zipfile + """ + root = zipfile.Path(alpharep) + assert root.name == 'alpharep.zip' == root.filename.name + + @pass_alpharep + def test_suffix(self, alpharep): + """ + The suffix of the root should be the suffix of the zipfile. + The suffix of each nested file is the final component's last suffix, if any. + Includes the leading period, just like pathlib.Path. + """ + root = zipfile.Path(alpharep) + assert root.suffix == '.zip' == root.filename.suffix + + b = root / "b.txt" + assert b.suffix == ".txt" + + c = root / "c" / "filename.tar.gz" + assert c.suffix == ".gz" + + d = root / "d" + assert d.suffix == "" + + @pass_alpharep + def test_suffixes(self, alpharep): + """ + The suffix of the root should be the suffix of the zipfile. + The suffix of each nested file is the final component's last suffix, if any. + Includes the leading period, just like pathlib.Path. + """ + root = zipfile.Path(alpharep) + assert root.suffixes == ['.zip'] == root.filename.suffixes + + b = root / 'b.txt' + assert b.suffixes == ['.txt'] + + c = root / 'c' / 'filename.tar.gz' + assert c.suffixes == ['.tar', '.gz'] + + d = root / 'd' + assert d.suffixes == [] + + e = root / '.hgrc' + assert e.suffixes == [] + + @pass_alpharep + def test_suffix_no_filename(self, alpharep): + alpharep.filename = None + root = zipfile.Path(alpharep) + assert root.joinpath('example').suffix == "" + assert root.joinpath('example').suffixes == [] + + @pass_alpharep + def test_stem(self, alpharep): + """ + The final path component, without its suffix + """ + root = zipfile.Path(alpharep) + assert root.stem == 'alpharep' == root.filename.stem + + b = root / "b.txt" + assert b.stem == "b" + + c = root / "c" / "filename.tar.gz" + assert c.stem == "filename.tar" + + d = root / "d" + assert d.stem == "d" + + assert (root / ".gitignore").stem == ".gitignore" + + @pass_alpharep + def test_root_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert root.parent == pathlib.Path('.') + root.root.filename = 'foo/bar.zip' + assert root.parent == pathlib.Path('foo') + + @pass_alpharep + def test_root_unnamed(self, alpharep): + """ + It is an error to attempt to get the name + or parent of an unnamed zipfile. + """ + alpharep.filename = None + root = zipfile.Path(alpharep) + with self.assertRaises(TypeError): + root.name + with self.assertRaises(TypeError): + root.parent + + # .name and .parent should still work on subs + sub = root / "b" + assert sub.name == "b" + assert sub.parent + + @pass_alpharep + def test_match_and_glob(self, alpharep): + root = zipfile.Path(alpharep) + assert not root.match("*.txt") + + assert list(root.glob("b/c.*")) == [zipfile.Path(alpharep, "b/c.txt")] + assert list(root.glob("b/*.txt")) == [ + zipfile.Path(alpharep, "b/c.txt"), + zipfile.Path(alpharep, "b/f.txt"), + ] + + @pass_alpharep + def test_glob_recursive(self, alpharep): + root = zipfile.Path(alpharep) + files = root.glob("**/*.txt") + assert all(each.match("*.txt") for each in files) + + assert list(root.glob("**/*.txt")) == list(root.rglob("*.txt")) + + @pass_alpharep + def test_glob_subdirs(self, alpharep): + root = zipfile.Path(alpharep) + + assert list(root.glob("*/i.txt")) == [] + assert list(root.rglob("*/i.txt")) == [zipfile.Path(alpharep, "g/h/i.txt")] + + @pass_alpharep + def test_glob_does_not_overmatch_dot(self, alpharep): + root = zipfile.Path(alpharep) + + assert list(root.glob("*.xt")) == [] + + @pass_alpharep + def test_glob_single_char(self, alpharep): + root = zipfile.Path(alpharep) + + assert list(root.glob("a?txt")) == [zipfile.Path(alpharep, "a.txt")] + assert list(root.glob("a[.]txt")) == [zipfile.Path(alpharep, "a.txt")] + assert list(root.glob("a[?]txt")) == [] + + @pass_alpharep + def test_glob_chars(self, alpharep): + root = zipfile.Path(alpharep) + + assert list(root.glob("j/?.b[ai][nz]")) == [ + zipfile.Path(alpharep, "j/k.bin"), + zipfile.Path(alpharep, "j/l.baz"), + ] + + def test_glob_empty(self): + root = zipfile.Path(zipfile.ZipFile(io.BytesIO(), 'w')) + with self.assertRaises(ValueError): + root.glob('') + + @pass_alpharep + def test_eq_hash(self, alpharep): + root = zipfile.Path(alpharep) + assert root == zipfile.Path(alpharep) + + assert root != (root / "a.txt") + assert (root / "a.txt") == (root / "a.txt") + + root = zipfile.Path(alpharep) + assert root in {root} + + @pass_alpharep + def test_is_symlink(self, alpharep): + """ + See python/cpython#82102 for symlink support beyond this object. + """ + + root = zipfile.Path(alpharep) + assert not root.is_symlink() + + @pass_alpharep + def test_relative_to(self, alpharep): + root = zipfile.Path(alpharep) + relative = root.joinpath("b", "c.txt").relative_to(root / "b") + assert str(relative) == "c.txt" + + relative = root.joinpath("b", "d", "e.txt").relative_to(root / "b") + assert str(relative) == "d/e.txt" + + @pass_alpharep + def test_inheritance(self, alpharep): + cls = type('PathChild', (zipfile.Path,), {}) + file = cls(alpharep).joinpath('some dir').parent + assert isinstance(file, cls) + + @parameterize( + ['alpharep', 'path_type', 'subpath'], + itertools.product( + alpharep_generators, + [str, pathlib.Path], + ['', 'b/'], + ), + ) + def test_pickle(self, alpharep, path_type, subpath): + zipfile_ondisk = path_type(self.zipfile_ondisk(alpharep)) + + saved_1 = pickle.dumps(zipfile.Path(zipfile_ondisk, at=subpath)) + restored_1 = pickle.loads(saved_1) + first, *rest = restored_1.iterdir() + assert first.read_text(encoding='utf-8').startswith('content of ') + + @pass_alpharep + def test_extract_orig_with_implied_dirs(self, alpharep): + """ + A zip file wrapped in a Path should extract even with implied dirs. + """ + source_path = self.zipfile_ondisk(alpharep) + zf = zipfile.ZipFile(source_path) + # wrap the zipfile for its side effect + zipfile.Path(zf) + zf.extractall(source_path.parent) + + @pass_alpharep + def test_getinfo_missing(self, alpharep): + """ + Validate behavior of getinfo on original zipfile after wrapping. + """ + zipfile.Path(alpharep) + with self.assertRaises(KeyError): + alpharep.getinfo('does-not-exist') diff --git a/Lib/test/test_zipfile/_path/write-alpharep.py b/Lib/test/test_zipfile/_path/write-alpharep.py new file mode 100644 index 0000000000..48c09b5371 --- /dev/null +++ b/Lib/test/test_zipfile/_path/write-alpharep.py @@ -0,0 +1,4 @@ +from . import test_path + + +__name__ == '__main__' and test_path.build_alpharep_fixture().extractall('alpharep') diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile/test_core.py similarity index 78% rename from Lib/test/test_zipfile.py rename to Lib/test/test_zipfile/test_core.py index 43178ca26b..97f21e05ba 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile/test_core.py @@ -4,9 +4,7 @@ import io import itertools import os -import pathlib import posixpath -import string import struct import subprocess import sys @@ -14,16 +12,20 @@ import unittest import unittest.mock as mock import zipfile -import functools from tempfile import TemporaryFile from random import randint, random, randbytes +from test import archiver_tests from test.support import script_helper -from test.support import (findfile, requires_zlib, requires_bz2, - requires_lzma, captured_stdout) -from test.support.os_helper import TESTFN, unlink, rmtree, temp_dir, temp_cwd +from test.support import ( + findfile, requires_zlib, requires_bz2, requires_lzma, + captured_stdout, captured_stderr, requires_subprocess +) +from test.support.os_helper import ( + TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count, FakePath +) TESTFN2 = TESTFN + "2" @@ -120,8 +122,9 @@ def zip_test(self, f, compression, compresslevel=None): self.assertEqual(info.filename, nm) self.assertEqual(info.file_size, len(self.data)) - # Check that testzip doesn't raise an exception - zipfp.testzip() + # Check that testzip thinks the archive is ok + # (it returns None if all contents could be read properly) + self.assertIsNone(zipfp.testzip()) # TODO: RUSTPYTHON @unittest.expectedFailure @@ -158,7 +161,7 @@ def test_open(self): self.zip_open_test(f, self.compression) def test_open_with_pathlike(self): - path = pathlib.Path(TESTFN2) + path = FakePath(TESTFN2) self.zip_open_test(path, self.compression) with zipfile.ZipFile(path, "r", self.compression) as zipfp: self.assertIsInstance(zipfp.filename, str) @@ -447,6 +450,27 @@ def write(self, data): self.assertEqual(zipfp.read('file1'), b'data1') self.assertEqual(zipfp.read('file2'), b'data2') + def test_zipextfile_attrs(self): + fname = "somefile.txt" + with zipfile.ZipFile(TESTFN2, mode="w") as zipfp: + zipfp.writestr(fname, "bogus") + + with zipfile.ZipFile(TESTFN2, mode="r") as zipfp: + with zipfp.open(fname) as fid: + self.assertEqual(fid.name, fname) + self.assertRaises(io.UnsupportedOperation, fid.fileno) + self.assertEqual(fid.mode, 'r') + self.assertIs(fid.readable(), True) + self.assertIs(fid.writable(), False) + self.assertIs(fid.seekable(), True) + self.assertIs(fid.closed, False) + self.assertIs(fid.closed, True) + self.assertEqual(fid.name, fname) + self.assertEqual(fid.mode, 'r') + self.assertRaises(io.UnsupportedOperation, fid.fileno) + self.assertRaises(ValueError, fid.readable) + self.assertIs(fid.writable(), False) + self.assertRaises(ValueError, fid.seekable) def tearDown(self): unlink(TESTFN) @@ -578,17 +602,16 @@ def test_write_default_name(self): def test_io_on_closed_zipextfile(self): fname = "somefile.txt" - with zipfile.ZipFile(TESTFN2, mode="w") as zipfp: + with zipfile.ZipFile(TESTFN2, mode="w", compression=self.compression) as zipfp: zipfp.writestr(fname, "bogus") with zipfile.ZipFile(TESTFN2, mode="r") as zipfp: with zipfp.open(fname) as fid: fid.close() + self.assertIs(fid.closed, True) self.assertRaises(ValueError, fid.read) self.assertRaises(ValueError, fid.seek, 0) self.assertRaises(ValueError, fid.tell) - self.assertRaises(ValueError, fid.readable) - self.assertRaises(ValueError, fid.seekable) def test_write_to_readonly(self): """Check that trying to call write() on a readonly ZipFile object @@ -748,8 +771,8 @@ def zip_test(self, f, compression): self.assertEqual(info.filename, nm) self.assertEqual(info.file_size, len(self.data)) - # Check that testzip doesn't raise an exception - zipfp.testzip() + # Check that testzip thinks the archive is valid + self.assertIsNone(zipfp.testzip()) # TODO: RUSTPYTHON @unittest.expectedFailure @@ -1083,6 +1106,159 @@ def test_generated_valid_zip64_extra(self): self.assertEqual(zinfo.header_offset, expected_header_offset) self.assertEqual(zf.read(zinfo), expected_content) + def test_force_zip64(self): + """Test that forcing zip64 extensions correctly notes this in the zip file""" + + # GH-103861 describes an issue where forcing a small file to use zip64 + # extensions would add a zip64 extra record, but not change the data + # sizes to 0xFFFFFFFF to indicate to the extractor that the zip64 + # record should be read. Additionally, it would not set the required + # version to indicate that zip64 extensions are required to extract it. + # This test replicates the situation and reads the raw data to specifically ensure: + # - The required extract version is always >= ZIP64_VERSION + # - The compressed and uncompressed size in the file headers are both + # 0xFFFFFFFF (ie. point to zip64 record) + # - The zip64 record is provided and has the correct sizes in it + # Other aspects of the zip are checked as well, but verifying the above is the main goal. + # Because this is hard to verify by parsing the data as a zip, the raw + # bytes are checked to ensure that they line up with the zip spec. + # The spec for this can be found at: https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + # The relevent sections for this test are: + # - 4.3.7 for local file header + # - 4.5.3 for zip64 extra field + + data = io.BytesIO() + with zipfile.ZipFile(data, mode="w", allowZip64=True) as zf: + with zf.open("text.txt", mode="w", force_zip64=True) as zi: + zi.write(b"_") + + zipdata = data.getvalue() + + # pull out and check zip information + ( + header, vers, os, flags, comp, csize, usize, fn_len, + ex_total_len, filename, ex_id, ex_len, ex_usize, ex_csize, cd_sig + ) = struct.unpack("<4sBBHH8xIIHH8shhQQx4s", zipdata[:63]) + + self.assertEqual(header, b"PK\x03\x04") # local file header + self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract + self.assertEqual(os, 0) # compatible with MS-DOS + self.assertEqual(flags, 0) # no flags + self.assertEqual(comp, 0) # compression method = stored + self.assertEqual(csize, 0xFFFFFFFF) # sizes are in zip64 extra + self.assertEqual(usize, 0xFFFFFFFF) + self.assertEqual(fn_len, 8) # filename len + self.assertEqual(ex_total_len, 20) # size of extra records + self.assertEqual(ex_id, 1) # Zip64 extra record + self.assertEqual(ex_len, 16) # 16 bytes of data + self.assertEqual(ex_usize, 1) # uncompressed size + self.assertEqual(ex_csize, 1) # compressed size + self.assertEqual(cd_sig, b"PK\x01\x02") # ensure the central directory header is next + + z = zipfile.ZipFile(io.BytesIO(zipdata)) + zinfos = z.infolist() + self.assertEqual(len(zinfos), 1) + self.assertGreaterEqual(zinfos[0].extract_version, zipfile.ZIP64_VERSION) # requires zip64 to extract + + def test_unseekable_zip_unknown_filesize(self): + """Test that creating a zip with/without seeking will raise a RuntimeError if zip64 was required but not used""" + + def make_zip(fp): + with zipfile.ZipFile(fp, mode="w", allowZip64=True) as zf: + with zf.open("text.txt", mode="w", force_zip64=False) as zi: + zi.write(b"_" * (zipfile.ZIP64_LIMIT + 1)) + + self.assertRaises(RuntimeError, make_zip, io.BytesIO()) + self.assertRaises(RuntimeError, make_zip, Unseekable(io.BytesIO())) + + def test_zip64_required_not_allowed_fail(self): + """Test that trying to add a large file to a zip that doesn't allow zip64 extensions fails on add""" + def make_zip(fp): + with zipfile.ZipFile(fp, mode="w", allowZip64=False) as zf: + # pretend zipfile.ZipInfo.from_file was used to get the name and filesize + info = zipfile.ZipInfo("text.txt") + info.file_size = zipfile.ZIP64_LIMIT + 1 + zf.open(info, mode="w") + + self.assertRaises(zipfile.LargeZipFile, make_zip, io.BytesIO()) + self.assertRaises(zipfile.LargeZipFile, make_zip, Unseekable(io.BytesIO())) + + def test_unseekable_zip_known_filesize(self): + """Test that creating a zip without seeking will use zip64 extensions if the file size is provided up-front""" + + # This test ensures that the zip will use a zip64 data descriptor (same + # as a regular data descriptor except the sizes are 8 bytes instead of + # 4) record to communicate the size of a file if the zip is being + # written to an unseekable stream. + # Because this sort of thing is hard to verify by parsing the data back + # in as a zip, this test looks at the raw bytes created to ensure that + # the correct data has been generated. + # The spec for this can be found at: https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + # The relevent sections for this test are: + # - 4.3.7 for local file header + # - 4.3.9 for the data descriptor + # - 4.5.3 for zip64 extra field + + file_size = zipfile.ZIP64_LIMIT + 1 + + def make_zip(fp): + with zipfile.ZipFile(fp, mode="w", allowZip64=True) as zf: + # pretend zipfile.ZipInfo.from_file was used to get the name and filesize + info = zipfile.ZipInfo("text.txt") + info.file_size = file_size + with zf.open(info, mode="w", force_zip64=False) as zi: + zi.write(b"_" * file_size) + return fp + + # check seekable file information + seekable_data = make_zip(io.BytesIO()).getvalue() + ( + header, vers, os, flags, comp, csize, usize, fn_len, + ex_total_len, filename, ex_id, ex_len, ex_usize, ex_csize, + cd_sig + ) = struct.unpack("<4sBBHH8xIIHH8shhQQ{}x4s".format(file_size), seekable_data[:62 + file_size]) + + self.assertEqual(header, b"PK\x03\x04") # local file header + self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract + self.assertEqual(os, 0) # compatible with MS-DOS + self.assertEqual(flags, 0) # no flags set + self.assertEqual(comp, 0) # compression method = stored + self.assertEqual(csize, 0xFFFFFFFF) # sizes are in zip64 extra + self.assertEqual(usize, 0xFFFFFFFF) + self.assertEqual(fn_len, 8) # filename len + self.assertEqual(ex_total_len, 20) # size of extra records + self.assertEqual(ex_id, 1) # Zip64 extra record + self.assertEqual(ex_len, 16) # 16 bytes of data + self.assertEqual(ex_usize, file_size) # uncompressed size + self.assertEqual(ex_csize, file_size) # compressed size + self.assertEqual(cd_sig, b"PK\x01\x02") # ensure the central directory header is next + + # check unseekable file information + unseekable_data = make_zip(Unseekable(io.BytesIO())).fp.getvalue() + ( + header, vers, os, flags, comp, csize, usize, fn_len, + ex_total_len, filename, ex_id, ex_len, ex_usize, ex_csize, + dd_header, dd_usize, dd_csize, cd_sig + ) = struct.unpack("<4sBBHH8xIIHH8shhQQ{}x4s4xQQ4s".format(file_size), unseekable_data[:86 + file_size]) + + self.assertEqual(header, b"PK\x03\x04") # local file header + self.assertGreaterEqual(vers, zipfile.ZIP64_VERSION) # requires zip64 to extract + self.assertEqual(os, 0) # compatible with MS-DOS + self.assertEqual("{:b}".format(flags), "1000") # streaming flag set + self.assertEqual(comp, 0) # compression method = stored + self.assertEqual(csize, 0xFFFFFFFF) # sizes are in zip64 extra + self.assertEqual(usize, 0xFFFFFFFF) + self.assertEqual(fn_len, 8) # filename len + self.assertEqual(ex_total_len, 20) # size of extra records + self.assertEqual(ex_id, 1) # Zip64 extra record + self.assertEqual(ex_len, 16) # 16 bytes of data + self.assertEqual(ex_usize, 0) # uncompressed size - 0 to defer to data descriptor + self.assertEqual(ex_csize, 0) # compressed size - 0 to defer to data descriptor + self.assertEqual(dd_header, b"PK\07\x08") # data descriptor + self.assertEqual(dd_usize, file_size) # file size (8 bytes because zip64) + self.assertEqual(dd_csize, file_size) # compressed size (8 bytes because zip64) + self.assertEqual(cd_sig, b"PK\x01\x02") # ensure the central directory header is next + @requires_zlib() class DeflateTestZip64InSmallFiles(AbstractTestZip64InSmallFiles, @@ -1134,6 +1310,21 @@ def test_issue44439(self): self.assertEqual(data.write(q), LENGTH) self.assertEqual(zip.getinfo('data').file_size, LENGTH) + def test_zipwritefile_attrs(self): + fname = "somefile.txt" + with zipfile.ZipFile(TESTFN2, mode="w", compression=self.compression) as zipfp: + with zipfp.open(fname, 'w') as fid: + self.assertRaises(io.UnsupportedOperation, fid.fileno) + self.assertIs(fid.readable(), False) + self.assertIs(fid.writable(), True) + self.assertIs(fid.seekable(), False) + self.assertIs(fid.closed, False) + self.assertIs(fid.closed, True) + self.assertRaises(io.UnsupportedOperation, fid.fileno) + self.assertIs(fid.readable(), False) + self.assertIs(fid.writable(), True) + self.assertIs(fid.seekable(), False) + class StoredWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_STORED @@ -1344,7 +1535,7 @@ def test_write_pathlike(self): fp.write("print(42)\n") with TemporaryFile() as t, zipfile.PyZipFile(t, "w") as zipfp: - zipfp.writepy(pathlib.Path(TESTFN2) / "mod1.py") + zipfp.writepy(FakePath(os.path.join(TESTFN2, "mod1.py"))) names = zipfp.namelist() self.assertCompiledIn('mod1.py', names) finally: @@ -1402,7 +1593,7 @@ def test_extract_with_target(self): def test_extract_with_target_pathlike(self): with temp_dir() as extdir: - self._test_extract_with_target(pathlib.Path(extdir)) + self._test_extract_with_target(FakePath(extdir)) def test_extract_all(self): with temp_cwd(): @@ -1437,7 +1628,7 @@ def test_extract_all_with_target(self): def test_extract_all_with_target_pathlike(self): with temp_dir() as extdir: - self._test_extract_all_with_target(pathlib.Path(extdir)) + self._test_extract_all_with_target(FakePath(extdir)) def check_file(self, filename, content): self.assertTrue(os.path.isfile(filename)) @@ -1450,6 +1641,8 @@ def test_sanitize_windows_name(self): self.assertEqual(san(r',,?,C:,foo,bar/z', ','), r'_,C_,foo,bar/z') self.assertEqual(san(r'a\b,ce|f"g?h*i', ','), r'a\b,c_d_e_f_g_h_i') self.assertEqual(san('../../foo../../ba..r', '/'), r'foo/ba..r') + self.assertEqual(san(' / /foo / /ba r', '/'), r'foo/ba r') + self.assertEqual(san(' . /. /foo ./ . /. ./ba .r', '/'), r'foo/ba .r') def test_extract_hackers_arcnames_common_cases(self): common_hacknames = [ @@ -1480,10 +1673,10 @@ def test_extract_hackers_arcnames_windows_only(self): (r'C:\foo\bar', 'foo/bar'), (r'//conky/mountpoint/foo/bar', 'foo/bar'), (r'\\conky\mountpoint\foo\bar', 'foo/bar'), - (r'///conky/mountpoint/foo/bar', 'conky/mountpoint/foo/bar'), - (r'\\\conky\mountpoint\foo\bar', 'conky/mountpoint/foo/bar'), - (r'//conky//mountpoint/foo/bar', 'conky/mountpoint/foo/bar'), - (r'\\conky\\mountpoint\foo\bar', 'conky/mountpoint/foo/bar'), + (r'///conky/mountpoint/foo/bar', 'mountpoint/foo/bar'), + (r'\\\conky\mountpoint\foo\bar', 'mountpoint/foo/bar'), + (r'//conky//mountpoint/foo/bar', 'mountpoint/foo/bar'), + (r'\\conky\\mountpoint\foo\bar', 'mountpoint/foo/bar'), (r'//?/C:/foo/bar', 'foo/bar'), (r'\\?\C:\foo\bar', 'foo/bar'), (r'C:/../C:/foo/bar', 'C_/foo/bar'), @@ -1545,6 +1738,33 @@ def _test_extract_hackers_arcnames(self, hacknames): unlink(TESTFN2) +class OverwriteTests(archiver_tests.OverwriteTests, unittest.TestCase): + testdir = TESTFN + + @classmethod + def setUpClass(cls): + p = cls.ar_with_file = TESTFN + '-with-file.zip' + cls.addClassCleanup(unlink, p) + with zipfile.ZipFile(p, 'w') as zipfp: + zipfp.writestr('test', b'newcontent') + + p = cls.ar_with_dir = TESTFN + '-with-dir.zip' + cls.addClassCleanup(unlink, p) + with zipfile.ZipFile(p, 'w') as zipfp: + zipfp.mkdir('test') + + p = cls.ar_with_implicit_dir = TESTFN + '-with-implicit-dir.zip' + cls.addClassCleanup(unlink, p) + with zipfile.ZipFile(p, 'w') as zipfp: + zipfp.writestr('test/file', b'newcontent') + + def open(self, path): + return zipfile.ZipFile(path, 'r') + + def extractall(self, ar): + ar.extractall(self.testdir) + + class OtherTests(unittest.TestCase): def test_open_via_zip_info(self): # Create the ZIP archive @@ -1570,7 +1790,7 @@ def test_writestr_extended_local_header_issue1202(self): with zipfile.ZipFile(TESTFN2, 'w') as orig_zip: for data in 'abcdefghijklmnop': zinfo = zipfile.ZipInfo(data) - zinfo.flag_bits |= 0x08 # Include an extended local header. + zinfo.flag_bits |= zipfile._MASK_USE_DATA_DESCRIPTOR # Include an extended local header. orig_zip.writestr(zinfo, data) def test_close(self): @@ -1627,6 +1847,44 @@ def test_write_unicode_filenames(self): self.assertEqual(zf.filelist[0].filename, "foo.txt") self.assertEqual(zf.filelist[1].filename, "\xf6.txt") + def create_zipfile_with_extra_data(self, filename, extra_data_name): + with zipfile.ZipFile(TESTFN, mode='w') as zf: + filename_encoded = filename.encode("utf-8") + # create a ZipInfo object with Unicode path extra field + zip_info = zipfile.ZipInfo(filename) + + tag_for_unicode_path = b'\x75\x70' + version_of_unicode_path = b'\x01' + + import zlib + filename_crc = struct.pack('= 0: + filename = filename[0:null_byte] + # This is used to ensure paths in generated ZIP files always use + # forward slashes as the directory separator, as required by the + # ZIP format specification. + if os.sep != "/" and os.sep in filename: + filename = filename.replace(os.sep, "/") + if os.altsep and os.altsep != "/" and os.altsep in filename: + filename = filename.replace(os.altsep, "/") + return filename + class ZipInfo (object): """Class with attributes describing each file in the ZIP archive.""" @@ -348,21 +381,15 @@ class ZipInfo (object): 'compress_size', 'file_size', '_raw_time', + '_end_offset', ) def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.orig_filename = filename # Original file name in archive - # Terminate the file name at the first null byte. Null bytes in file - # names are used as tricks by viruses in archives. - null_byte = filename.find(chr(0)) - if null_byte >= 0: - filename = filename[0:null_byte] - # This is used to ensure paths in generated ZIP files always use - # forward slashes as the directory separator, as required by the - # ZIP format specification. - if os.sep != "/" and os.sep in filename: - filename = filename.replace(os.sep, "/") + # Terminate the file name at the first null byte and + # ensure paths always use forward slashes as the directory separator. + filename = _sanitize_filename(filename) self.filename = filename # Normalized file name self.date_time = date_time # year, month, day, hour, min, sec @@ -389,6 +416,7 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.external_attr = 0 # External file attributes self.compress_size = 0 # Size of the compressed file self.file_size = 0 # Size of the uncompressed file + self._end_offset = None # Start of the next local header or central directory # Other attributes are set by class ZipFile: # header_offset Byte offset to the file header # CRC CRC-32 of the uncompressed file @@ -416,11 +444,16 @@ def __repr__(self): return ''.join(result) def FileHeader(self, zip64=None): - """Return the per-file header as a bytes object.""" + """Return the per-file header as a bytes object. + + When the optional zip64 arg is None rather than a bool, we will + decide based upon the file_size and compress_size, if known, + False otherwise. + """ dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - if self.flag_bits & 0x08: + if self.flag_bits & _MASK_USE_DATA_DESCRIPTOR: # Set these to zero because we write them after the file data CRC = compress_size = file_size = 0 else: @@ -432,16 +465,13 @@ def FileHeader(self, zip64=None): min_version = 0 if zip64 is None: + # We always explicitly pass zip64 within this module.... This + # remains for anyone using ZipInfo.FileHeader as a public API. zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT if zip64: fmt = ' ZIP64_LIMIT or compress_size > ZIP64_LIMIT: - if not zip64: - raise LargeZipFile("Filesize would require ZIP64 extensions") - # File is larger than what fits into a 4 byte integer, - # fall back to the ZIP64 extension file_size = 0xffffffff compress_size = 0xffffffff min_version = ZIP64_VERSION @@ -465,9 +495,9 @@ def _encodeFilenameFlags(self): try: return self.filename.encode('ascii'), self.flag_bits except UnicodeEncodeError: - return self.filename.encode('utf-8'), self.flag_bits | 0x800 + return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME - def _decodeExtra(self): + def _decodeExtra(self, filename_crc): # Try to decode the extra field. extra = self.extra unpack = struct.unpack @@ -493,6 +523,22 @@ def _decodeExtra(self): except struct.error: raise BadZipFile(f"Corrupt zip64 extra field. " f"{field} not found.") from None + elif tp == 0x7075: + data = extra[4:ln+4] + # Unicode Path Extra Field + try: + up_version, up_name_crc = unpack('> 8) & 0xff else: @@ -1052,17 +1107,17 @@ def seekable(self): raise ValueError("I/O operation on closed file.") return self._seekable - def seek(self, offset, whence=0): + def seek(self, offset, whence=os.SEEK_SET): if self.closed: raise ValueError("seek on closed file.") if not self._seekable: raise io.UnsupportedOperation("underlying stream is not seekable") curr_pos = self.tell() - if whence == 0: # Seek from start of file + if whence == os.SEEK_SET: new_pos = offset - elif whence == 1: # Seek from current position + elif whence == os.SEEK_CUR: new_pos = curr_pos + offset - elif whence == 2: # Seek from EOF + elif whence == os.SEEK_END: new_pos = self._orig_file_size + offset else: raise ValueError("whence must be os.SEEK_SET (0), " @@ -1081,10 +1136,23 @@ def seek(self, offset, whence=0): # Just move the _offset index if the new position is in the _readbuffer self._offset = buff_offset read_offset = 0 + # Fast seek uncompressed unencrypted file + elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0: + # disable CRC checking after first seeking - it would be invalid + self._expected_crc = None + # seek actual file taking already buffered data into account + read_offset -= len(self._readbuffer) - self._offset + self._fileobj.seek(read_offset, os.SEEK_CUR) + self._left -= read_offset + read_offset = 0 + # flush read buffer + self._readbuffer = b'' + self._offset = 0 elif read_offset < 0: # Position is before the current position. Reset the ZipExtFile self._fileobj.seek(self._orig_compress_start) self._running_crc = self._orig_start_crc + self._expected_crc = self._orig_crc self._compress_left = self._orig_compress_size self._left = self._orig_file_size self._readbuffer = b'' @@ -1164,21 +1232,20 @@ def close(self): self._zinfo.CRC = self._crc self._zinfo.file_size = self._file_size + if not self._zip64: + if self._file_size > ZIP64_LIMIT: + raise RuntimeError("File size too large, try using force_zip64") + if self._compress_size > ZIP64_LIMIT: + raise RuntimeError("Compressed size too large, try using force_zip64") + # Write updated header info - if self._zinfo.flag_bits & 0x08: + if self._zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: # Write CRC and file sizes after the file data fmt = ' ZIP64_LIMIT: - raise RuntimeError( - 'File size unexpectedly exceeded ZIP64 limit') - if self._compress_size > ZIP64_LIMIT: - raise RuntimeError( - 'Compressed size unexpectedly exceeded ZIP64 limit') # Seek backwards and write file header (which will now include # correct CRC and file sizes) @@ -1223,7 +1290,7 @@ class ZipFile: _windows_illegal_name_trans_table = None def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, - compresslevel=None, *, strict_timestamps=True): + compresslevel=None, *, strict_timestamps=True, metadata_encoding=None): """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', or append 'a'.""" if mode not in ('r', 'w', 'x', 'a'): @@ -1242,6 +1309,12 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self.pwd = None self._comment = b'' self._strict_timestamps = strict_timestamps + self.metadata_encoding = metadata_encoding + + # Check that we don't try to write with nonconforming codecs + if self.metadata_encoding and mode != 'r': + raise ValueError( + "metadata_encoding is only supported for reading files") # Check if we were passed a file-like object if isinstance(file, os.PathLike): @@ -1374,13 +1447,14 @@ def _RealGetContents(self): if self.debug > 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) - flags = centdir[5] - if flags & 0x800: + orig_filename_crc = crc32(filename) + flags = centdir[_CD_FLAG_BITS] + if flags & _MASK_UTF_FILENAME: # UTF-8 file names extension filename = filename.decode('utf-8') else: # Historical ZIP filename encoding - filename = filename.decode('cp437') + filename = filename.decode(self.metadata_encoding or 'cp437') # Create ZipInfo instance to store file information x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) @@ -1397,8 +1471,7 @@ def _RealGetContents(self): x._raw_time = t x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) - - x._decodeExtra() + x._decodeExtra(orig_filename_crc) x.header_offset = x.header_offset + concat self.filelist.append(x) self.NameToInfo[x.filename] = x @@ -1411,6 +1484,12 @@ def _RealGetContents(self): if self.debug > 2: print("total", total) + end_offset = self.start_dir + for zinfo in sorted(self.filelist, + key=lambda zinfo: zinfo.header_offset, + reverse=True): + zinfo._end_offset = end_offset + end_offset = zinfo.header_offset def namelist(self): """Return a list of file names in the archive.""" @@ -1431,7 +1510,10 @@ def printdir(self, file=None): file=file) def testzip(self): - """Read all the files and check the CRC.""" + """Read all the files and check the CRC. + + Return None if all files could be read successfully, or the name + of the offending file otherwise.""" chunk_size = 2 ** 20 for zinfo in self.filelist: try: @@ -1480,7 +1562,8 @@ def comment(self, comment): self._didModify = True def read(self, name, pwd=None): - """Return file bytes for name.""" + """Return file bytes for name. 'pwd' is the password to decrypt + encrypted files.""" with self.open(name, "r", pwd) as fp: return fp.read() @@ -1502,8 +1585,6 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): """ if mode not in {"r", "w"}: raise ValueError('open() requires mode "r" or "w"') - if pwd and not isinstance(pwd, bytes): - raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) if pwd and (mode == "w"): raise ValueError("pwd is only supported for reading files") if not self.fp: @@ -1545,32 +1626,38 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) if fheader[_FH_EXTRA_FIELD_LENGTH]: - zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + zef_file.seek(fheader[_FH_EXTRA_FIELD_LENGTH], whence=1) - if zinfo.flag_bits & 0x20: + if zinfo.flag_bits & _MASK_COMPRESSED_PATCH: # Zip 2.7: compressed patched data raise NotImplementedError("compressed patched data (flag bit 5)") - if zinfo.flag_bits & 0x40: + if zinfo.flag_bits & _MASK_STRONG_ENCRYPTION: # strong encryption raise NotImplementedError("strong encryption (flag bit 6)") - if fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] & 0x800: + if fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] & _MASK_UTF_FILENAME: # UTF-8 filename fname_str = fname.decode("utf-8") else: - fname_str = fname.decode("cp437") + fname_str = fname.decode(self.metadata_encoding or "cp437") if fname_str != zinfo.orig_filename: raise BadZipFile( 'File name in directory %r and header %r differ.' % (zinfo.orig_filename, fname)) + if (zinfo._end_offset is not None and + zef_file.tell() + zinfo.compress_size > zinfo._end_offset): + raise BadZipFile(f"Overlapped entries: {zinfo.orig_filename!r} (possible zip bomb)") + # check for encrypted flag & handle password - is_encrypted = zinfo.flag_bits & 0x1 + is_encrypted = zinfo.flag_bits & _MASK_ENCRYPTED if is_encrypted: if not pwd: pwd = self.pwd + if pwd and not isinstance(pwd, bytes): + raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) if not pwd: raise RuntimeError("File %r is encrypted, password " "required for extraction" % name) @@ -1600,16 +1687,17 @@ def _open_to_write(self, zinfo, force_zip64=False): zinfo.flag_bits = 0x00 if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 + zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 if not self._seekable: - zinfo.flag_bits |= 0x08 + zinfo.flag_bits |= _MASK_USE_DATA_DESCRIPTOR if not zinfo.external_attr: zinfo.external_attr = 0o600 << 16 # permissions: ?rw------- # Compressed size can be larger than uncompressed size - zip64 = self._allowZip64 and \ - (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT) + zip64 = force_zip64 or (zinfo.file_size * 1.05 > ZIP64_LIMIT) + if not self._allowZip64 and zip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") if self._seekable: self.fp.seek(self.start_dir) @@ -1627,7 +1715,8 @@ def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, using its full name. Its file information is extracted as accurately as possible. `member' may be a filename or a ZipInfo object. You can - specify a different directory using `path'. + specify a different directory using `path'. You can specify the + password to decrypt the file using 'pwd'. """ if path is None: path = os.getcwd() @@ -1640,7 +1729,8 @@ def extractall(self, path=None, members=None, pwd=None): """Extract all members from the archive to the current working directory. `path' specifies a different directory to extract to. `members' is optional and must be a subset of the list returned - by namelist(). + by namelist(). You can specify the password to decrypt all files + using 'pwd'. """ if members is None: members = self.namelist() @@ -1662,8 +1752,8 @@ def _sanitize_windows_name(cls, arcname, pathsep): table = str.maketrans(illegal, '_' * len(illegal)) cls._windows_illegal_name_trans_table = table arcname = arcname.translate(table) - # remove trailing dots - arcname = (x.rstrip('.') for x in arcname.split(pathsep)) + # remove trailing dots and spaces + arcname = (x.rstrip(' .') for x in arcname.split(pathsep)) # rejoin, removing empty parts. arcname = pathsep.join(x for x in arcname if x) return arcname @@ -1691,6 +1781,9 @@ def _extract_member(self, member, targetpath, pwd): # filter illegal characters on Windows arcname = self._sanitize_windows_name(arcname, os.path.sep) + if not arcname and not member.is_dir(): + raise ValueError("Empty filename.") + targetpath = os.path.join(targetpath, arcname) targetpath = os.path.normpath(targetpath) @@ -1751,6 +1844,7 @@ def write(self, filename, arcname=None, if zinfo.is_dir(): zinfo.compress_size = 0 zinfo.CRC = 0 + self.mkdir(zinfo) else: if compress_type is not None: zinfo.compress_type = compress_type @@ -1762,23 +1856,6 @@ def write(self, filename, arcname=None, else: zinfo._compresslevel = self.compresslevel - if zinfo.is_dir(): - with self._lock: - if self._seekable: - self.fp.seek(self.start_dir) - zinfo.header_offset = self.fp.tell() # Start of header bytes - if zinfo.compress_type == ZIP_LZMA: - # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 - - self._writecheck(zinfo) - self._didModify = True - - self.filelist.append(zinfo) - self.NameToInfo[zinfo.filename] = zinfo - self.fp.write(zinfo.FileHeader(False)) - self.start_dir = self.fp.tell() - else: with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: shutil.copyfileobj(src, dest, 1024*8) @@ -1796,7 +1873,7 @@ def writestr(self, zinfo_or_arcname, data, date_time=time.localtime(time.time())[:6]) zinfo.compress_type = self.compression zinfo._compresslevel = self.compresslevel - if zinfo.filename[-1] == '/': + if zinfo.filename.endswith('/'): zinfo.external_attr = 0o40775 << 16 # drwxrwxr-x zinfo.external_attr |= 0x10 # MS-DOS directory flag else: @@ -1823,6 +1900,41 @@ def writestr(self, zinfo_or_arcname, data, with self.open(zinfo, mode='w') as dest: dest.write(data) + def mkdir(self, zinfo_or_directory_name, mode=511): + """Creates a directory inside the zip archive.""" + if isinstance(zinfo_or_directory_name, ZipInfo): + zinfo = zinfo_or_directory_name + if not zinfo.is_dir(): + raise ValueError("The given ZipInfo does not describe a directory") + elif isinstance(zinfo_or_directory_name, str): + directory_name = zinfo_or_directory_name + if not directory_name.endswith("/"): + directory_name += "/" + zinfo = ZipInfo(directory_name) + zinfo.compress_size = 0 + zinfo.CRC = 0 + zinfo.external_attr = ((0o40000 | mode) & 0xFFFF) << 16 + zinfo.file_size = 0 + zinfo.external_attr |= 0x10 + else: + raise TypeError("Expected type str or ZipInfo") + + with self._lock: + if self._seekable: + self.fp.seek(self.start_dir) + zinfo.header_offset = self.fp.tell() # Start of header bytes + if zinfo.compress_type == ZIP_LZMA: + # Compressed data includes an end-of-stream (EOS) marker + zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 + + self._writecheck(zinfo) + self._didModify = True + + self.filelist.append(zinfo) + self.NameToInfo[zinfo.filename] = zinfo + self.fp.write(zinfo.FileHeader(False)) + self.start_dir = self.fp.tell() + def __del__(self): """Call the "close()" method in case the user forgot.""" self.close() @@ -2124,300 +2236,6 @@ def _compile(file, optimize=-1): return (fname, archivename) -def _parents(path): - """ - Given a path with elements separated by - posixpath.sep, generate all parents of that path. - - >>> list(_parents('b/d')) - ['b'] - >>> list(_parents('/b/d/')) - ['/b'] - >>> list(_parents('b/d/f/')) - ['b/d', 'b'] - >>> list(_parents('b')) - [] - >>> list(_parents('')) - [] - """ - return itertools.islice(_ancestry(path), 1, None) - - -def _ancestry(path): - """ - Given a path with elements separated by - posixpath.sep, generate all elements of that path - - >>> list(_ancestry('b/d')) - ['b/d', 'b'] - >>> list(_ancestry('/b/d/')) - ['/b/d', '/b'] - >>> list(_ancestry('b/d/f/')) - ['b/d/f', 'b/d', 'b'] - >>> list(_ancestry('b')) - ['b'] - >>> list(_ancestry('')) - [] - """ - path = path.rstrip(posixpath.sep) - while path and path != posixpath.sep: - yield path - path, tail = posixpath.split(path) - - -_dedupe = dict.fromkeys -"""Deduplicate an iterable in original order""" - - -def _difference(minuend, subtrahend): - """ - Return items in minuend not in subtrahend, retaining order - with O(1) lookup. - """ - return itertools.filterfalse(set(subtrahend).__contains__, minuend) - - -class CompleteDirs(ZipFile): - """ - A ZipFile subclass that ensures that implied directories - are always included in the namelist. - """ - - @staticmethod - def _implied_dirs(names): - parents = itertools.chain.from_iterable(map(_parents, names)) - as_dirs = (p + posixpath.sep for p in parents) - return _dedupe(_difference(as_dirs, names)) - - def namelist(self): - names = super(CompleteDirs, self).namelist() - return names + list(self._implied_dirs(names)) - - def _name_set(self): - return set(self.namelist()) - - def resolve_dir(self, name): - """ - If the name represents a directory, return that name - as a directory (with the trailing slash). - """ - names = self._name_set() - dirname = name + '/' - dir_match = name not in names and dirname in names - return dirname if dir_match else name - - @classmethod - def make(cls, source): - """ - Given a source (filename or zipfile), return an - appropriate CompleteDirs subclass. - """ - if isinstance(source, CompleteDirs): - return source - - if not isinstance(source, ZipFile): - return cls(source) - - # Only allow for FastLookup when supplied zipfile is read-only - if 'r' not in source.mode: - cls = CompleteDirs - - source.__class__ = cls - return source - - -class FastLookup(CompleteDirs): - """ - ZipFile subclass to ensure implicit - dirs exist and are resolved rapidly. - """ - - def namelist(self): - with contextlib.suppress(AttributeError): - return self.__names - self.__names = super(FastLookup, self).namelist() - return self.__names - - def _name_set(self): - with contextlib.suppress(AttributeError): - return self.__lookup - self.__lookup = super(FastLookup, self)._name_set() - return self.__lookup - - -class Path: - """ - A pathlib-compatible interface for zip files. - - Consider a zip file with this structure:: - - . - ├── a.txt - └── b - ├── c.txt - └── d - └── e.txt - - >>> data = io.BytesIO() - >>> zf = ZipFile(data, 'w') - >>> zf.writestr('a.txt', 'content of a') - >>> zf.writestr('b/c.txt', 'content of c') - >>> zf.writestr('b/d/e.txt', 'content of e') - >>> zf.filename = 'mem/abcde.zip' - - Path accepts the zipfile object itself or a filename - - >>> root = Path(zf) - - From there, several path operations are available. - - Directory iteration (including the zip file itself): - - >>> a, b = root.iterdir() - >>> a - Path('mem/abcde.zip', 'a.txt') - >>> b - Path('mem/abcde.zip', 'b/') - - name property: - - >>> b.name - 'b' - - join with divide operator: - - >>> c = b / 'c.txt' - >>> c - Path('mem/abcde.zip', 'b/c.txt') - >>> c.name - 'c.txt' - - Read text: - - >>> c.read_text() - 'content of c' - - existence: - - >>> c.exists() - True - >>> (b / 'missing.txt').exists() - False - - Coercion to string: - - >>> import os - >>> str(c).replace(os.sep, posixpath.sep) - 'mem/abcde.zip/b/c.txt' - - At the root, ``name``, ``filename``, and ``parent`` - resolve to the zipfile. Note these attributes are not - valid and will raise a ``ValueError`` if the zipfile - has no filename. - - >>> root.name - 'abcde.zip' - >>> str(root.filename).replace(os.sep, posixpath.sep) - 'mem/abcde.zip' - >>> str(root.parent) - 'mem' - """ - - __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" - - def __init__(self, root, at=""): - """ - Construct a Path from a ZipFile or filename. - - Note: When the source is an existing ZipFile object, - its type (__class__) will be mutated to a - specialized type. If the caller wishes to retain the - original type, the caller should either create a - separate ZipFile object or pass a filename. - """ - self.root = FastLookup.make(root) - self.at = at - - def open(self, mode='r', *args, pwd=None, **kwargs): - """ - Open this entry as text or binary following the semantics - of ``pathlib.Path.open()`` by passing arguments through - to io.TextIOWrapper(). - """ - if self.is_dir(): - raise IsADirectoryError(self) - zip_mode = mode[0] - if not self.exists() and zip_mode == 'r': - raise FileNotFoundError(self) - stream = self.root.open(self.at, zip_mode, pwd=pwd) - if 'b' in mode: - if args or kwargs: - raise ValueError("encoding args invalid for binary operation") - return stream - else: - kwargs["encoding"] = io.text_encoding(kwargs.get("encoding")) - return io.TextIOWrapper(stream, *args, **kwargs) - - @property - def name(self): - return pathlib.Path(self.at).name or self.filename.name - - @property - def filename(self): - return pathlib.Path(self.root.filename).joinpath(self.at) - - def read_text(self, *args, **kwargs): - kwargs["encoding"] = io.text_encoding(kwargs.get("encoding")) - with self.open('r', *args, **kwargs) as strm: - return strm.read() - - def read_bytes(self): - with self.open('rb') as strm: - return strm.read() - - def _is_child(self, path): - return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") - - def _next(self, at): - return self.__class__(self.root, at) - - def is_dir(self): - return not self.at or self.at.endswith("/") - - def is_file(self): - return self.exists() and not self.is_dir() - - def exists(self): - return self.at in self.root._name_set() - - def iterdir(self): - if not self.is_dir(): - raise ValueError("Can't listdir a file") - subs = map(self._next, self.root.namelist()) - return filter(self._is_child, subs) - - def __str__(self): - return posixpath.join(self.root.filename, self.at) - - def __repr__(self): - return self.__repr.format(self=self) - - def joinpath(self, *other): - next = posixpath.join(self.at, *other) - return self._next(self.root.resolve_dir(next)) - - __truediv__ = joinpath - - @property - def parent(self): - if not self.at: - return self.filename.parent - parent_at = posixpath.dirname(self.at.rstrip('/')) - if parent_at: - parent_at += '/' - return self._next(parent_at) - - def main(args=None): import argparse @@ -2434,11 +2252,15 @@ def main(args=None): help='Create zipfile from sources') group.add_argument('-t', '--test', metavar='', help='Test if a zipfile is valid') + parser.add_argument('--metadata-encoding', metavar='', + help='Specify encoding of member names for -l, -e and -t') args = parser.parse_args(args) + encoding = args.metadata_encoding + if args.test is not None: src = args.test - with ZipFile(src, 'r') as zf: + with ZipFile(src, 'r', metadata_encoding=encoding) as zf: badfile = zf.testzip() if badfile: print("The following enclosed file is corrupted: {!r}".format(badfile)) @@ -2446,15 +2268,20 @@ def main(args=None): elif args.list is not None: src = args.list - with ZipFile(src, 'r') as zf: + with ZipFile(src, 'r', metadata_encoding=encoding) as zf: zf.printdir() elif args.extract is not None: src, curdir = args.extract - with ZipFile(src, 'r') as zf: + with ZipFile(src, 'r', metadata_encoding=encoding) as zf: zf.extractall(curdir) elif args.create is not None: + if encoding: + print("Non-conforming encodings not supported with -c.", + file=sys.stderr) + sys.exit(1) + zip_name = args.create.pop(0) files = args.create @@ -2479,5 +2306,9 @@ def addToZip(zf, path, zippath): addToZip(zf, path, zippath) -if __name__ == "__main__": - main() +from ._path import ( # noqa: E402 + Path, + + # used privately for tests + CompleteDirs, # noqa: F401 +) diff --git a/Lib/zipfile/__main__.py b/Lib/zipfile/__main__.py new file mode 100644 index 0000000000..868d99efc3 --- /dev/null +++ b/Lib/zipfile/__main__.py @@ -0,0 +1,4 @@ +from . import main + +if __name__ == "__main__": + main() diff --git a/Lib/zipfile/_path/__init__.py b/Lib/zipfile/_path/__init__.py new file mode 100644 index 0000000000..78c413563b --- /dev/null +++ b/Lib/zipfile/_path/__init__.py @@ -0,0 +1,395 @@ +import io +import posixpath +import zipfile +import itertools +import contextlib +import pathlib +import re + +from .glob import translate + + +__all__ = ['Path'] + + +def _parents(path): + """ + Given a path with elements separated by + posixpath.sep, generate all parents of that path. + + >>> list(_parents('b/d')) + ['b'] + >>> list(_parents('/b/d/')) + ['/b'] + >>> list(_parents('b/d/f/')) + ['b/d', 'b'] + >>> list(_parents('b')) + [] + >>> list(_parents('')) + [] + """ + return itertools.islice(_ancestry(path), 1, None) + + +def _ancestry(path): + """ + Given a path with elements separated by + posixpath.sep, generate all elements of that path + + >>> list(_ancestry('b/d')) + ['b/d', 'b'] + >>> list(_ancestry('/b/d/')) + ['/b/d', '/b'] + >>> list(_ancestry('b/d/f/')) + ['b/d/f', 'b/d', 'b'] + >>> list(_ancestry('b')) + ['b'] + >>> list(_ancestry('')) + [] + """ + path = path.rstrip(posixpath.sep) + while path and path != posixpath.sep: + yield path + path, tail = posixpath.split(path) + + +_dedupe = dict.fromkeys +"""Deduplicate an iterable in original order""" + + +def _difference(minuend, subtrahend): + """ + Return items in minuend not in subtrahend, retaining order + with O(1) lookup. + """ + return itertools.filterfalse(set(subtrahend).__contains__, minuend) + + +class InitializedState: + """ + Mix-in to save the initialization state for pickling. + """ + + def __init__(self, *args, **kwargs): + self.__args = args + self.__kwargs = kwargs + super().__init__(*args, **kwargs) + + def __getstate__(self): + return self.__args, self.__kwargs + + def __setstate__(self, state): + args, kwargs = state + super().__init__(*args, **kwargs) + + +class CompleteDirs(InitializedState, zipfile.ZipFile): + """ + A ZipFile subclass that ensures that implied directories + are always included in the namelist. + + >>> list(CompleteDirs._implied_dirs(['foo/bar.txt', 'foo/bar/baz.txt'])) + ['foo/', 'foo/bar/'] + >>> list(CompleteDirs._implied_dirs(['foo/bar.txt', 'foo/bar/baz.txt', 'foo/bar/'])) + ['foo/'] + """ + + @staticmethod + def _implied_dirs(names): + parents = itertools.chain.from_iterable(map(_parents, names)) + as_dirs = (p + posixpath.sep for p in parents) + return _dedupe(_difference(as_dirs, names)) + + def namelist(self): + names = super().namelist() + return names + list(self._implied_dirs(names)) + + def _name_set(self): + return set(self.namelist()) + + def resolve_dir(self, name): + """ + If the name represents a directory, return that name + as a directory (with the trailing slash). + """ + names = self._name_set() + dirname = name + '/' + dir_match = name not in names and dirname in names + return dirname if dir_match else name + + def getinfo(self, name): + """ + Supplement getinfo for implied dirs. + """ + try: + return super().getinfo(name) + except KeyError: + if not name.endswith('/') or name not in self._name_set(): + raise + return zipfile.ZipInfo(filename=name) + + @classmethod + def make(cls, source): + """ + Given a source (filename or zipfile), return an + appropriate CompleteDirs subclass. + """ + if isinstance(source, CompleteDirs): + return source + + if not isinstance(source, zipfile.ZipFile): + return cls(source) + + # Only allow for FastLookup when supplied zipfile is read-only + if 'r' not in source.mode: + cls = CompleteDirs + + source.__class__ = cls + return source + + +class FastLookup(CompleteDirs): + """ + ZipFile subclass to ensure implicit + dirs exist and are resolved rapidly. + """ + + def namelist(self): + with contextlib.suppress(AttributeError): + return self.__names + self.__names = super().namelist() + return self.__names + + def _name_set(self): + with contextlib.suppress(AttributeError): + return self.__lookup + self.__lookup = super()._name_set() + return self.__lookup + + +def _extract_text_encoding(encoding=None, *args, **kwargs): + # stacklevel=3 so that the caller of the caller see any warning. + return io.text_encoding(encoding, 3), args, kwargs + + +class Path: + """ + A pathlib-compatible interface for zip files. + + Consider a zip file with this structure:: + + . + ├── a.txt + └── b + ├── c.txt + └── d + └── e.txt + + >>> data = io.BytesIO() + >>> zf = ZipFile(data, 'w') + >>> zf.writestr('a.txt', 'content of a') + >>> zf.writestr('b/c.txt', 'content of c') + >>> zf.writestr('b/d/e.txt', 'content of e') + >>> zf.filename = 'mem/abcde.zip' + + Path accepts the zipfile object itself or a filename + + >>> root = Path(zf) + + From there, several path operations are available. + + Directory iteration (including the zip file itself): + + >>> a, b = root.iterdir() + >>> a + Path('mem/abcde.zip', 'a.txt') + >>> b + Path('mem/abcde.zip', 'b/') + + name property: + + >>> b.name + 'b' + + join with divide operator: + + >>> c = b / 'c.txt' + >>> c + Path('mem/abcde.zip', 'b/c.txt') + >>> c.name + 'c.txt' + + Read text: + + >>> c.read_text(encoding='utf-8') + 'content of c' + + existence: + + >>> c.exists() + True + >>> (b / 'missing.txt').exists() + False + + Coercion to string: + + >>> import os + >>> str(c).replace(os.sep, posixpath.sep) + 'mem/abcde.zip/b/c.txt' + + At the root, ``name``, ``filename``, and ``parent`` + resolve to the zipfile. Note these attributes are not + valid and will raise a ``ValueError`` if the zipfile + has no filename. + + >>> root.name + 'abcde.zip' + >>> str(root.filename).replace(os.sep, posixpath.sep) + 'mem/abcde.zip' + >>> str(root.parent) + 'mem' + """ + + __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" + + def __init__(self, root, at=""): + """ + Construct a Path from a ZipFile or filename. + + Note: When the source is an existing ZipFile object, + its type (__class__) will be mutated to a + specialized type. If the caller wishes to retain the + original type, the caller should either create a + separate ZipFile object or pass a filename. + """ + self.root = FastLookup.make(root) + self.at = at + + def __eq__(self, other): + """ + >>> Path(zipfile.ZipFile(io.BytesIO(), 'w')) == 'foo' + False + """ + if self.__class__ is not other.__class__: + return NotImplemented + return (self.root, self.at) == (other.root, other.at) + + def __hash__(self): + return hash((self.root, self.at)) + + def open(self, mode='r', *args, pwd=None, **kwargs): + """ + Open this entry as text or binary following the semantics + of ``pathlib.Path.open()`` by passing arguments through + to io.TextIOWrapper(). + """ + if self.is_dir(): + raise IsADirectoryError(self) + zip_mode = mode[0] + if not self.exists() and zip_mode == 'r': + raise FileNotFoundError(self) + stream = self.root.open(self.at, zip_mode, pwd=pwd) + if 'b' in mode: + if args or kwargs: + raise ValueError("encoding args invalid for binary operation") + return stream + # Text mode: + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + return io.TextIOWrapper(stream, encoding, *args, **kwargs) + + def _base(self): + return pathlib.PurePosixPath(self.at or self.root.filename) + + @property + def name(self): + return self._base().name + + @property + def suffix(self): + return self._base().suffix + + @property + def suffixes(self): + return self._base().suffixes + + @property + def stem(self): + return self._base().stem + + @property + def filename(self): + return pathlib.Path(self.root.filename).joinpath(self.at) + + def read_text(self, *args, **kwargs): + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + with self.open('r', encoding, *args, **kwargs) as strm: + return strm.read() + + def read_bytes(self): + with self.open('rb') as strm: + return strm.read() + + def _is_child(self, path): + return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") + + def _next(self, at): + return self.__class__(self.root, at) + + def is_dir(self): + return not self.at or self.at.endswith("/") + + def is_file(self): + return self.exists() and not self.is_dir() + + def exists(self): + return self.at in self.root._name_set() + + def iterdir(self): + if not self.is_dir(): + raise ValueError("Can't listdir a file") + subs = map(self._next, self.root.namelist()) + return filter(self._is_child, subs) + + def match(self, path_pattern): + return pathlib.PurePosixPath(self.at).match(path_pattern) + + def is_symlink(self): + """ + Return whether this path is a symlink. Always false (python/cpython#82102). + """ + return False + + def glob(self, pattern): + if not pattern: + raise ValueError(f"Unacceptable pattern: {pattern!r}") + + prefix = re.escape(self.at) + matches = re.compile(prefix + translate(pattern)).fullmatch + return map(self._next, filter(matches, self.root.namelist())) + + def rglob(self, pattern): + return self.glob(f'**/{pattern}') + + def relative_to(self, other, *extra): + return posixpath.relpath(str(self), str(other.joinpath(*extra))) + + def __str__(self): + return posixpath.join(self.root.filename, self.at) + + def __repr__(self): + return self.__repr.format(self=self) + + def joinpath(self, *other): + next = posixpath.join(self.at, *other) + return self._next(self.root.resolve_dir(next)) + + __truediv__ = joinpath + + @property + def parent(self): + if not self.at: + return self.filename.parent + parent_at = posixpath.dirname(self.at.rstrip('/')) + if parent_at: + parent_at += '/' + return self._next(parent_at) diff --git a/Lib/zipfile/_path/glob.py b/Lib/zipfile/_path/glob.py new file mode 100644 index 0000000000..4a2e665e27 --- /dev/null +++ b/Lib/zipfile/_path/glob.py @@ -0,0 +1,40 @@ +import re + + +def translate(pattern): + r""" + Given a glob pattern, produce a regex that matches it. + + >>> translate('*.txt') + '[^/]*\\.txt' + >>> translate('a?txt') + 'a.txt' + >>> translate('**/*') + '.*/[^/]*' + """ + return ''.join(map(replace, separate(pattern))) + + +def separate(pattern): + """ + Separate out character sets to avoid translating their contents. + + >>> [m.group(0) for m in separate('*.txt')] + ['*.txt'] + >>> [m.group(0) for m in separate('a[?]txt')] + ['a', '[?]', 'txt'] + """ + return re.finditer(r'([^\[]+)|(?P[\[].*?[\]])|([\[][^\]]*$)', pattern) + + +def replace(match): + """ + Perform the replacements for a match from :func:`separate`. + """ + + return match.group('set') or ( + re.escape(match.group(0)) + .replace('\\*\\*', r'.*') + .replace('\\*', r'[^/]*') + .replace('\\?', r'.') + )