From cad857753382aa8dc7e6b788879550995454a53d Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 24 Mar 2023 17:50:16 +0800 Subject: [PATCH 01/11] Add `ZipFile.remove()` (#51067) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a revision of commit 659eb048cc9cac73c46349eb29845bc5cd630f09 (PR #19358), notably with following changes: - Add documentation and tests. - Raise `ValueError` for a bad mode, as in other methods. - Support multi-member removal in `_remove_members()`. - Support non-physical removal in `_remove_members()`. - Move physical file data in chunks to prevent excessive memory usage on large files. - Fix missing entry in `self.NameToInfo` when removing a duplicated archive name. - Also update `ZipInfo._end_offset` for physically moved files. Co-authored-by: Éric (cherry picked from commit e6bc82a3995f42df1c666c8889a2e4e4f938ff67 (PR #103033)) --- Doc/library/zipfile.rst | 17 +++ Lib/test/test_zipfile/test_core.py | 233 +++++++++++++++++++++++++++++ Lib/test/test_zipfile64.py | 63 ++++++++ Lib/zipfile/__init__.py | 90 +++++++++++ 4 files changed, 403 insertions(+) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 6a4fa67332e179..bd6eefe0442bf8 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -518,6 +518,23 @@ ZipFile Objects .. versionadded:: 3.11 +.. method:: ZipFile.remove(zinfo_or_arcname) + + Removes a member from the archive. *zinfo_or_arcname* is either the full + path of the member, or a :class:`ZipInfo` instance. + + The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. + + Calling :meth:`remove` on a closed ZipFile will raise a :exc:`ValueError`. + + .. note:: + + Removing a member in an archive may involve a move of many internal data + records, which can be I/O intensive for a large ZIP file. + + .. versionadded:: next + + The following data attributes are also available: .. attribute:: ZipFile.filename diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index ada96813709aea..09acc9b47e6708 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1360,6 +1360,239 @@ class LzmaWriterTests(AbstractWriterTests, unittest.TestCase): class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_ZSTANDARD +class AbstractRemoveTests: + + def _test_removing_indexes(self, test_files, indexes): + """Test underlying _remove_members() for removing members at given + indexes.""" + # calculate the expected results + expected_files = [] + with zipfile.ZipFile(TESTFN, 'w') as zh: + for i, (file, data) in enumerate(test_files): + if i not in indexes: + zh.writestr(file, data) + expected_files.append(file) + expected_size = os.path.getsize(TESTFN) + + # prepare the test zip + with zipfile.ZipFile(TESTFN, 'w') as zh: + for file, data in test_files: + zh.writestr(file, data) + + # do the removal and check the result + with zipfile.ZipFile(TESTFN, 'a') as zh: + members = {zh.infolist()[i] for i in indexes} + zh._remove_members(members) + + # make sure internal caches have reflected the change + # and are consistent + self.assertEqual(zh.namelist(), expected_files) + for file, _ in test_files: + if file in zh.namelist(): + self.assertEqual(zh.getinfo(file).filename, file) + else: + with self.assertRaises(KeyError): + zh.getinfo(file) + + self.assertIsNone(zh.testzip()) + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def _test_removing_combinations(self, test_files, n=None): + """Test underlying _remove_members() for removing random combinations + of members.""" + ln = len(test_files) + if n is None: + # iterate n from 1 to all + for n in range(1, ln + 1): + for indexes in itertools.combinations(range(ln), n): + with self.subTest(remove=indexes): + self._test_removing_indexes(test_files, indexes) + else: + for indexes in itertools.combinations(range(ln), n): + with self.subTest(remove=indexes): + self._test_removing_indexes(test_files, indexes) + + def test_basic(self): + # Test underlying _remove_members() for removing random combinations of members. + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + self._test_removing_combinations(test_files) + + def test_duplicated_arcname(self): + # Test underlying _remove_members() for removing any one of random duplicated members. + dupl_file = 'file.txt' + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + ln = len(test_files) + for n in range(2, ln + 1): + for dups in itertools.combinations(range(ln), n): + files = [] + for i, (file, data) in enumerate(test_files): + file_ = dupl_file if i in dups else file + files.append((file_, data)) + + for index in dups: + indexes = [index] + with self.subTest(dups=dups, indexes=indexes): + self._test_removing_indexes(files, indexes) + + def test_non_physical(self): + # Test underlying _remove_members() for non-physical removing. + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + ln = len(test_files) + for n in range(1, ln + 1): + for indexes in itertools.combinations(range(ln), n): + with self.subTest(remove=indexes): + # prepare the test zip + expected = {} + with zipfile.ZipFile(TESTFN, 'w') as zh: + for i, (file, data) in enumerate(test_files): + zh.writestr(file, data) + if i not in indexes: + expected[file] = zh.getinfo(file).header_offset + + # do the removal and check the result + with zipfile.ZipFile(TESTFN, 'a') as zh: + members = {zh.infolist()[i] for i in indexes} + zh._remove_members(members, remove_physical=False) + self.assertEqual(zh.namelist(), list(expected)) + for file, offset in expected.items(): + self.assertEqual(zh.getinfo(file).header_offset, offset) + self.assertIsNone(zh.testzip()) + + def test_verify(self): + # Test if params are passed to underlying _remove_members() correctly, + # or never passed if conditions not met. + file0 = 'file0.txt' + file = 'datafile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + + # closed: error and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zh.close() + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + with self.assertRaises(ValueError): + zh.remove(file) + mock_fn.assert_not_called() + + # writing: error and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'a') as zh: + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + with zh.open(file0, 'w') as fh: + with self.assertRaises(ValueError): + zh.remove(file) + mock_fn.assert_not_called() + + # mode 'r': error and do nothing + with zipfile.ZipFile(TESTFN, 'r') as zh: + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + with self.assertRaises(ValueError): + zh.remove(file) + mock_fn.assert_not_called() + + # mode 'a': the most general use case + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + # -- remove with arcname + with zipfile.ZipFile(TESTFN, 'a') as zh: + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + zh.remove(file) + mock_fn.assert_called_once_with({zh.getinfo(file)}) + # -- remove with zinfo + with zipfile.ZipFile(TESTFN, 'a') as zh: + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + zinfo = zh.getinfo(file) + zh.remove(zinfo) + mock_fn.assert_called_once_with({zinfo}) + # -- remove with nonexist arcname + with zipfile.ZipFile(TESTFN, 'a') as zh: + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + with self.assertRaises(KeyError): + zh.remove('nonexist.file') + mock_fn.assert_not_called() + # -- remove with nonexist zinfo (even if same name) + with zipfile.ZipFile(TESTFN, 'a') as zh: + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + zinfo = zipfile.ZipInfo(file) + with self.assertRaises(KeyError): + zh.remove(zinfo) + mock_fn.assert_not_called() + + # mode 'w': like 'a'; allows removing a just written member + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + zh.remove(file) + mock_fn.assert_called_once_with({zh.getinfo(file)}) + + # mode 'x': like 'w' + os.remove(TESTFN) + with zipfile.ZipFile(TESTFN, 'x') as zh: + zh.writestr(file, data) + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + zh.remove(file) + mock_fn.assert_called_once_with({zh.getinfo(file)}) + + def test_zip64(self): + # Test if members use zip64. + file = 'datafile.txt' + file1 = 'pre.txt' + file2 = 'post.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + data1 = b'Lorem ipsum dolor sit amet, consectetur adipiscing elit' + data2 = b'Duis aute irure dolor in reprehenderit in voluptate velit esse' + with zipfile.ZipFile(TESTFN, 'w') as zh: + with zh.open(file1, 'w', force_zip64=True) as fh: + fh.write(data1) + with zh.open(file2, 'w', force_zip64=True) as fh: + fh.write(data2) + expected_size = os.path.getsize(TESTFN) + + with zipfile.ZipFile(TESTFN, 'w') as zh: + with zh.open(file1, 'w', force_zip64=True) as fh: + fh.write(data1) + with zh.open(file, 'w', force_zip64=True) as fh: + fh.write(data) + with zh.open(file2, 'w', force_zip64=True) as fh: + fh.write(data2) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zh.remove(file) + self.assertIsNone(zh.testzip()) + self.assertEqual(os.path.getsize(TESTFN), expected_size) + +class StoredRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_STORED + +@requires_zlib() +class DeflateRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_DEFLATED + +@requires_bz2() +class Bzip2RemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_BZIP2 + +@requires_lzma() +class LzmaRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_LZMA + + class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): if name + 'o' not in namelist: diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index 2e1affe0252858..84d1862f08bf31 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -87,6 +87,69 @@ def tearDown(self): os_helper.unlink(TESTFN2) +class TestRemove(unittest.TestCase): + def setUp(self): + # Create test data. + line_gen = ("Test of zipfile line %d." % i for i in range(1000000)) + self.data = '\n'.join(line_gen).encode('ascii') + + def _write_large_file(self, fh): + # It will contain enough copies of self.data to reach about 8 GiB. + filecount = 8*1024**3 // len(self.data) + + next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL + for num in range(filecount): + fh.write(self.data) + # Print still working message since this test can be really slow + if next_time <= time.monotonic(): + next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL + print(( + ' writing %d of %d, be patient...' % + (num, filecount)), file=sys.__stdout__) + sys.__stdout__.flush() + + def test_remove_large_file(self): + # Try the temp file. If we do TESTFN2, then it hogs + # gigabytes of disk space for the duration of the test. + with TemporaryFile() as f: + self._test_remove_large_file(f) + self.assertFalse(f.closed) + + def _test_remove_large_file(self, f): + file = 'datafile.txt' + file1 = 'dummy.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + with zipfile.ZipFile(f, 'w') as zh: + with zh.open(file1, 'w', force_zip64=True) as fh: + self._write_large_file(fh) + zh.writestr(file, data) + + with zipfile.ZipFile(f, 'a') as zh: + zh.remove(file1) + self.assertIsNone(zh.testzip()) + + def test_remove_before_large_file(self): + # Try the temp file. If we do TESTFN2, then it hogs + # gigabytes of disk space for the duration of the test. + with TemporaryFile() as f: + self._test_remove_before_large_file(f) + self.assertFalse(f.closed) + + def _test_remove_before_large_file(self, f): + file = 'datafile.txt' + file1 = 'dummy.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + with zipfile.ZipFile(f, 'w') as zh: + zh.writestr(file, data) + with zh.open(file1, 'w', force_zip64=True) as fh: + self._write_large_file(fh) + expected_size = zh.getinfo(file1).file_size + + with zipfile.ZipFile(f, 'a') as zh: + zh.remove(file) + self.assertIsNone(zh.testzip()) + + class OtherTests(unittest.TestCase): def testMoreThan64kFiles(self): # This test checks that more than 64k files can be added to an archive, diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 18caeb3e04a2b5..6c4af17050b892 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1866,6 +1866,32 @@ def extractall(self, path=None, members=None, pwd=None): for zipinfo in members: self._extract_member(zipinfo, path, pwd) + def remove(self, zinfo_or_arcname): + """Remove a member from the archive.""" + + if self.mode not in ('w', 'x', 'a'): + raise ValueError("remove() requires mode 'w', 'x', or 'a'") + if not self.fp: + raise ValueError( + "Attempt to write to ZIP archive that was already closed") + if self._writing: + raise ValueError( + "Can't write to ZIP archive while an open writing handle exists." + ) + + # Make sure we have an existing info object + if isinstance(zinfo_or_arcname, ZipInfo): + zinfo = zinfo_or_arcname + # make sure zinfo exists + if zinfo not in self.filelist: + raise KeyError( + 'There is no item %r in the archive' % zinfo_or_arcname) + else: + # get the info object + zinfo = self.getinfo(zinfo_or_arcname) + + return self._remove_members({zinfo}) + @classmethod def _sanitize_windows_name(cls, arcname, pathsep): """Replace bad characters and remove trailing dots from parts.""" @@ -1930,6 +1956,70 @@ def _extract_member(self, member, targetpath, pwd): return targetpath + def _remove_members(self, members, *, remove_physical=True, chunk_size=2**20): + """Remove members in a zip file. + + All members (as zinfo) should exist in the zip; otherwise the zip file + will erroneously end in an inconsistent state. + """ + fp = self.fp + entry_offset = 0 + member_seen = False + + # get a sorted filelist by header offset, in case the dir order + # doesn't match the actual entry order + filelist = sorted(self.filelist, key=lambda x: x.header_offset) + for i, info in enumerate(filelist): + is_member = info in members + + if not (member_seen or is_member): + continue + + # get the total size of the entry + try: + offset = filelist[i + 1].header_offset + except IndexError: + offset = self.start_dir + entry_size = offset - info.header_offset + + if is_member: + member_seen = True + entry_offset += entry_size + + # update caches + self.filelist.remove(info) + try: + del self.NameToInfo[info.filename] + except KeyError: + pass + continue + + # update the header and move entry data to the new position + if remove_physical: + old_header_offset = info.header_offset + info.header_offset -= entry_offset + read_size = 0 + while read_size < entry_size: + fp.seek(old_header_offset + read_size) + data = fp.read(min(entry_size - read_size, chunk_size)) + fp.seek(info.header_offset + read_size) + fp.write(data) + fp.flush() + read_size += len(data) + + # Avoid missing entry if entries have a duplicated name. + # Reverse the order as NameToInfo normally stores the last added one. + for info in reversed(self.filelist): + self.NameToInfo.setdefault(info.filename, info) + + # update state + if remove_physical: + self.start_dir -= entry_offset + self._didModify = True + + # seek to the start of the central dir + fp.seek(self.start_dir) + def _writecheck(self, zinfo): """Check for errors before writing a file to the archive.""" if zinfo.filename in self.NameToInfo: From 96a07a26b273cd414f48f2bfcef579b337399289 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Wed, 21 May 2025 23:13:23 +0800 Subject: [PATCH 02/11] Fix and improve tests --- Lib/test/test_zipfile/test_core.py | 70 ++++++++++++++---------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 09acc9b47e6708..f89b3942cad854 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1362,12 +1362,12 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): class AbstractRemoveTests: - def _test_removing_indexes(self, test_files, indexes): + def _test_removing_members(self, test_files, indexes): """Test underlying _remove_members() for removing members at given indexes.""" # calculate the expected results expected_files = [] - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: for i, (file, data) in enumerate(test_files): if i not in indexes: zh.writestr(file, data) @@ -1375,12 +1375,12 @@ def _test_removing_indexes(self, test_files, indexes): expected_size = os.path.getsize(TESTFN) # prepare the test zip - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: for file, data in test_files: zh.writestr(file, data) # do the removal and check the result - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: members = {zh.infolist()[i] for i in indexes} zh._remove_members(members) @@ -1401,19 +1401,14 @@ def _test_removing_combinations(self, test_files, n=None): """Test underlying _remove_members() for removing random combinations of members.""" ln = len(test_files) - if n is None: - # iterate n from 1 to all - for n in range(1, ln + 1): - for indexes in itertools.combinations(range(ln), n): - with self.subTest(remove=indexes): - self._test_removing_indexes(test_files, indexes) - else: + for n in (range(1, ln + 1) if n is None else (n,)): for indexes in itertools.combinations(range(ln), n): with self.subTest(remove=indexes): - self._test_removing_indexes(test_files, indexes) + self._test_removing_members(test_files, indexes) def test_basic(self): - # Test underlying _remove_members() for removing random combinations of members. + """Test underlying _remove_members() for removing random combinations + of members.""" test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), @@ -1423,7 +1418,8 @@ def test_basic(self): self._test_removing_combinations(test_files) def test_duplicated_arcname(self): - # Test underlying _remove_members() for removing any one of random duplicated members. + """Test underlying _remove_members() for removing any one of random + duplicated members.""" dupl_file = 'file.txt' test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), @@ -1441,11 +1437,11 @@ def test_duplicated_arcname(self): for index in dups: indexes = [index] - with self.subTest(dups=dups, indexes=indexes): - self._test_removing_indexes(files, indexes) + with self.subTest(dups=dups, remove=indexes): + self._test_removing_members(files, indexes) def test_non_physical(self): - # Test underlying _remove_members() for non-physical removing. + """Test underlying _remove_members() for non-physical removing.""" test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), @@ -1458,14 +1454,14 @@ def test_non_physical(self): with self.subTest(remove=indexes): # prepare the test zip expected = {} - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: for i, (file, data) in enumerate(test_files): zh.writestr(file, data) if i not in indexes: expected[file] = zh.getinfo(file).header_offset # do the removal and check the result - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: members = {zh.infolist()[i] for i in indexes} zh._remove_members(members, remove_physical=False) self.assertEqual(zh.namelist(), list(expected)) @@ -1474,16 +1470,16 @@ def test_non_physical(self): self.assertIsNone(zh.testzip()) def test_verify(self): - # Test if params are passed to underlying _remove_members() correctly, - # or never passed if conditions not met. + """Test if params are passed to underlying _remove_members() correctly, + or never passed if conditions not met.""" file0 = 'file0.txt' file = 'datafile.txt' data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' # closed: error and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: zh.writestr(file, data) - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: zh.close() with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: with self.assertRaises(ValueError): @@ -1491,9 +1487,9 @@ def test_verify(self): mock_fn.assert_not_called() # writing: error and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: zh.writestr(file, data) - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: with zh.open(file0, 'w') as fh: with self.assertRaises(ValueError): @@ -1501,34 +1497,34 @@ def test_verify(self): mock_fn.assert_not_called() # mode 'r': error and do nothing - with zipfile.ZipFile(TESTFN, 'r') as zh: + with zipfile.ZipFile(TESTFN, 'r', self.compression) as zh: with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: with self.assertRaises(ValueError): zh.remove(file) mock_fn.assert_not_called() # mode 'a': the most general use case - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: zh.writestr(file, data) # -- remove with arcname - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: zh.remove(file) mock_fn.assert_called_once_with({zh.getinfo(file)}) # -- remove with zinfo - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: zinfo = zh.getinfo(file) zh.remove(zinfo) mock_fn.assert_called_once_with({zinfo}) # -- remove with nonexist arcname - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: with self.assertRaises(KeyError): zh.remove('nonexist.file') mock_fn.assert_not_called() # -- remove with nonexist zinfo (even if same name) - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: zinfo = zipfile.ZipInfo(file) with self.assertRaises(KeyError): @@ -1536,7 +1532,7 @@ def test_verify(self): mock_fn.assert_not_called() # mode 'w': like 'a'; allows removing a just written member - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: zh.writestr(file, data) with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: zh.remove(file) @@ -1544,35 +1540,35 @@ def test_verify(self): # mode 'x': like 'w' os.remove(TESTFN) - with zipfile.ZipFile(TESTFN, 'x') as zh: + with zipfile.ZipFile(TESTFN, 'x', self.compression) as zh: zh.writestr(file, data) with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: zh.remove(file) mock_fn.assert_called_once_with({zh.getinfo(file)}) def test_zip64(self): - # Test if members use zip64. + """Test if members use zip64.""" file = 'datafile.txt' file1 = 'pre.txt' file2 = 'post.txt' data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' data1 = b'Lorem ipsum dolor sit amet, consectetur adipiscing elit' data2 = b'Duis aute irure dolor in reprehenderit in voluptate velit esse' - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: with zh.open(file1, 'w', force_zip64=True) as fh: fh.write(data1) with zh.open(file2, 'w', force_zip64=True) as fh: fh.write(data2) expected_size = os.path.getsize(TESTFN) - with zipfile.ZipFile(TESTFN, 'w') as zh: + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: with zh.open(file1, 'w', force_zip64=True) as fh: fh.write(data1) with zh.open(file, 'w', force_zip64=True) as fh: fh.write(data) with zh.open(file2, 'w', force_zip64=True) as fh: fh.write(data2) - with zipfile.ZipFile(TESTFN, 'a') as zh: + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: zh.remove(file) self.assertIsNone(zh.testzip()) self.assertEqual(os.path.getsize(TESTFN), expected_size) From 602b909b5a9b653e3db14bdf5a0eeae6091b8d7f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Wed, 21 May 2025 19:06:03 +0800 Subject: [PATCH 03/11] Update `ZipInfo._end_offset` --- Lib/zipfile/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 6c4af17050b892..dda1ff071920b1 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1998,6 +1998,8 @@ def _remove_members(self, members, *, remove_physical=True, chunk_size=2**20): if remove_physical: old_header_offset = info.header_offset info.header_offset -= entry_offset + if info._end_offset is not None: + info._end_offset -= entry_offset read_size = 0 while read_size < entry_size: fp.seek(old_header_offset + read_size) From bb90f48f60ddf8244097fc8b274cc5ebde052f94 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 22 May 2025 00:13:34 +0800 Subject: [PATCH 04/11] No more allow mode 'w'/'x' - File is not truncated in mode 'w'/'x', which results non-shrinked file. - This cannot be simply resolved by adding truncation for mode 'w'/'x', which may be used on an unseekable file buffer and truncation is not allowed. --- Doc/library/zipfile.rst | 2 +- Lib/test/test_zipfile/test_core.py | 32 ++++++++++++++++-------------- Lib/zipfile/__init__.py | 9 ++++++--- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index bd6eefe0442bf8..4f566168938056 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -523,7 +523,7 @@ ZipFile Objects Removes a member from the archive. *zinfo_or_arcname* is either the full path of the member, or a :class:`ZipInfo` instance. - The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. + The archive must be opened with mode ``'a'``. Calling :meth:`remove` on a closed ZipFile will raise a :exc:`ValueError`. diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index f89b3942cad854..ff67aa045f06de 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1503,6 +1503,23 @@ def test_verify(self): zh.remove(file) mock_fn.assert_not_called() + # mode 'w': error and do nothing + with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: + zh.writestr(file, data) + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + with self.assertRaises(ValueError): + zh.remove(file) + mock_fn.assert_not_called() + + # mode 'x': error and do nothing + os.remove(TESTFN) + with zipfile.ZipFile(TESTFN, 'x', self.compression) as zh: + zh.writestr(file, data) + with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: + with self.assertRaises(ValueError): + zh.remove(file) + mock_fn.assert_not_called() + # mode 'a': the most general use case with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: zh.writestr(file, data) @@ -1531,21 +1548,6 @@ def test_verify(self): zh.remove(zinfo) mock_fn.assert_not_called() - # mode 'w': like 'a'; allows removing a just written member - with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: - zh.writestr(file, data) - with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: - zh.remove(file) - mock_fn.assert_called_once_with({zh.getinfo(file)}) - - # mode 'x': like 'w' - os.remove(TESTFN) - with zipfile.ZipFile(TESTFN, 'x', self.compression) as zh: - zh.writestr(file, data) - with mock.patch('zipfile.ZipFile._remove_members') as mock_fn: - zh.remove(file) - mock_fn.assert_called_once_with({zh.getinfo(file)}) - def test_zip64(self): """Test if members use zip64.""" file = 'datafile.txt' diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index dda1ff071920b1..3069e8697cf8c2 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1867,10 +1867,13 @@ def extractall(self, path=None, members=None, pwd=None): self._extract_member(zipinfo, path, pwd) def remove(self, zinfo_or_arcname): - """Remove a member from the archive.""" + """Remove a member from the archive. - if self.mode not in ('w', 'x', 'a'): - raise ValueError("remove() requires mode 'w', 'x', or 'a'") + The archive must be open with mode 'a', since mode 'w'/'x' may be used + on an unseekable file buffer, which disallows truncation.""" + + if self.mode != 'a': + raise ValueError("remove() requires mode 'a'") if not self.fp: raise ValueError( "Attempt to write to ZIP archive that was already closed") From 2d74501e033d90d00dc70b1ca1df18a0267fd01e Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Wed, 21 May 2025 23:28:44 +0800 Subject: [PATCH 05/11] Test for zip64 more thoroughly --- Lib/test/test_zipfile/test_core.py | 37 +++++++++--------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index ff67aa045f06de..ae59ae421a09bc 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1362,7 +1362,7 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): class AbstractRemoveTests: - def _test_removing_members(self, test_files, indexes): + def _test_removing_members(self, test_files, indexes, force_zip64=False): """Test underlying _remove_members() for removing members at given indexes.""" # calculate the expected results @@ -1370,14 +1370,16 @@ def _test_removing_members(self, test_files, indexes): with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: for i, (file, data) in enumerate(test_files): if i not in indexes: - zh.writestr(file, data) + with zh.open(file, 'w', force_zip64=force_zip64) as fh: + fh.write(data) expected_files.append(file) expected_size = os.path.getsize(TESTFN) # prepare the test zip with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: for file, data in test_files: - zh.writestr(file, data) + with zh.open(file, 'w', force_zip64=force_zip64) as fh: + fh.write(data) # do the removal and check the result with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: @@ -1550,30 +1552,13 @@ def test_verify(self): def test_zip64(self): """Test if members use zip64.""" - file = 'datafile.txt' - file1 = 'pre.txt' - file2 = 'post.txt' - data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' - data1 = b'Lorem ipsum dolor sit amet, consectetur adipiscing elit' - data2 = b'Duis aute irure dolor in reprehenderit in voluptate velit esse' - with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: - with zh.open(file1, 'w', force_zip64=True) as fh: - fh.write(data1) - with zh.open(file2, 'w', force_zip64=True) as fh: - fh.write(data2) - expected_size = os.path.getsize(TESTFN) + test_files = [ + ('pre.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('datafile', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ('post.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ] - with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh: - with zh.open(file1, 'w', force_zip64=True) as fh: - fh.write(data1) - with zh.open(file, 'w', force_zip64=True) as fh: - fh.write(data) - with zh.open(file2, 'w', force_zip64=True) as fh: - fh.write(data2) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - zh.remove(file) - self.assertIsNone(zh.testzip()) - self.assertEqual(os.path.getsize(TESTFN), expected_size) + self._test_removing_members(test_files, [1], force_zip64=True) class StoredRemoveTests(AbstractRemoveTests, unittest.TestCase): compression = zipfile.ZIP_STORED From 63d56e5bc9bbd2e1cad7d61b53fad2379c5bce5c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 22 May 2025 06:57:48 +0800 Subject: [PATCH 06/11] Add tests for zstd --- Lib/test/test_zipfile/test_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index ae59ae421a09bc..259bda57d99e14 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1575,6 +1575,10 @@ class Bzip2RemoveTests(AbstractRemoveTests, unittest.TestCase): class LzmaRemoveTests(AbstractRemoveTests, unittest.TestCase): compression = zipfile.ZIP_LZMA +@requires_zstd() +class ZstdRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_ZSTANDARD + class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): From ab3a1871623238aa7288d10f90aedd7675be2822 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 22 May 2025 17:32:27 +0800 Subject: [PATCH 07/11] Remove unneeded seek - The seek will be automatically called in `ZipFile.close`. --- Lib/zipfile/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 3069e8697cf8c2..282701abe869a3 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -2022,9 +2022,6 @@ def _remove_members(self, members, *, remove_physical=True, chunk_size=2**20): self.start_dir -= entry_offset self._didModify = True - # seek to the start of the central dir - fp.seek(self.start_dir) - def _writecheck(self, zinfo): """Check for errors before writing a file to the archive.""" if zinfo.filename in self.NameToInfo: From 1c074c5c87815931bbc81da6081bdf44531c9097 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 22 May 2025 17:38:13 +0800 Subject: [PATCH 08/11] Add lock for thread safety --- Lib/zipfile/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 282701abe869a3..02e80b390eec7f 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1893,7 +1893,8 @@ def remove(self, zinfo_or_arcname): # get the info object zinfo = self.getinfo(zinfo_or_arcname) - return self._remove_members({zinfo}) + with self._lock: + return self._remove_members({zinfo}) @classmethod def _sanitize_windows_name(cls, arcname, pathsep): From 8b4f70955544ccb18351a96cedf60851f26b4b63 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 22 May 2025 12:52:38 +0000 Subject: [PATCH 09/11] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst new file mode 100644 index 00000000000000..0502d7fed164c6 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst @@ -0,0 +1 @@ +Add `ZipFile.remove()` From 7b7f54a1cfe8c89cd2c6452c113fb815334e51cc Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 23 May 2025 22:12:47 +0800 Subject: [PATCH 10/11] Suppress duplicated name warning during generating test files --- Lib/test/test_zipfile/test_core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 259bda57d99e14..0b457bda6b3347 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1440,7 +1440,10 @@ def test_duplicated_arcname(self): for index in dups: indexes = [index] with self.subTest(dups=dups, remove=indexes): - self._test_removing_members(files, indexes) + import warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self._test_removing_members(files, indexes) def test_non_physical(self): """Test underlying _remove_members() for non-physical removing.""" From c95ef5308bb958c563eeba67b5dddddb709b73f5 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 24 May 2025 18:10:51 +0800 Subject: [PATCH 11/11] Fix rst format --- .../2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst index 0502d7fed164c6..6a696828991836 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst @@ -1 +1 @@ -Add `ZipFile.remove()` +Add ``ZipFile.remove()``