diff --git a/localstack-core/localstack/services/s3/provider.py b/localstack-core/localstack/services/s3/provider.py index cac3ac9d42de4..09a7dcb59564f 100644 --- a/localstack-core/localstack/services/s3/provider.py +++ b/localstack-core/localstack/services/s3/provider.py @@ -282,6 +282,7 @@ get_system_metadata_from_request, get_unique_key_id, is_bucket_name_valid, + is_version_older_than_other, parse_copy_source_range_header, parse_post_object_tagging_xml, parse_range_header, @@ -1856,9 +1857,9 @@ def list_object_versions( continue # it is possible that the version_id_marker related object has been deleted, in that case, start - # as soon as the next version id is smaller than the version id marker (meaning this version was + # as soon as the next version id is older than the version id marker (meaning this version was # next after the now-deleted version) - elif version.version_id < version_id_marker: + elif is_version_older_than_other(version.version_id, version_id_marker): version_key_marker_found = True elif not version_key_marker_found: diff --git a/localstack-core/localstack/services/s3/utils.py b/localstack-core/localstack/services/s3/utils.py index cec7d3be6187b..8592de4712594 100644 --- a/localstack-core/localstack/services/s3/utils.py +++ b/localstack-core/localstack/services/s3/utils.py @@ -1039,11 +1039,14 @@ def parse_post_object_tagging_xml(tagging: str) -> Optional[dict]: def generate_safe_version_id() -> str: - # the safe b64 encoding is inspired by the stdlib base64.urlsafe_b64encode - # and also using stdlib secrets.token_urlsafe, but with a different alphabet adapted for S3 - # VersionId cannot have `-` in it, as it fails in XML - # we need an ever-increasing number, in order to properly implement pagination around ListObjectVersions - # by prepending the version-id with a global increasing number, we can lexicographically sort the versions + """ + Generate a safe version id for XML rendering. + VersionId cannot have `-` in it, as it fails in XML + Combine an ever-increasing part in the 8 first characters, and a random element. + We need the sequence part in order to properly implement pagination around ListObjectVersions. + By prefixing the version-id with a global increasing number, we can sort the versions + :return: an S3 VersionId containing a timestamp part in the first 8 characters + """ tok = next(global_version_id_sequence()).to_bytes(length=6) + token_bytes(18) return base64.b64encode(tok, altchars=b"._").rstrip(b"=").decode("ascii") @@ -1053,3 +1056,11 @@ def global_version_id_sequence(): start = int(time.time() * 1000) # itertools.count is thread safe over the GIL since its getAndIncrement operation is a single python bytecode op return itertools.count(start) + + +def is_version_older_than_other(version_id: str, other: str): + """ + Compare the sequence part of a VersionId against the sequence part of a VersionIdMarker. Used for pagination + See `generate_safe_version_id` + """ + return base64.b64decode(version_id, altchars=b"._") < base64.b64decode(other, altchars=b"._") diff --git a/tests/aws/services/s3/test_s3_list_operations.py b/tests/aws/services/s3/test_s3_list_operations.py index 50020f0ae6034..78b98a725bd9b 100644 --- a/tests/aws/services/s3/test_s3_list_operations.py +++ b/tests/aws/services/s3/test_s3_list_operations.py @@ -492,7 +492,10 @@ def test_list_objects_versions_with_prefix( @markers.aws.validated def test_list_objects_versions_with_prefix_only_and_pagination( - self, s3_bucket, snapshot, aws_client, aws_http_client_factory + self, + s3_bucket, + snapshot, + aws_client, ): snapshot.add_transformer(snapshot.transform.s3_api()) aws_client.s3.put_bucket_versioning( @@ -559,6 +562,36 @@ def test_list_objects_versions_with_prefix_only_and_pagination( ) snapshot.match("list-object-version-prefix-page-2-after-delete", page_2_response) + @markers.aws.validated + def test_list_objects_versions_with_prefix_only_and_pagination_many_versions( + self, + s3_bucket, + aws_client, + ): + aws_client.s3.put_bucket_versioning( + Bucket=s3_bucket, + VersioningConfiguration={"Status": "Enabled"}, + ) + # with our internal pagination system, we use characters from the alphabet (lower and upper) + digits + # by creating more than 100 objects, we can make sure we circle all the way around our sequencing, and properly + # paginate over all of them + for _ in range(101): + aws_client.s3.put_object(Bucket=s3_bucket, Key="prefixed_key") + + paginator = aws_client.s3.get_paginator("list_object_versions") + # even if the PageIterator looks like it should be an iterator, it's actually an iterable and needs to be + # wrapped in `iter` + page_iterator = iter( + paginator.paginate( + Bucket=s3_bucket, Prefix="prefix", PaginationConfig={"PageSize": 100} + ) + ) + page_1 = next(page_iterator) + assert len(page_1["Versions"]) == 100 + + page_2 = next(page_iterator) + assert len(page_2["Versions"]) == 1 + @markers.aws.validated def test_s3_list_object_versions_timestamp_precision( self, s3_bucket, aws_client, aws_http_client_factory diff --git a/tests/aws/services/s3/test_s3_list_operations.validation.json b/tests/aws/services/s3/test_s3_list_operations.validation.json index b05ae49aeac70..f65f3290f7556 100644 --- a/tests/aws/services/s3/test_s3_list_operations.validation.json +++ b/tests/aws/services/s3/test_s3_list_operations.validation.json @@ -23,6 +23,9 @@ "tests/aws/services/s3/test_s3_list_operations.py::TestS3ListObjectVersions::test_list_objects_versions_with_prefix_only_and_pagination": { "last_validated_date": "2025-02-13T03:52:21+00:00" }, + "tests/aws/services/s3/test_s3_list_operations.py::TestS3ListObjectVersions::test_list_objects_versions_with_prefix_only_and_pagination_many_versions": { + "last_validated_date": "2025-02-13T20:24:26+00:00" + }, "tests/aws/services/s3/test_s3_list_operations.py::TestS3ListObjectVersions::test_s3_list_object_versions_timestamp_precision": { "last_validated_date": "2025-01-21T18:15:06+00:00" }, diff --git a/tests/unit/services/s3/test_s3.py b/tests/unit/services/s3/test_s3.py index 4e41757709056..1420ff4de5e84 100644 --- a/tests/unit/services/s3/test_s3.py +++ b/tests/unit/services/s3/test_s3.py @@ -1,6 +1,7 @@ import datetime import os import re +import string import zoneinfo from io import BytesIO from urllib.parse import urlparse @@ -707,3 +708,21 @@ def test_s3_context_manager(self, tmpdir): pass temp_storage_backend.close() + + +class TestS3VersionIdGenerator: + def test_version_is_xml_safe(self): + # assert than we don't have unsafe characters in 500 different versions id + safe_characters = string.ascii_letters + string.digits + "._" + assert all( + all(char in safe_characters for char in s3_utils.generate_safe_version_id()) + for _ in range(500) + ) + + def test_version_id_ordering(self): + version_ids = [s3_utils.generate_safe_version_id() for _ in range(500)] + + # assert that every version id can be ordered with each other + for index, version_id in enumerate(version_ids[1:]): + previous_version = version_ids[index] + assert s3_utils.is_version_older_than_other(previous_version, version_id)