Merged
29 commits
0c13876
[ESM] Create SQS message collector for batched processing
gregfurman Nov 27, 2024
795ad69
[ESM] Change batch collection method to use threading
gregfurman Nov 28, 2024
3dd840f
Remove old implementation
gregfurman Nov 29, 2024
8f162b6
Address comments
gregfurman Nov 29, 2024
0237cab
Remove adaptive backoff and add size-based flushing
gregfurman Jan 9, 2025
6dd8fc5
[ESM] Handle polling of batches exceeding SQS message limits (#12118)
gregfurman Jan 20, 2025
e1ca826
[ESM] Use SQS long polling, override parameter, and set boto timeout
gregfurman Jan 21, 2025
d600448
Skip failing test
gregfurman Jan 21, 2025
4d9e087
Merge branch 'master' into fix/esm/batching
gregfurman Jan 23, 2025
1ae029c
Address comments
gregfurman Jan 29, 2025
c9df467
Remove last of SQS work
gregfurman Jan 29, 2025
9ff29b7
Revert long polling for MaximumBatchingWindowInSeconds duration
gregfurman Jan 29, 2025
fde11fc
Merge branch 'master' into fix/esm/batching
gregfurman Jan 30, 2025
fbefdd6
Merge remote-tracking branch 'origin/master' into fix/esm/batching
gregfurman Jan 30, 2025
e597a3c
Merge branch 'master' into fix/esm/batching
gregfurman Feb 4, 2025
3216d7a
Merge branch 'master' into fix/esm/batching
gregfurman Feb 4, 2025
4e65ee0
WIP: address comments
gregfurman Feb 6, 2025
f5ed105
Merge branch 'master' into fix/esm/batching
gregfurman Feb 10, 2025
3684339
Fix logging of mini-batches
gregfurman Feb 10, 2025
397d354
Merge remote-tracking branch 'origin/master' into fix/esm/batching
gregfurman Feb 10, 2025
f747b4a
Add batching window override for SQS long polling
gregfurman Feb 10, 2025
fa1a10a
remove unnecessary enumerate
gregfurman Feb 10, 2025
275f942
Integrate with new SQS changes
gregfurman Feb 10, 2025
fa3cc11
Allow outer loop to handle exception
gregfurman Feb 10, 2025
95782dc
docs: Add documentation on performance optimisations
gregfurman Feb 12, 2025
5edb709
Some documentation clarifications
gregfurman Feb 24, 2025
6ebbf2f
Merge branch 'master' into fix/esm/batching
gregfurman Feb 25, 2025
72ea973
Address final comments and rebase
gregfurman Feb 25, 2025
23d6cb6
fix: Load up SQS queue prior to polling in flaky test
gregfurman Feb 25, 2025
@@ -32,7 +32,10 @@
from localstack.services.lambda_.event_source_mapping.pollers.dynamodb_poller import DynamoDBPoller
from localstack.services.lambda_.event_source_mapping.pollers.kinesis_poller import KinesisPoller
from localstack.services.lambda_.event_source_mapping.pollers.poller import Poller
from localstack.services.lambda_.event_source_mapping.pollers.sqs_poller import SqsPoller
from localstack.services.lambda_.event_source_mapping.pollers.sqs_poller import (
DEFAULT_MAX_WAIT_TIME_SECONDS,
SqsPoller,
)
from localstack.services.lambda_.event_source_mapping.senders.lambda_sender import LambdaSender
from localstack.utils.aws.arns import parse_arn
from localstack.utils.aws.client_types import ServicePrincipal
@@ -111,6 +114,24 @@ def get_esm_worker(self) -> EsmWorker:
role_arn=self.function_role_arn,
service_principal=ServicePrincipal.lambda_,
source_arn=self.esm_config["FunctionArn"],
client_config=botocore.config.Config(
retries={"total_max_attempts": 1}, # Disable retries
read_timeout=max(
self.esm_config.get(
"MaximumBatchingWindowInSeconds", DEFAULT_MAX_WAIT_TIME_SECONDS
),
60,
)
+ 5, # Extend read timeout (with 5s buffer) for long-polling
# Setting tcp_keepalive to true allows the boto client to keep
# a long-running TCP connection when making calls to the gateway.
# This ensures long-poll calls do not have their socket connection
# marked as stale when no data is transferred for a while, which would
# otherwise cause the connection to be dropped or reset prematurely.
# See https://aws.amazon.com/blogs/networking-and-content-delivery/implementing-long-running-tcp-connections-within-vpc-networking/
tcp_keepalive=True,
),
)

filter_criteria = self.esm_config.get("FilterCriteria", {"Filters": []})
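The timeout arithmetic above can be read as a small helper. The sketch below is illustrative only (build_client_config is a hypothetical name, not part of this PR): the read timeout is at least 65 seconds and grows with the configured batching window.

```python
# Sketch only: illustrates the read_timeout calculation used above;
# names and values are assumptions for illustration.
import botocore.config

DEFAULT_MAX_WAIT_TIME_SECONDS = 20  # SQS long-polling maximum WaitTimeSeconds


def build_client_config(maximum_batching_window: int) -> botocore.config.Config:
    # Wait at least 60s (or the batching window, if larger) plus a 5s buffer,
    # so the socket read timeout never fires before the long poll returns.
    return botocore.config.Config(
        retries={"total_max_attempts": 1},  # disable retries
        read_timeout=max(maximum_batching_window or DEFAULT_MAX_WAIT_TIME_SECONDS, 60) + 5,
        tcp_keepalive=True,  # keep the long-running TCP connection alive
    )


print(build_client_config(30).read_timeout)   # 65
print(build_client_config(300).read_timeout)  # 305
```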
@@ -16,13 +16,21 @@
Poller,
parse_batch_item_failures,
)
from localstack.services.sqs.constants import HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT
from localstack.services.lambda_.event_source_mapping.senders.sender_utils import (
batched,
)
from localstack.services.sqs.constants import (
HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT,
HEADER_LOCALSTACK_SQS_OVERRIDE_WAIT_TIME_SECONDS,
)
from localstack.utils.aws.arns import parse_arn
from localstack.utils.strings import first_char_to_lower

LOG = logging.getLogger(__name__)

DEFAULT_MAX_RECEIVE_COUNT = 10
# See https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-short-and-long-polling.html
DEFAULT_MAX_WAIT_TIME_SECONDS = 20


class SqsPoller(Poller):
@@ -71,13 +79,34 @@ def handle_message_count_override(params, context, **kwargs):

context[HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT] = str(requested_count)

def handle_message_wait_time_seconds_override(params, context, **kwargs):
requested_wait = params.pop("sqs_override_wait_time_seconds", None)
if not requested_wait or requested_wait <= DEFAULT_MAX_WAIT_TIME_SECONDS:
return

context[HEADER_LOCALSTACK_SQS_OVERRIDE_WAIT_TIME_SECONDS] = str(requested_wait)

def handle_inject_headers(params, context, **kwargs):
if override := context.pop(HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT, None):
params["headers"][HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT] = override
if override_message_count := context.pop(
HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT, None
):
params["headers"][HEADER_LOCALSTACK_SQS_OVERRIDE_MESSAGE_COUNT] = (
override_message_count
)

if override_wait_time := context.pop(
HEADER_LOCALSTACK_SQS_OVERRIDE_WAIT_TIME_SECONDS, None
):
params["headers"][HEADER_LOCALSTACK_SQS_OVERRIDE_WAIT_TIME_SECONDS] = (
override_wait_time
)

event_system.register(
"provide-client-params.sqs.ReceiveMessage", handle_message_count_override
)
event_system.register(
"provide-client-params.sqs.ReceiveMessage", handle_message_wait_time_seconds_override
)
# Since we delete SQS messages after processing, this allows us to remove up to 10K entries at a time.
event_system.register(
"provide-client-params.sqs.DeleteMessageBatch", handle_message_count_override
@@ -98,30 +127,62 @@ def event_source(self) -> str:
return "aws:sqs"

def poll_events(self) -> None:
# SQS pipe source: https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-pipes-sqs.html
# "The 9 Ways an SQS Message can be Deleted": https://lucvandonkersgoed.com/2022/01/20/the-9-ways-an-sqs-message-can-be-deleted/
# TODO: implement batch window expires based on MaximumBatchingWindowInSeconds
# TODO: implement invocation payload size quota
# TODO: consider long-polling vs. short-polling trade-off. AWS uses long-polling:
# https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-pipes-sqs.html#pipes-sqs-scaling
# In order to improve performance, we've adopted long-polling for the SQS poll operation `ReceiveMessage` [1].
# * Our LS-internal optimizations leverage custom boto-headers to set larger batch sizes and longer wait times than what the AWS API allows [2].
# * Longer batch-collection windows and more records retrieved per request mean fewer calls to the LocalStack gateway [3] when polling an event source [4].
# * LocalStack shutdown works because the LocalStack gateway shuts down and terminates the open connection.
# * Provider lifecycle hooks have been added to ensure blocking long-poll calls are gracefully interrupted and returned.
#
# Pros (+) / Cons (-):
# + Alleviates pressure on the gateway since each `ReceiveMessage` call only returns once we reach the desired `BatchSize` or the `WaitTimeSeconds` elapses.
# + Matches AWS behavior, which also uses long-polling
# - Blocks a LocalStack gateway thread (default 1k) for every open connection, which could lead to resource contention if used at scale.
#
# Refs / Notes:
# [1] Amazon SQS short and long polling: https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-short-and-long-polling.html
# [2] PR (2025-02): https://github.com/localstack/localstack/pull/12002
# [3] Note: Under high volumes of requests, the LocalStack gateway becomes a major performance bottleneck.
# [4] ESM blog mentioning long-polling: https://aws.amazon.com/de/blogs/aws/aws-lambda-adds-amazon-simple-queue-service-to-supported-event-sources/

# TODO: Handle exceptions differently, e.g. QueueNotExist or ConnectionFailed should retry with backoff
response = self.source_client.receive_message(
QueueUrl=self.queue_url,
MaxNumberOfMessages=min(self.batch_size, DEFAULT_MAX_RECEIVE_COUNT),
WaitTimeSeconds=min(self.maximum_batching_window, DEFAULT_MAX_WAIT_TIME_SECONDS),
MessageAttributeNames=["All"],
MessageSystemAttributeNames=[MessageSystemAttributeName.All],
# Override how many messages we can receive per call
sqs_override_max_message_count=self.batch_size,
# Override how long to wait until batching conditions are met
sqs_override_wait_time_seconds=self.maximum_batching_window,
)
if messages := response.get("Messages"):
LOG.debug("Polled %d events from %s", len(messages), self.source_arn)

messages = response.get("Messages", [])
if not messages:
# TODO: Consider this case triggering longer wait-times (with backoff) between poll_events calls in the outer-loop.
return

LOG.debug("Polled %d events from %s", len(messages), self.source_arn)
Member:
It's probably good for debugging to be explicit here. I'm just wondering whether it's intentional to log empty polls as well?

Contributor Author:
I was thinking it could be useful for debugging to log explicitly whether nothing was polled from the event source. Perhaps we can distinguish this better with a "Polled no events from %s" -- wdyt?

Member:
My main thought is around avoiding log pollution (imagine 100 ESMs printing every second), but it's probably worth keeping for now. For example: it would help to identify whether jitter around the 1s interval is needed 💡.

The format is fine, being consistent is good 👍

Member:
I think we should be very careful about this - many people have DEBUG=1 by default, and this can be a lot. I agree it is not urgent to remove - but especially the message for no events could be removed in the future.

# TODO: implement invocation payload size quota
# NOTE: Split up a batch into mini-batches of up to 2.5K records each. This is to prevent exceeding the 6MB size-limit
Member:
nit: we could move the # TODO: implement invocation payload size quota here, clarifying that's only a heuristic and not a perfect parity implementation
# imposed on Lambda invocation payloads, and because LocalStack's Lambda provider handles very large payloads inefficiently.
# See https://docs.aws.amazon.com/lambda/latest/dg/invocation-eventsourcemapping.html#invocation-eventsourcemapping-batching
for message_batch in batched(messages, 2500):
if len(message_batch) < len(messages):
LOG.debug(
"Splitting events from %s into mini-batch (%d/%d)",
self.source_arn,
len(message_batch),
len(messages),
)
try:
if self.is_fifo_queue:
# TODO: think about starvation behavior because once failing message could block other groups
fifo_groups = split_by_message_group_id(messages)
fifo_groups = split_by_message_group_id(message_batch)
for fifo_group_messages in fifo_groups.values():
self.handle_messages(fifo_group_messages)
else:
self.handle_messages(messages)
self.handle_messages(message_batch)

# TODO: unify exception handling across pollers: should we catch and raise?
except Exception as e:
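The mini-batch split in poll_events relies on the batched() helper imported from sender_utils, whose implementation is not part of this diff; below is a minimal itertools-based sketch of such a helper, under that assumption.

```python
# Hedged sketch of a batched() helper (the real one lives in sender_utils and
# may differ): yields successive tuples of at most n items from an iterable.
import itertools
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def batched(iterable: Iterable[T], n: int) -> Iterator[tuple[T, ...]]:
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    while batch := tuple(itertools.islice(it, n)):
        yield batch


# Example: 6,000 polled messages are split into mini-batches of 2,500 + 2,500 + 1,000.
assert [len(b) for b in batched(range(6_000), 2_500)] == [2_500, 2_500, 1_000]
```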
@@ -1,4 +1,5 @@
import json
import math
import time

import pytest
@@ -1077,9 +1078,6 @@ def get_msg_from_q():
events = retry(get_msg_from_q, retries=15, sleep=5)
snapshot.match("Records", events)

# FIXME: this fails due to ESM not correctly collecting and sending batches
# where size exceeds 10 messages.
@markers.snapshot.skip_snapshot_verify(paths=["$..total_batches_received"])
@markers.aws.validated
def test_sqs_event_source_mapping_batching_reserved_concurrency(
self,
@@ -1117,10 +1115,16 @@ def test_sqs_event_source_mapping_batching_reserved_concurrency(
queue_url = sqs_create_queue(QueueName=source_queue_name)
queue_arn = sqs_get_queue_arn(queue_url)

for b in range(3):
aws_client.sqs.send_message_batch(
QueueUrl=queue_url,
Entries=[{"Id": f"{i}-{b}", "MessageBody": f"{i}-{b}-message"} for i in range(10)],
)

create_event_source_mapping_response = aws_client.lambda_.create_event_source_mapping(
EventSourceArn=queue_arn,
FunctionName=function_name,
MaximumBatchingWindowInSeconds=10,
MaximumBatchingWindowInSeconds=1,
BatchSize=20,
ScalingConfig={
"MaximumConcurrency": 2
@@ -1131,12 +1135,6 @@
snapshot.match("create-event-source-mapping-response", create_event_source_mapping_response)
_await_event_source_mapping_enabled(aws_client.lambda_, mapping_uuid)

for b in range(3):
aws_client.sqs.send_message_batch(
QueueUrl=queue_url,
Entries=[{"Id": f"{i}-{b}", "MessageBody": f"{i}-{b}-message"} for i in range(10)],
)

batches = []

def get_msg_from_q():
@@ -1566,13 +1564,7 @@ def test_duplicate_event_source_mappings(
20,
100,
1_000,
pytest.param(
10_000,
marks=pytest.mark.skip(
reason="Flushing based on payload sizes not yet implemented so large payloads are causing issues."
),
id="10000",
),
10_000,
],
)
@markers.aws.only_localstack
@@ -1617,17 +1609,72 @@ def test_sqs_event_source_mapping_batch_size_override(
cleanups.append(lambda: aws_client.lambda_.delete_event_source_mapping(UUID=mapping_uuid))
_await_event_source_mapping_enabled(aws_client.lambda_, mapping_uuid)

expected_invocations = math.ceil(batch_size / 2500)
events = retry(
check_expected_lambda_log_events_length,
retries=10,
sleep=1,
function_name=function_name,
expected_length=expected_invocations,
logs_client=aws_client.logs,
)

assert sum(len(event.get("Records", [])) for event in events) == batch_size

rs = aws_client.sqs.receive_message(QueueUrl=queue_url)
assert rs.get("Messages", []) == []

@markers.aws.only_localstack
def test_sqs_event_source_mapping_batching_window_size_override(
self,
create_lambda_function,
sqs_create_queue,
sqs_get_queue_arn,
lambda_su_role,
cleanups,
aws_client,
):
function_name = f"lambda_func-{short_uid()}"
queue_name = f"queue-{short_uid()}"
mapping_uuid = None

create_lambda_function(
func_name=function_name,
handler_file=TEST_LAMBDA_PYTHON_ECHO,
runtime=Runtime.python3_12,
role=lambda_su_role,
)
queue_url = sqs_create_queue(QueueName=queue_name)
queue_arn = sqs_get_queue_arn(queue_url)

create_event_source_mapping_response = aws_client.lambda_.create_event_source_mapping(
EventSourceArn=queue_arn,
FunctionName=function_name,
MaximumBatchingWindowInSeconds=30,
BatchSize=10_000,
)
mapping_uuid = create_event_source_mapping_response["UUID"]
cleanups.append(lambda: aws_client.lambda_.delete_event_source_mapping(UUID=mapping_uuid))
_await_event_source_mapping_enabled(aws_client.lambda_, mapping_uuid)

# Send 4 messages and delay their arrival by 5, 10, 15, and 25 seconds respectively
for s in [5, 10, 15, 25]:
aws_client.sqs.send_message(
QueueUrl=queue_url,
MessageBody=json.dumps({"delayed": f"{s}"}),
DelaySeconds=s,
)

events = retry(
check_expected_lambda_log_events_length,
retries=60,
sleep=1,
function_name=function_name,
expected_length=1,
logs_client=aws_client.logs,
)

assert len(events) == 1
assert len(events[0].get("Records", [])) == batch_size
assert len(events[0].get("Records", [])) == 4

rs = aws_client.sqs.receive_message(QueueUrl=queue_url)
assert rs.get("Messages", []) == []
@@ -2033,7 +2033,7 @@
}
},
"tests/aws/services/lambda_/event_source_mapping/test_lambda_integration_sqs.py::TestSQSEventSourceMapping::test_sqs_event_source_mapping_batching_reserved_concurrency": {
"recorded-date": "29-11-2024, 13:29:56",
"recorded-date": "25-02-2025, 16:35:01",
"recorded-content": {
"put_concurrency_resp": {
"ReservedConcurrentExecutions": 2,
@@ -2049,7 +2049,7 @@
"FunctionArn": "arn:<partition>:lambda:<region>:111111111111:function:<resource:2>",
"FunctionResponseTypes": [],
"LastModified": "<datetime>",
"MaximumBatchingWindowInSeconds": 10,
"MaximumBatchingWindowInSeconds": 1,
"ScalingConfig": {
"MaximumConcurrency": 2
},
@@ -87,7 +87,7 @@
"last_validated_date": "2024-12-11T13:42:55+00:00"
},
"tests/aws/services/lambda_/event_source_mapping/test_lambda_integration_sqs.py::TestSQSEventSourceMapping::test_sqs_event_source_mapping_batching_reserved_concurrency": {
"last_validated_date": "2024-11-29T13:29:53+00:00"
"last_validated_date": "2025-02-25T16:34:59+00:00"
},
"tests/aws/services/lambda_/event_source_mapping/test_lambda_integration_sqs.py::TestSQSEventSourceMapping::test_sqs_event_source_mapping_update": {
"last_validated_date": "2024-10-12T13:45:43+00:00"