FIX: Retry parsing escaped inner JSON to handle control chars. (#1357)

romanrizzi · web-flow · commit d72ad84f8f57 · 2025-05-21T11:25:59.000-03:00
The structured output JSON comes embedded inside the API response, which is also a JSON. Since we have to parse the response to process it, any control characters inside the structured output are unescaped into regular characters, leading to invalid JSON and breaking during parsing. This change adds a retry mechanism that escapes
the string again if parsing fails, preventing the parser from breaking on malformed input and working around this issue.

For example:

```
  original = '{ "a": "{\\"key\\":\\"value with \\n newline\\"}" }'
  JSON.parse(original) =&gt; { "a" =&gt; "{\"key\":\"value with \n newline\"}" }
  # At this point, the inner JSON string contains an actual newline.
```
diff --git a/lib/completions/json_streaming_tracker.rb b/lib/completions/json_streaming_tracker.rb
@@ -28,17 +28,21 @@ def broken?
         @broken
       end
 
-      def <<(json)
+      def <<(raw_json)
         # llm could send broken json
         # in that case just deal with it later
         # don't stream
         return if @broken
 
         begin
-          @parser << json
+          @parser << raw_json
         rescue DiscourseAi::Completions::ParserError
-          @broken = true
-          return
+          # Note: We're parsing JSON content that was itself embedded as a string inside another JSON object.
+          # During the outer JSON.parse, any escaped control characters (like "\\n") are unescaped to real characters ("\n"),
+          # which corrupts the inner JSON structure when passed to the parser here.
+          # To handle this, we retry parsing with the string JSON-escaped again (`.dump[1..-2]`) if the first attempt fails.
+          try_escape_and_parse(raw_json)
+          return if @broken
         end
 
         if @parser.state == :start_string && @current_key
@@ -48,6 +52,20 @@ def <<(json)
 
         @current_key = nil if @parser.state == :end_value
       end
+
+      private
+
+      def try_escape_and_parse(raw_json)
+        if raw_json.blank? || !raw_json.is_a?(String)
+          @broken = true
+          return
+        end
+        # Escape the string as JSON and remove surrounding quotes
+        escaped_json = raw_json.dump[1..-2]
+        @parser << escaped_json
+      rescue DiscourseAi::Completions::ParserError
+        @broken = true
+      end
     end
   end
 end
diff --git a/spec/lib/completions/endpoints/anthropic_spec.rb b/spec/lib/completions/endpoints/anthropic_spec.rb
@@ -809,6 +809,9 @@
       event: content_block_delta
       data: {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "Hello!"}}
 
+      event: content_block_delta
+      data: {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "\\n there"}}
+
       event: content_block_delta
       data: {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "\\"}"}}
 
@@ -845,7 +848,7 @@
         response_format: schema,
       ) { |partial, cancel| structured_output = partial }
 
-      expect(structured_output.read_buffered_property(:key)).to eq("Hello!")
+      expect(structured_output.read_buffered_property(:key)).to eq("Hello!\n there")
 
       expected_body = {
         model: "claude-3-opus-20240229",
diff --git a/spec/lib/completions/endpoints/aws_bedrock_spec.rb b/spec/lib/completions/endpoints/aws_bedrock_spec.rb
@@ -574,6 +574,7 @@ def encode_message(message)
           { type: "content_block_delta", delta: { text: "key" } },
           { type: "content_block_delta", delta: { text: "\":\"" } },
           { type: "content_block_delta", delta: { text: "Hello!" } },
+          { type: "content_block_delta", delta: { text: "\n There" } },
           { type: "content_block_delta", delta: { text: "\"}" } },
           { type: "message_delta", delta: { usage: { output_tokens: 25 } } },
         ].map { |message| encode_message(message) }
@@ -607,7 +608,7 @@ def encode_message(message)
         }
         expect(JSON.parse(request.body)).to eq(expected)
 
-        expect(structured_output.read_buffered_property(:key)).to eq("Hello!")
+        expect(structured_output.read_buffered_property(:key)).to eq("Hello!\n There")
       end
     end
   end
diff --git a/spec/lib/completions/endpoints/cohere_spec.rb b/spec/lib/completions/endpoints/cohere_spec.rb
@@ -334,8 +334,9 @@
       {"is_finished":false,"event_type":"text-generation","text":"key"}
       {"is_finished":false,"event_type":"text-generation","text":"\\":\\""}
       {"is_finished":false,"event_type":"text-generation","text":"Hello!"}
+      {"is_finished":false,"event_type":"text-generation","text":"\\n there"}
       {"is_finished":false,"event_type":"text-generation","text":"\\"}"}|
-      {"is_finished":true,"event_type":"stream-end","response":{"response_id":"d235db17-8555-493b-8d91-e601f76de3f9","text":"{\\"key\\":\\"Hello!\\"}","generation_id":"eb889b0f-c27d-45ea-98cf-567bdb7fc8bf","chat_history":[{"role":"USER","message":"user1: hello"},{"role":"CHATBOT","message":"hi user"},{"role":"USER","message":"user1: thanks"},{"role":"CHATBOT","message":"You're welcome! Is there anything else I can help you with?"}],"token_count":{"prompt_tokens":29,"response_tokens":14,"total_tokens":43,"billed_tokens":28},"meta":{"api_version":{"version":"1"},"billed_units":{"input_tokens":14,"output_tokens":14}}},"finish_reason":"COMPLETE"}
+      {"is_finished":true,"event_type":"stream-end","response":{"response_id":"d235db17-8555-493b-8d91-e601f76de3f9","text":"{\\"key\\":\\"Hello! \\n there\\"}","generation_id":"eb889b0f-c27d-45ea-98cf-567bdb7fc8bf","chat_history":[{"role":"USER","message":"user1: hello"},{"role":"CHATBOT","message":"hi user"},{"role":"USER","message":"user1: thanks"},{"role":"CHATBOT","message":"You're welcome! Is there anything else I can help you with?"}],"token_count":{"prompt_tokens":29,"response_tokens":14,"total_tokens":43,"billed_tokens":28},"meta":{"api_version":{"version":"1"},"billed_units":{"input_tokens":14,"output_tokens":14}}},"finish_reason":"COMPLETE"}
     TEXT
 
     parsed_body = nil
@@ -366,6 +367,6 @@
     )
     expect(parsed_body[:message]).to eq("user1: thanks")
 
-    expect(structured_output.read_buffered_property(:key)).to eq("Hello!")
+    expect(structured_output.read_buffered_property(:key)).to eq("Hello!\n there")
   end
 end
diff --git a/spec/lib/completions/endpoints/gemini_spec.rb b/spec/lib/completions/endpoints/gemini_spec.rb
@@ -524,6 +524,9 @@ def tool_response
               key: {
                 type: "string",
               },
+              num: {
+                type: "integer",
+              },
             },
             required: ["key"],
             additionalProperties: false,
@@ -541,7 +544,19 @@ def tool_response
 
         data: {"candidates": [{"content": {"parts": [{"text": "Hello!"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
 
-        data: {"candidates": [{"content": {"parts": [{"text": "\\"}"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
+        data: {"candidates": [{"content": {"parts": [{"text": "\\n there"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
+
+        data: {"candidates": [{"content": {"parts": [{"text": "\\","}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
+
+        data: {"candidates": [{"content": {"parts": [{"text": "\\""}],"role": "model"}}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}
+        
+        data: {"candidates": [{"content": {"parts": [{"text": "num"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
+
+        data: {"candidates": [{"content": {"parts": [{"text": "\\":"}],"role": "model"},"safetyRatings": [{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}
+
+        data: {"candidates": [{"content": {"parts": [{"text": "42"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
+
+        data: {"candidates": [{"content": {"parts": [{"text": "}"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
 
         data: {"candidates": [{"finishReason": "MALFORMED_FUNCTION_CALL"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
 
@@ -565,7 +580,8 @@ def tool_response
         structured_response = partial
       end
 
-      expect(structured_response.read_buffered_property(:key)).to eq("Hello!")
+      expect(structured_response.read_buffered_property(:key)).to eq("Hello!\n there")
+      expect(structured_response.read_buffered_property(:num)).to eq(42)
 
       parsed = JSON.parse(req_body, symbolize_names: true)