Skip to content

Commit d72ad84

Browse files
authored
FIX: Retry parsing escaped inner JSON to handle control chars. (#1357)
The structured output JSON comes embedded inside the API response, which is also a JSON. Since we have to parse the response to process it, any control characters inside the structured output are unescaped into regular characters, leading to invalid JSON and breaking during parsing. This change adds a retry mechanism that escapes the string again if parsing fails, preventing the parser from breaking on malformed input and working around this issue. For example: ``` original = '{ "a": "{\\"key\\":\\"value with \\n newline\\"}" }' JSON.parse(original) => { "a" => "{\"key\":\"value with \n newline\"}" } # At this point, the inner JSON string contains an actual newline. ```
1 parent e207eba commit d72ad84

File tree

5 files changed

+49
-10
lines changed

5 files changed

+49
-10
lines changed

lib/completions/json_streaming_tracker.rb

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,21 @@ def broken?
2828
@broken
2929
end
3030

31-
def <<(json)
31+
def <<(raw_json)
3232
# llm could send broken json
3333
# in that case just deal with it later
3434
# don't stream
3535
return if @broken
3636

3737
begin
38-
@parser << json
38+
@parser << raw_json
3939
rescue DiscourseAi::Completions::ParserError
40-
@broken = true
41-
return
40+
# Note: We're parsing JSON content that was itself embedded as a string inside another JSON object.
41+
# During the outer JSON.parse, any escaped control characters (like "\\n") are unescaped to real characters ("\n"),
42+
# which corrupts the inner JSON structure when passed to the parser here.
43+
# To handle this, we retry parsing with the string JSON-escaped again (`.dump[1..-2]`) if the first attempt fails.
44+
try_escape_and_parse(raw_json)
45+
return if @broken
4246
end
4347

4448
if @parser.state == :start_string && @current_key
@@ -48,6 +52,20 @@ def <<(json)
4852

4953
@current_key = nil if @parser.state == :end_value
5054
end
55+
56+
private
57+
58+
def try_escape_and_parse(raw_json)
59+
if raw_json.blank? || !raw_json.is_a?(String)
60+
@broken = true
61+
return
62+
end
63+
# Escape the string as JSON and remove surrounding quotes
64+
escaped_json = raw_json.dump[1..-2]
65+
@parser << escaped_json
66+
rescue DiscourseAi::Completions::ParserError
67+
@broken = true
68+
end
5169
end
5270
end
5371
end

spec/lib/completions/endpoints/anthropic_spec.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,9 @@
809809
event: content_block_delta
810810
data: {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "Hello!"}}
811811
812+
event: content_block_delta
813+
data: {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "\\n there"}}
814+
812815
event: content_block_delta
813816
data: {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "\\"}"}}
814817
@@ -845,7 +848,7 @@
845848
response_format: schema,
846849
) { |partial, cancel| structured_output = partial }
847850

848-
expect(structured_output.read_buffered_property(:key)).to eq("Hello!")
851+
expect(structured_output.read_buffered_property(:key)).to eq("Hello!\n there")
849852

850853
expected_body = {
851854
model: "claude-3-opus-20240229",

spec/lib/completions/endpoints/aws_bedrock_spec.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ def encode_message(message)
574574
{ type: "content_block_delta", delta: { text: "key" } },
575575
{ type: "content_block_delta", delta: { text: "\":\"" } },
576576
{ type: "content_block_delta", delta: { text: "Hello!" } },
577+
{ type: "content_block_delta", delta: { text: "\n There" } },
577578
{ type: "content_block_delta", delta: { text: "\"}" } },
578579
{ type: "message_delta", delta: { usage: { output_tokens: 25 } } },
579580
].map { |message| encode_message(message) }
@@ -607,7 +608,7 @@ def encode_message(message)
607608
}
608609
expect(JSON.parse(request.body)).to eq(expected)
609610

610-
expect(structured_output.read_buffered_property(:key)).to eq("Hello!")
611+
expect(structured_output.read_buffered_property(:key)).to eq("Hello!\n There")
611612
end
612613
end
613614
end

spec/lib/completions/endpoints/cohere_spec.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,9 @@
334334
{"is_finished":false,"event_type":"text-generation","text":"key"}
335335
{"is_finished":false,"event_type":"text-generation","text":"\\":\\""}
336336
{"is_finished":false,"event_type":"text-generation","text":"Hello!"}
337+
{"is_finished":false,"event_type":"text-generation","text":"\\n there"}
337338
{"is_finished":false,"event_type":"text-generation","text":"\\"}"}|
338-
{"is_finished":true,"event_type":"stream-end","response":{"response_id":"d235db17-8555-493b-8d91-e601f76de3f9","text":"{\\"key\\":\\"Hello!\\"}","generation_id":"eb889b0f-c27d-45ea-98cf-567bdb7fc8bf","chat_history":[{"role":"USER","message":"user1: hello"},{"role":"CHATBOT","message":"hi user"},{"role":"USER","message":"user1: thanks"},{"role":"CHATBOT","message":"You're welcome! Is there anything else I can help you with?"}],"token_count":{"prompt_tokens":29,"response_tokens":14,"total_tokens":43,"billed_tokens":28},"meta":{"api_version":{"version":"1"},"billed_units":{"input_tokens":14,"output_tokens":14}}},"finish_reason":"COMPLETE"}
339+
{"is_finished":true,"event_type":"stream-end","response":{"response_id":"d235db17-8555-493b-8d91-e601f76de3f9","text":"{\\"key\\":\\"Hello! \\n there\\"}","generation_id":"eb889b0f-c27d-45ea-98cf-567bdb7fc8bf","chat_history":[{"role":"USER","message":"user1: hello"},{"role":"CHATBOT","message":"hi user"},{"role":"USER","message":"user1: thanks"},{"role":"CHATBOT","message":"You're welcome! Is there anything else I can help you with?"}],"token_count":{"prompt_tokens":29,"response_tokens":14,"total_tokens":43,"billed_tokens":28},"meta":{"api_version":{"version":"1"},"billed_units":{"input_tokens":14,"output_tokens":14}}},"finish_reason":"COMPLETE"}
339340
TEXT
340341

341342
parsed_body = nil
@@ -366,6 +367,6 @@
366367
)
367368
expect(parsed_body[:message]).to eq("user1: thanks")
368369

369-
expect(structured_output.read_buffered_property(:key)).to eq("Hello!")
370+
expect(structured_output.read_buffered_property(:key)).to eq("Hello!\n there")
370371
end
371372
end

spec/lib/completions/endpoints/gemini_spec.rb

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,9 @@ def tool_response
524524
key: {
525525
type: "string",
526526
},
527+
num: {
528+
type: "integer",
529+
},
527530
},
528531
required: ["key"],
529532
additionalProperties: false,
@@ -541,7 +544,19 @@ def tool_response
541544
542545
data: {"candidates": [{"content": {"parts": [{"text": "Hello!"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
543546
544-
data: {"candidates": [{"content": {"parts": [{"text": "\\"}"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
547+
data: {"candidates": [{"content": {"parts": [{"text": "\\n there"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
548+
549+
data: {"candidates": [{"content": {"parts": [{"text": "\\","}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
550+
551+
data: {"candidates": [{"content": {"parts": [{"text": "\\""}],"role": "model"}}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}
552+
553+
data: {"candidates": [{"content": {"parts": [{"text": "num"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
554+
555+
data: {"candidates": [{"content": {"parts": [{"text": "\\":"}],"role": "model"},"safetyRatings": [{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}
556+
557+
data: {"candidates": [{"content": {"parts": [{"text": "42"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
558+
559+
data: {"candidates": [{"content": {"parts": [{"text": "}"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
545560
546561
data: {"candidates": [{"finishReason": "MALFORMED_FUNCTION_CALL"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}
547562
@@ -565,7 +580,8 @@ def tool_response
565580
structured_response = partial
566581
end
567582

568-
expect(structured_response.read_buffered_property(:key)).to eq("Hello!")
583+
expect(structured_response.read_buffered_property(:key)).to eq("Hello!\n there")
584+
expect(structured_response.read_buffered_property(:num)).to eq(42)
569585

570586
parsed = JSON.parse(req_body, symbolize_names: true)
571587

0 commit comments

Comments
 (0)