Commit 25a7812

Fix JSON error reporting for many cases of erroneous string values.
The majority of error exit cases in json_lex_string() failed to set lex->token_terminator, causing problems for the error context reporting code: it would see token_terminator less than token_start and do something more or less nuts. In v14 and up the end result could be as bad as a crash in report_json_context(). Older versions accidentally avoided that fate; but all versions produce error context lines that are far less useful than intended, because they'd stop at the end of the prior token instead of continuing to where the actually-bad input is.

To fix, invent some macros that make it less notationally painful to do the right thing. Also add documentation about what the function is actually required to do; and in >= v14, add an assertion in report_json_context about token_terminator being sufficiently far advanced.

Per report from Nikolay Shaplov.

Back-patch to all supported versions.

Discussion: https://postgr.es/m/7332649.x5DLKWyVIX@thinkpad-pgpro
1 parent 30dbdbe commit 25a7812
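
As a quick illustration of the user-visible change, here is one of the regression-test cases touched below, a sketch assembled from the expected output in src/test/regress/expected/json_encoding.out (exact psql spacing may differ). The commented-out CONTEXT line shows how the context used to be truncated at the end of the prior token:

    -- two high surrogates in a row are rejected while lexing the string value
    select json '{ "a": "\ud83d\ud83d" }' -> 'a';
    ERROR: invalid input syntax for type json
    DETAIL: Unicode high surrogate must not follow a high surrogate.
    CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
    -- before this commit, the context line stopped short of the bad input:
    -- CONTEXT: JSON data, line 1: { "a":...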

4 files changed (+78, -59 lines changed)


src/backend/utils/adt/jsonfuncs.c (1 addition, 0 deletions)

@@ -675,6 +675,7 @@ report_json_context(JsonLexContext *lex)
     line_start = lex->line_start;
     context_start = line_start;
     context_end = lex->token_terminator;
+    Assert(context_end >= context_start);
 
     /* Advance until we are close enough to context_end */
     while (context_end - context_start >= 50)

src/common/jsonapi.c (47 additions, 29 deletions)

@@ -697,6 +697,14 @@ json_lex(JsonLexContext *lex)
 
 /*
  * The next token in the input stream is known to be a string; lex it.
+ *
+ * If lex->strval isn't NULL, fill it with the decoded string.
+ * Set lex->token_terminator to the end of the decoded input, and in
+ * success cases, transfer its previous value to lex->prev_token_terminator.
+ * Return JSON_SUCCESS or an error code.
+ *
+ * Note: be careful that all error exits advance lex->token_terminator
+ * to the point after the character we detected the error on.
  */
 static inline JsonParseErrorType
 json_lex_string(JsonLexContext *lex)
@@ -705,6 +713,19 @@ json_lex_string(JsonLexContext *lex)
     char *const end = lex->input + lex->input_length;
     int hi_surrogate = -1;
 
+    /* Convenience macros for error exits */
+#define FAIL_AT_CHAR_START(code) \
+    do { \
+        lex->token_terminator = s; \
+        return code; \
+    } while (0)
+#define FAIL_AT_CHAR_END(code) \
+    do { \
+        lex->token_terminator = \
+            s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
+        return code; \
+    } while (0)
+
     if (lex->strval != NULL)
         resetStringInfo(lex->strval);
 
@@ -715,21 +736,15 @@ json_lex_string(JsonLexContext *lex)
         s++;
         /* Premature end of the string. */
         if (s >= end)
-        {
-            lex->token_terminator = s;
-            return JSON_INVALID_TOKEN;
-        }
+            FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
         else if (*s == '"')
             break;
         else if (*s == '\\')
         {
            /* OK, we have an escape character. */
            s++;
            if (s >= end)
-           {
-               lex->token_terminator = s;
-               return JSON_INVALID_TOKEN;
-           }
+               FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
            else if (*s == 'u')
            {
                int i;
@@ -739,21 +754,15 @@ json_lex_string(JsonLexContext *lex)
                {
                    s++;
                    if (s >= end)
-                   {
-                       lex->token_terminator = s;
-                       return JSON_INVALID_TOKEN;
-                   }
+                       FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
                    else if (*s >= '0' && *s <= '9')
                        ch = (ch * 16) + (*s - '0');
                    else if (*s >= 'a' && *s <= 'f')
                        ch = (ch * 16) + (*s - 'a') + 10;
                    else if (*s >= 'A' && *s <= 'F')
                        ch = (ch * 16) + (*s - 'A') + 10;
                    else
-                   {
-                       lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
-                       return JSON_UNICODE_ESCAPE_FORMAT;
-                   }
+                       FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
                }
                if (lex->strval != NULL)
                {
@@ -763,20 +772,20 @@ json_lex_string(JsonLexContext *lex)
                    if (is_utf16_surrogate_first(ch))
                    {
                        if (hi_surrogate != -1)
-                           return JSON_UNICODE_HIGH_SURROGATE;
+                           FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
                        hi_surrogate = ch;
                        continue;
                    }
                    else if (is_utf16_surrogate_second(ch))
                    {
                        if (hi_surrogate == -1)
-                           return JSON_UNICODE_LOW_SURROGATE;
+                           FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
                        ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
                        hi_surrogate = -1;
                    }
 
                    if (hi_surrogate != -1)
-                       return JSON_UNICODE_LOW_SURROGATE;
+                       FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 
                    /*
                     * Reject invalid cases. We can't have a value above
@@ -786,7 +795,7 @@ json_lex_string(JsonLexContext *lex)
                    if (ch == 0)
                    {
                        /* We can't allow this, since our TEXT type doesn't */
-                       return JSON_UNICODE_CODE_POINT_ZERO;
+                       FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
                    }
 
                    /*
@@ -800,7 +809,7 @@ json_lex_string(JsonLexContext *lex)
                        char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 
                        if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
-                           return JSON_UNICODE_UNTRANSLATABLE;
+                           FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE);
                        appendStringInfoString(lex->strval, cbuf);
                    }
 #else
@@ -820,14 +829,14 @@ json_lex_string(JsonLexContext *lex)
                        appendStringInfoChar(lex->strval, (char) ch);
                    }
                    else
-                       return JSON_UNICODE_HIGH_ESCAPE;
+                       FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
 #endif /* FRONTEND */
                }
            }
            else if (lex->strval != NULL)
            {
                if (hi_surrogate != -1)
-                   return JSON_UNICODE_LOW_SURROGATE;
+                   FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 
                switch (*s)
                {
@@ -852,10 +861,14 @@ json_lex_string(JsonLexContext *lex)
                        appendStringInfoChar(lex->strval, '\t');
                        break;
                    default:
-                       /* Not a valid string escape, so signal error. */
+
+                       /*
+                        * Not a valid string escape, so signal error. We
+                        * adjust token_start so that just the escape sequence
+                        * is reported, not the whole string.
+                        */
                        lex->token_start = s;
-                       lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
-                       return JSON_ESCAPING_INVALID;
+                       FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
                }
            }
            else if (strchr("\"\\/bfnrt", *s) == NULL)
@@ -868,16 +881,15 @@ json_lex_string(JsonLexContext *lex)
                 * shown it's not a performance win.
                 */
                lex->token_start = s;
-               lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
-               return JSON_ESCAPING_INVALID;
+               FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
            }
        }
        else
        {
            char *p = s;
 
            if (hi_surrogate != -1)
-               return JSON_UNICODE_LOW_SURROGATE;
+               FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
 
            /*
             * Skip to the first byte that requires special handling, so we
@@ -917,12 +929,18 @@ json_lex_string(JsonLexContext *lex)
     }
 
     if (hi_surrogate != -1)
+    {
+        lex->token_terminator = s + 1;
         return JSON_UNICODE_LOW_SURROGATE;
+    }
 
     /* Hooray, we found the end of the string! */
     lex->prev_token_terminator = lex->token_terminator;
     lex->token_terminator = s + 1;
     return JSON_SUCCESS;
+
+#undef FAIL_AT_CHAR_START
+#undef FAIL_AT_CHAR_END
 }
 
 /*

src/test/regress/expected/json_encoding.out (12 additions, 12 deletions)

@@ -56,19 +56,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
 select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR: invalid input syntax for type json
 DETAIL: Unicode high surrogate must not follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
 select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR: invalid input syntax for type json
 DETAIL: Unicode low surrogate must follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ude04...
 select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR: invalid input syntax for type json
 DETAIL: Unicode low surrogate must follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ud83dX...
 select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR: invalid input syntax for type json
 DETAIL: Unicode low surrogate must follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ude04...
 --handling of simple unicode escapes
 select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
 correct_in_utf8
@@ -121,7 +121,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
 select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
 ERROR: unsupported Unicode escape sequence
 DETAIL: \u0000 cannot be converted to text.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "null \u0000...
 select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
 not_an_escape
 --------------------
@@ -159,7 +159,7 @@ ERROR: unsupported Unicode escape sequence
 LINE 1: SELECT '"\u0000"'::jsonb;
 ^
 DETAIL: \u0000 cannot be converted to text.
-CONTEXT: JSON data, line 1: ...
+CONTEXT: JSON data, line 1: "\u0000...
 -- use octet_length here so we don't get an odd unicode char in the
 -- output
 SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -180,25 +180,25 @@ ERROR: invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
 ^
 DETAIL: Unicode high surrogate must not follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
 SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR: invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a';
 ^
 DETAIL: Unicode low surrogate must follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ude04...
 SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR: invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a';
 ^
 DETAIL: Unicode low surrogate must follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ud83dX...
 SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR: invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a';
 ^
 DETAIL: Unicode low surrogate must follow a high surrogate.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "\ude04...
 -- handling of simple unicode escapes
 SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
 correct_in_utf8
@@ -223,7 +223,7 @@ ERROR: unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
 ^
 DETAIL: \u0000 cannot be converted to text.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "null \u0000...
 SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
 not_an_escape
 ------------------------------
@@ -253,7 +253,7 @@ ERROR: unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
 ^
 DETAIL: \u0000 cannot be converted to text.
-CONTEXT: JSON data, line 1: { "a":...
+CONTEXT: JSON data, line 1: { "a": "null \u0000...
 SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
 not_an_escape
 --------------------
