Skip to content

Commit 0ee9d68

Browse files
committed
Fix JSON error reporting for many cases of erroneous string values.
The majority of error exit cases in json_lex_string() failed to set lex->token_terminator, causing problems for the error context reporting code: it would see token_terminator less than token_start and do something more or less nuts. In v14 and up the end result could be as bad as a crash in report_json_context(). Older versions accidentally avoided that fate; but all versions produce error context lines that are far less useful than intended, because they'd stop at the end of the prior token instead of continuing to where the actually-bad input is.

To fix, invent some macros that make it less notationally painful to do the right thing. Also add documentation about what the function is actually required to do; and in >= v14, add an assertion in report_json_context about token_terminator being sufficiently far advanced.

Per report from Nikolay Shaplov. Back-patch to all supported versions.

Discussion: https://postgr.es/m/7332649.x5DLKWyVIX@thinkpad-pgpro
1 parent 096e708 commit 0ee9d68

File tree

4 files changed

+72
-54
lines changed

4 files changed

+72
-54
lines changed

src/backend/utils/adt/jsonfuncs.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,7 @@ report_json_context(JsonLexContext *lex)
656656
line_start = lex->line_start;
657657
context_start = line_start;
658658
context_end = lex->token_terminator;
659+
Assert(context_end >= context_start);
659660

660661
/* Advance until we are close enough to context_end */
661662
while (context_end - context_start >= 50)

src/common/jsonapi.c

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,14 @@ json_lex(JsonLexContext *lex)
675675

676676
/*
677677
* The next token in the input stream is known to be a string; lex it.
678+
*
679+
* If lex->strval isn't NULL, fill it with the decoded string.
680+
* Set lex->token_terminator to the end of the decoded input, and in
681+
* success cases, transfer its previous value to lex->prev_token_terminator.
682+
* Return JSON_SUCCESS or an error code.
683+
*
684+
* Note: be careful that all error exits advance lex->token_terminator
685+
* to the point after the character we detected the error on.
678686
*/
679687
static inline JsonParseErrorType
680688
json_lex_string(JsonLexContext *lex)
@@ -683,6 +691,19 @@ json_lex_string(JsonLexContext *lex)
683691
int len;
684692
int hi_surrogate = -1;
685693

694+
/* Convenience macros for error exits */
695+
#define FAIL_AT_CHAR_START(code) \
696+
do { \
697+
lex->token_terminator = s; \
698+
return code; \
699+
} while (0)
700+
#define FAIL_AT_CHAR_END(code) \
701+
do { \
702+
lex->token_terminator = \
703+
s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
704+
return code; \
705+
} while (0)
706+
686707
if (lex->strval != NULL)
687708
resetStringInfo(lex->strval);
688709

@@ -695,29 +716,22 @@ json_lex_string(JsonLexContext *lex)
695716
len++;
696717
/* Premature end of the string. */
697718
if (len >= lex->input_length)
698-
{
699-
lex->token_terminator = s;
700-
return JSON_INVALID_TOKEN;
701-
}
719+
FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
702720
else if (*s == '"')
703721
break;
704722
else if ((unsigned char) *s < 32)
705723
{
706724
/* Per RFC4627, these characters MUST be escaped. */
707725
/* Since *s isn't printable, exclude it from the context string */
708-
lex->token_terminator = s;
709-
return JSON_ESCAPING_REQUIRED;
726+
FAIL_AT_CHAR_START(JSON_ESCAPING_REQUIRED);
710727
}
711728
else if (*s == '\\')
712729
{
713730
/* OK, we have an escape character. */
714731
s++;
715732
len++;
716733
if (len >= lex->input_length)
717-
{
718-
lex->token_terminator = s;
719-
return JSON_INVALID_TOKEN;
720-
}
734+
FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
721735
else if (*s == 'u')
722736
{
723737
int i;
@@ -728,21 +742,15 @@ json_lex_string(JsonLexContext *lex)
728742
s++;
729743
len++;
730744
if (len >= lex->input_length)
731-
{
732-
lex->token_terminator = s;
733-
return JSON_INVALID_TOKEN;
734-
}
745+
FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
735746
else if (*s >= '0' && *s <= '9')
736747
ch = (ch * 16) + (*s - '0');
737748
else if (*s >= 'a' && *s <= 'f')
738749
ch = (ch * 16) + (*s - 'a') + 10;
739750
else if (*s >= 'A' && *s <= 'F')
740751
ch = (ch * 16) + (*s - 'A') + 10;
741752
else
742-
{
743-
lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
744-
return JSON_UNICODE_ESCAPE_FORMAT;
745-
}
753+
FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
746754
}
747755
if (lex->strval != NULL)
748756
{
@@ -752,20 +760,20 @@ json_lex_string(JsonLexContext *lex)
752760
if (is_utf16_surrogate_first(ch))
753761
{
754762
if (hi_surrogate != -1)
755-
return JSON_UNICODE_HIGH_SURROGATE;
763+
FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
756764
hi_surrogate = ch;
757765
continue;
758766
}
759767
else if (is_utf16_surrogate_second(ch))
760768
{
761769
if (hi_surrogate == -1)
762-
return JSON_UNICODE_LOW_SURROGATE;
770+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
763771
ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
764772
hi_surrogate = -1;
765773
}
766774

767775
if (hi_surrogate != -1)
768-
return JSON_UNICODE_LOW_SURROGATE;
776+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
769777

770778
/*
771779
* Reject invalid cases. We can't have a value above
@@ -775,7 +783,7 @@ json_lex_string(JsonLexContext *lex)
775783
if (ch == 0)
776784
{
777785
/* We can't allow this, since our TEXT type doesn't */
778-
return JSON_UNICODE_CODE_POINT_ZERO;
786+
FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
779787
}
780788

781789
/*
@@ -812,14 +820,14 @@ json_lex_string(JsonLexContext *lex)
812820
appendStringInfoChar(lex->strval, (char) ch);
813821
}
814822
else
815-
return JSON_UNICODE_HIGH_ESCAPE;
823+
FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
816824
#endif /* FRONTEND */
817825
}
818826
}
819827
else if (lex->strval != NULL)
820828
{
821829
if (hi_surrogate != -1)
822-
return JSON_UNICODE_LOW_SURROGATE;
830+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
823831

824832
switch (*s)
825833
{
@@ -844,10 +852,14 @@ json_lex_string(JsonLexContext *lex)
844852
appendStringInfoChar(lex->strval, '\t');
845853
break;
846854
default:
847-
/* Not a valid string escape, so signal error. */
855+
856+
/*
857+
* Not a valid string escape, so signal error. We
858+
* adjust token_start so that just the escape sequence
859+
* is reported, not the whole string.
860+
*/
848861
lex->token_start = s;
849-
lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
850-
return JSON_ESCAPING_INVALID;
862+
FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
851863
}
852864
}
853865
else if (strchr("\"\\/bfnrt", *s) == NULL)
@@ -860,28 +872,33 @@ json_lex_string(JsonLexContext *lex)
860872
* shown it's not a performance win.
861873
*/
862874
lex->token_start = s;
863-
lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
864-
return JSON_ESCAPING_INVALID;
875+
FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
865876
}
866877

867878
}
868879
else if (lex->strval != NULL)
869880
{
870881
if (hi_surrogate != -1)
871-
return JSON_UNICODE_LOW_SURROGATE;
882+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
872883

873884
appendStringInfoChar(lex->strval, *s);
874885
}
875886

876887
}
877888

878889
if (hi_surrogate != -1)
890+
{
891+
lex->token_terminator = s + 1;
879892
return JSON_UNICODE_LOW_SURROGATE;
893+
}
880894

881895
/* Hooray, we found the end of the string! */
882896
lex->prev_token_terminator = lex->token_terminator;
883897
lex->token_terminator = s + 1;
884898
return JSON_SUCCESS;
899+
900+
#undef FAIL_AT_CHAR_START
901+
#undef FAIL_AT_CHAR_END
885902
}
886903

887904
/*

src/test/regress/expected/json_encoding.out

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
5656
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
5757
ERROR: invalid input syntax for type json
5858
DETAIL: Unicode high surrogate must not follow a high surrogate.
59-
CONTEXT: JSON data, line 1: { "a":...
59+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
6060
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
6161
ERROR: invalid input syntax for type json
6262
DETAIL: Unicode low surrogate must follow a high surrogate.
63-
CONTEXT: JSON data, line 1: { "a":...
63+
CONTEXT: JSON data, line 1: { "a": "\ude04...
6464
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
6565
ERROR: invalid input syntax for type json
6666
DETAIL: Unicode low surrogate must follow a high surrogate.
67-
CONTEXT: JSON data, line 1: { "a":...
67+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
6868
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
6969
ERROR: invalid input syntax for type json
7070
DETAIL: Unicode low surrogate must follow a high surrogate.
71-
CONTEXT: JSON data, line 1: { "a":...
71+
CONTEXT: JSON data, line 1: { "a": "\ude04...
7272
--handling of simple unicode escapes
7373
select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
7474
correct_in_utf8
@@ -121,7 +121,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
121121
select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
122122
ERROR: unsupported Unicode escape sequence
123123
DETAIL: \u0000 cannot be converted to text.
124-
CONTEXT: JSON data, line 1: { "a":...
124+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
125125
select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
126126
not_an_escape
127127
--------------------
@@ -159,7 +159,7 @@ ERROR: unsupported Unicode escape sequence
159159
LINE 1: SELECT '"\u0000"'::jsonb;
160160
^
161161
DETAIL: \u0000 cannot be converted to text.
162-
CONTEXT: JSON data, line 1: ...
162+
CONTEXT: JSON data, line 1: "\u0000...
163163
-- use octet_length here so we don't get an odd unicode char in the
164164
-- output
165165
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -180,25 +180,25 @@ ERROR: invalid input syntax for type json
180180
LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
181181
^
182182
DETAIL: Unicode high surrogate must not follow a high surrogate.
183-
CONTEXT: JSON data, line 1: { "a":...
183+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
184184
SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
185185
ERROR: invalid input syntax for type json
186186
LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a';
187187
^
188188
DETAIL: Unicode low surrogate must follow a high surrogate.
189-
CONTEXT: JSON data, line 1: { "a":...
189+
CONTEXT: JSON data, line 1: { "a": "\ude04...
190190
SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
191191
ERROR: invalid input syntax for type json
192192
LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a';
193193
^
194194
DETAIL: Unicode low surrogate must follow a high surrogate.
195-
CONTEXT: JSON data, line 1: { "a":...
195+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
196196
SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
197197
ERROR: invalid input syntax for type json
198198
LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a';
199199
^
200200
DETAIL: Unicode low surrogate must follow a high surrogate.
201-
CONTEXT: JSON data, line 1: { "a":...
201+
CONTEXT: JSON data, line 1: { "a": "\ude04...
202202
-- handling of simple unicode escapes
203203
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
204204
correct_in_utf8
@@ -223,7 +223,7 @@ ERROR: unsupported Unicode escape sequence
223223
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
224224
^
225225
DETAIL: \u0000 cannot be converted to text.
226-
CONTEXT: JSON data, line 1: { "a":...
226+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
227227
SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
228228
not_an_escape
229229
------------------------------
@@ -253,7 +253,7 @@ ERROR: unsupported Unicode escape sequence
253253
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
254254
^
255255
DETAIL: \u0000 cannot be converted to text.
256-
CONTEXT: JSON data, line 1: { "a":...
256+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
257257
SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
258258
not_an_escape
259259
--------------------

src/test/regress/expected/json_encoding_1.out

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,19 @@ ERROR: conversion between UTF8 and SQL_ASCII is not supported
5252
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
5353
ERROR: invalid input syntax for type json
5454
DETAIL: Unicode high surrogate must not follow a high surrogate.
55-
CONTEXT: JSON data, line 1: { "a":...
55+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
5656
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
5757
ERROR: invalid input syntax for type json
5858
DETAIL: Unicode low surrogate must follow a high surrogate.
59-
CONTEXT: JSON data, line 1: { "a":...
59+
CONTEXT: JSON data, line 1: { "a": "\ude04...
6060
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
6161
ERROR: invalid input syntax for type json
6262
DETAIL: Unicode low surrogate must follow a high surrogate.
63-
CONTEXT: JSON data, line 1: { "a":...
63+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
6464
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
6565
ERROR: invalid input syntax for type json
6666
DETAIL: Unicode low surrogate must follow a high surrogate.
67-
CONTEXT: JSON data, line 1: { "a":...
67+
CONTEXT: JSON data, line 1: { "a": "\ude04...
6868
--handling of simple unicode escapes
6969
select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
7070
correct_in_utf8
@@ -113,7 +113,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
113113
select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
114114
ERROR: unsupported Unicode escape sequence
115115
DETAIL: \u0000 cannot be converted to text.
116-
CONTEXT: JSON data, line 1: { "a":...
116+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
117117
select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
118118
not_an_escape
119119
--------------------
@@ -151,7 +151,7 @@ ERROR: unsupported Unicode escape sequence
151151
LINE 1: SELECT '"\u0000"'::jsonb;
152152
^
153153
DETAIL: \u0000 cannot be converted to text.
154-
CONTEXT: JSON data, line 1: ...
154+
CONTEXT: JSON data, line 1: "\u0000...
155155
-- use octet_length here so we don't get an odd unicode char in the
156156
-- output
157157
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -168,25 +168,25 @@ ERROR: invalid input syntax for type json
168168
LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
169169
^
170170
DETAIL: Unicode high surrogate must not follow a high surrogate.
171-
CONTEXT: JSON data, line 1: { "a":...
171+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
172172
SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
173173
ERROR: invalid input syntax for type json
174174
LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a';
175175
^
176176
DETAIL: Unicode low surrogate must follow a high surrogate.
177-
CONTEXT: JSON data, line 1: { "a":...
177+
CONTEXT: JSON data, line 1: { "a": "\ude04...
178178
SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
179179
ERROR: invalid input syntax for type json
180180
LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a';
181181
^
182182
DETAIL: Unicode low surrogate must follow a high surrogate.
183-
CONTEXT: JSON data, line 1: { "a":...
183+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
184184
SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
185185
ERROR: invalid input syntax for type json
186186
LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a';
187187
^
188188
DETAIL: Unicode low surrogate must follow a high surrogate.
189-
CONTEXT: JSON data, line 1: { "a":...
189+
CONTEXT: JSON data, line 1: { "a": "\ude04...
190190
-- handling of simple unicode escapes
191191
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
192192
ERROR: conversion between UTF8 and SQL_ASCII is not supported
@@ -209,7 +209,7 @@ ERROR: unsupported Unicode escape sequence
209209
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
210210
^
211211
DETAIL: \u0000 cannot be converted to text.
212-
CONTEXT: JSON data, line 1: { "a":...
212+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
213213
SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
214214
not_an_escape
215215
------------------------------
@@ -237,7 +237,7 @@ ERROR: unsupported Unicode escape sequence
237237
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
238238
^
239239
DETAIL: \u0000 cannot be converted to text.
240-
CONTEXT: JSON data, line 1: { "a":...
240+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
241241
SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
242242
not_an_escape
243243
--------------------

0 commit comments

Comments
 (0)