Skip to content

Commit fadff3f

Browse files
committed
Prevent mis-encoding of "trailing junk after numeric literal" errors.
Since commit 2549f06, we reject an identifier immediately following a numeric literal (without separating whitespace), because that risks ambiguity with hex/octal/binary integers. However, that patch used token patterns like "{integer}{ident_start}", which is problematic because {ident_start} matches only a single byte. If the first character after the integer is a multibyte character, this ends up with flex reporting an error message that includes a partial multibyte character. That can cause assorted bad-encoding problems downstream, both in the report to the client and in the postmaster log file.

To fix, use {identifier} not {ident_start} in the "junk" token patterns, so that they will match complete multibyte characters. This seems like a generally better user experience quite aside from the encoding problem: for "123abc" the error message will now say that the error appeared at or near "123abc" instead of "123a".

While at it, add some commentary about why these patterns exist and how they work.

Report and patch by Karina Litskevich; review by Pavel Borisov. Back-patch to v15 where the problem came in.

Discussion: https://postgr.es/m/CACiT8iZ_diop=0zJ7zuY3BXegJpkKK1Av-PU7xh0EDYHsa5+=g@mail.gmail.com
1 parent 85837b8 commit fadff3f

File tree

4 files changed

+74
-62
lines changed

4 files changed

+74
-62
lines changed

src/backend/parser/scan.l

Lines changed: 23 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -412,16 +412,30 @@ numericfail {decinteger}\.\.
412412
real ({decinteger}|{numeric})[Ee][-+]?{decinteger}
413413
realfail ({decinteger}|{numeric})[Ee][-+]
414414

415-
decinteger_junk {decinteger}{ident_start}
416-
hexinteger_junk {hexinteger}{ident_start}
417-
octinteger_junk {octinteger}{ident_start}
418-
bininteger_junk {bininteger}{ident_start}
419-
numeric_junk {numeric}{ident_start}
420-
real_junk {real}{ident_start}
421-
422415
/* Positional parameters don't accept underscores. */
423416
param \${decdigit}+
424-
param_junk \${decdigit}+{ident_start}
417+
418+
/*
419+
* An identifier immediately following an integer literal is disallowed because
420+
* in some cases it's ambiguous what is meant: for example, 0x1234 could be
421+
* either a hexinteger or a decinteger "0" and an identifier "x1234". We can
422+
* detect such problems by seeing if integer_junk matches a longer substring
423+
* than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
424+
* bininteger). One "junk" pattern is sufficient because
425+
* {decinteger}{identifier} will match all the same strings we'd match with
426+
* {hexinteger}{identifier} etc.
427+
*
428+
* Note that the rule for integer_junk must appear after the ones for
429+
* XXXinteger to make this work correctly: 0x1234 will match both hexinteger
430+
* and integer_junk, and we need hexinteger to be chosen in that case.
431+
*
432+
* Also disallow strings matched by numeric_junk, real_junk and param_junk
433+
* for consistency.
434+
*/
435+
integer_junk {decinteger}{identifier}
436+
numeric_junk {numeric}{identifier}
437+
real_junk {real}{identifier}
438+
param_junk \${decdigit}+{identifier}
425439

426440
other .
427441

@@ -1055,19 +1069,7 @@ other .
10551069
SET_YYLLOC();
10561070
yyerror("trailing junk after numeric literal");
10571071
}
1058-
{decinteger_junk} {
1059-
SET_YYLLOC();
1060-
yyerror("trailing junk after numeric literal");
1061-
}
1062-
{hexinteger_junk} {
1063-
SET_YYLLOC();
1064-
yyerror("trailing junk after numeric literal");
1065-
}
1066-
{octinteger_junk} {
1067-
SET_YYLLOC();
1068-
yyerror("trailing junk after numeric literal");
1069-
}
1070-
{bininteger_junk} {
1072+
{integer_junk} {
10711073
SET_YYLLOC();
10721074
yyerror("trailing junk after numeric literal");
10731075
}

src/fe_utils/psqlscan.l

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -348,16 +348,30 @@ numericfail {decinteger}\.\.
348348
real ({decinteger}|{numeric})[Ee][-+]?{decinteger}
349349
realfail ({decinteger}|{numeric})[Ee][-+]
350350

351-
decinteger_junk {decinteger}{ident_start}
352-
hexinteger_junk {hexinteger}{ident_start}
353-
octinteger_junk {octinteger}{ident_start}
354-
bininteger_junk {bininteger}{ident_start}
355-
numeric_junk {numeric}{ident_start}
356-
real_junk {real}{ident_start}
357-
358351
/* Positional parameters don't accept underscores. */
359352
param \${decdigit}+
360-
param_junk \${decdigit}+{ident_start}
353+
354+
/*
355+
* An identifier immediately following an integer literal is disallowed because
356+
* in some cases it's ambiguous what is meant: for example, 0x1234 could be
357+
* either a hexinteger or a decinteger "0" and an identifier "x1234". We can
358+
* detect such problems by seeing if integer_junk matches a longer substring
359+
* than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
360+
* bininteger). One "junk" pattern is sufficient because
361+
* {decinteger}{identifier} will match all the same strings we'd match with
362+
* {hexinteger}{identifier} etc.
363+
*
364+
* Note that the rule for integer_junk must appear after the ones for
365+
* XXXinteger to make this work correctly: 0x1234 will match both hexinteger
366+
* and integer_junk, and we need hexinteger to be chosen in that case.
367+
*
368+
* Also disallow strings matched by numeric_junk, real_junk and param_junk
369+
* for consistency.
370+
*/
371+
integer_junk {decinteger}{identifier}
372+
numeric_junk {numeric}{identifier}
373+
real_junk {real}{identifier}
374+
param_junk \${decdigit}+{identifier}
361375

362376
/* psql-specific: characters allowed in variable names */
363377
variable_char [A-Za-z\200-\377_0-9]
@@ -898,16 +912,7 @@ other .
898912
{realfail} {
899913
ECHO;
900914
}
901-
{decinteger_junk} {
902-
ECHO;
903-
}
904-
{hexinteger_junk} {
905-
ECHO;
906-
}
907-
{octinteger_junk} {
908-
ECHO;
909-
}
910-
{bininteger_junk} {
915+
{integer_junk} {
911916
ECHO;
912917
}
913918
{numeric_junk} {

src/interfaces/ecpg/preproc/pgc.l

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -381,16 +381,30 @@ numericfail {decinteger}\.\.
381381
real ({decinteger}|{numeric})[Ee][-+]?{decinteger}
382382
realfail ({decinteger}|{numeric})[Ee][-+]
383383

384-
decinteger_junk {decinteger}{ident_start}
385-
hexinteger_junk {hexinteger}{ident_start}
386-
octinteger_junk {octinteger}{ident_start}
387-
bininteger_junk {bininteger}{ident_start}
388-
numeric_junk {numeric}{ident_start}
389-
real_junk {real}{ident_start}
390-
391384
/* Positional parameters don't accept underscores. */
392385
param \${decdigit}+
393-
param_junk \${decdigit}+{ident_start}
386+
387+
/*
388+
* An identifier immediately following an integer literal is disallowed because
389+
* in some cases it's ambiguous what is meant: for example, 0x1234 could be
390+
* either a hexinteger or a decinteger "0" and an identifier "x1234". We can
391+
* detect such problems by seeing if integer_junk matches a longer substring
392+
* than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
393+
* bininteger). One "junk" pattern is sufficient because
394+
* {decinteger}{identifier} will match all the same strings we'd match with
395+
* {hexinteger}{identifier} etc.
396+
*
397+
* Note that the rule for integer_junk must appear after the ones for
398+
* XXXinteger to make this work correctly: 0x1234 will match both hexinteger
399+
* and integer_junk, and we need hexinteger to be chosen in that case.
400+
*
401+
* Also disallow strings matched by numeric_junk, real_junk and param_junk
402+
* for consistency.
403+
*/
404+
integer_junk {decinteger}{identifier}
405+
numeric_junk {numeric}{identifier}
406+
real_junk {real}{identifier}
407+
param_junk \${decdigit}+{identifier}
394408

395409
/* special characters for other dbms */
396410
/* we have to react differently in compat mode */
@@ -1023,16 +1037,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
10231037
* Note that some trailing junk is valid in C (such as 100LL), so we
10241038
* contain this to SQL mode.
10251039
*/
1026-
{decinteger_junk} {
1027-
mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
1028-
}
1029-
{hexinteger_junk} {
1030-
mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
1031-
}
1032-
{octinteger_junk} {
1033-
mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
1034-
}
1035-
{bininteger_junk} {
1040+
{integer_junk} {
10361041
mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
10371042
}
10381043
{numeric_junk} {

src/test/regress/expected/numerology.out

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ SELECT -0x8000000000000001;
171171

172172
-- error cases
173173
SELECT 123abc;
174-
ERROR: trailing junk after numeric literal at or near "123a"
174+
ERROR: trailing junk after numeric literal at or near "123abc"
175175
LINE 1: SELECT 123abc;
176176
^
177177
SELECT 0x0o;
@@ -322,7 +322,7 @@ ERROR: trailing junk after numeric literal at or near "100_"
322322
LINE 1: SELECT 100_;
323323
^
324324
SELECT 100__000;
325-
ERROR: trailing junk after numeric literal at or near "100_"
325+
ERROR: trailing junk after numeric literal at or near "100__000"
326326
LINE 1: SELECT 100__000;
327327
^
328328
SELECT _1_000.5;
@@ -334,19 +334,19 @@ ERROR: trailing junk after numeric literal at or near "1_000_"
334334
LINE 1: SELECT 1_000_.5;
335335
^
336336
SELECT 1_000._5;
337-
ERROR: trailing junk after numeric literal at or near "1_000._"
337+
ERROR: trailing junk after numeric literal at or near "1_000._5"
338338
LINE 1: SELECT 1_000._5;
339339
^
340340
SELECT 1_000.5_;
341341
ERROR: trailing junk after numeric literal at or near "1_000.5_"
342342
LINE 1: SELECT 1_000.5_;
343343
^
344344
SELECT 1_000.5e_1;
345-
ERROR: trailing junk after numeric literal at or near "1_000.5e"
345+
ERROR: trailing junk after numeric literal at or near "1_000.5e_1"
346346
LINE 1: SELECT 1_000.5e_1;
347347
^
348348
PREPARE p1 AS SELECT $0_1;
349-
ERROR: trailing junk after parameter at or near "$0_"
349+
ERROR: trailing junk after parameter at or near "$0_1"
350350
LINE 1: PREPARE p1 AS SELECT $0_1;
351351
^
352352
--

0 commit comments

Comments
 (0)