Skip to content

Commit 5ada9ef

Browse files
committed
Teach plpgsql's lexer about dollar-quoted literals.
Andrew Dunstan, some help from Tom Lane.
1 parent fa7a3ab commit 5ada9ef

File tree

4 files changed

+118
-30
lines changed

4 files changed

+118
-30
lines changed

src/pl/plpgsql/src/gram.y

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* procedural language
55
*
66
* IDENTIFICATION
7-
* $PostgreSQL: pgsql/src/pl/plpgsql/src/gram.y,v 1.50 2003/12/23 00:01:57 tgl Exp $
7+
* $PostgreSQL: pgsql/src/pl/plpgsql/src/gram.y,v 1.51 2004/02/25 18:10:51 tgl Exp $
88
*
99
* This software is copyrighted by Jan Wieck - Hamburg.
1010
*
@@ -1235,7 +1235,7 @@ stmt_raise : K_RAISE lno raise_level raise_msg raise_params ';'
12351235

12361236
raise_msg : T_STRING
12371237
{
1238-
$$ = strdup(yytext);
1238+
$$ = plpgsql_get_string_value();
12391239
}
12401240
;
12411241

src/pl/plpgsql/src/pl_exec.c

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* procedural language
44
*
55
* IDENTIFICATION
6-
* $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_exec.c,v 1.96 2004/02/24 01:44:33 tgl Exp $
6+
* $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_exec.c,v 1.97 2004/02/25 18:10:51 tgl Exp $
77
*
88
* This software is copyrighted by Jan Wieck - Hamburg.
99
*
@@ -1805,7 +1805,7 @@ exec_stmt_raise(PLpgSQL_execstate * estate, PLpgSQL_stmt_raise * stmt)
18051805
for (cp = stmt->message; *cp; cp++)
18061806
{
18071807
/*
1808-
* Occurences of a single % are replaced by the next argument's
1808+
* Occurrences of a single % are replaced by the next argument's
18091809
* external representation. Double %'s are converted to one %.
18101810
*/
18111811
if ((c[0] = *cp) == '%')
@@ -1834,21 +1834,6 @@ exec_stmt_raise(PLpgSQL_execstate * estate, PLpgSQL_stmt_raise * stmt)
18341834
continue;
18351835
}
18361836

1837-
/*
1838-
* Occurrences of single ' are removed. double ' are reduced to
1839-
* single ones. We must do this because the parameter stored by
1840-
* the grammar is the raw T_STRING input literal, rather than the
1841-
* de-lexed string as you might expect ...
1842-
*/
1843-
if (*cp == '\'')
1844-
{
1845-
cp++;
1846-
if (*cp == '\'')
1847-
plpgsql_dstring_append(&ds, c);
1848-
else
1849-
cp--;
1850-
continue;
1851-
}
18521837
plpgsql_dstring_append(&ds, c);
18531838
}
18541839

src/pl/plpgsql/src/plpgsql.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* procedural language
44
*
55
* IDENTIFICATION
6-
* $PostgreSQL: pgsql/src/pl/plpgsql/src/plpgsql.h,v 1.43 2003/11/29 19:52:12 pgsql Exp $
6+
* $PostgreSQL: pgsql/src/pl/plpgsql/src/plpgsql.h,v 1.44 2004/02/25 18:10:51 tgl Exp $
77
*
88
* This software is copyrighted by Jan Wieck - Hamburg.
99
*
@@ -694,5 +694,6 @@ extern void plpgsql_push_back_token(int token);
694694
extern int plpgsql_scanner_lineno(void);
695695
extern void plpgsql_scanner_init(const char *str, int functype);
696696
extern void plpgsql_scanner_finish(void);
697+
extern char *plpgsql_get_string_value(void);
697698

698699
#endif /* PLPGSQL_H */

src/pl/plpgsql/src/scan.l

Lines changed: 112 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* procedural language
55
*
66
* IDENTIFICATION
7-
* $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.31 2004/02/24 22:06:32 tgl Exp $
7+
* $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.32 2004/02/25 18:10:51 tgl Exp $
88
*
99
* This software is copyrighted by Jan Wieck - Hamburg.
1010
*
@@ -57,6 +57,8 @@ static int lookahead_token;
5757
static bool have_lookahead_token;
5858
static const char *cur_line_start;
5959
static int cur_line_num;
60+
static char *dolqstart; /* current $foo$ quote start string */
61+
static int dolqlen; /* signal to plpgsql_get_string_value */
6062

6163
int plpgsql_SpaceScanned = 0;
6264
%}
@@ -70,7 +72,9 @@ int plpgsql_SpaceScanned = 0;
7072
%option case-insensitive
7173

7274

73-
%x IN_STRING IN_COMMENT
75+
%x IN_STRING
76+
%x IN_COMMENT
77+
%x IN_DOLLARQUOTE
7478

7579
digit [0-9]
7680
ident_start [A-Za-z\200-\377_]
@@ -84,6 +88,14 @@ param \${digit}+
8488

8589
space [ \t\n\r\f]
8690

91+
/* $foo$ style quotes ("dollar quoting")
92+
* copied straight from the backend SQL parser
93+
*/
94+
dolq_start [A-Za-z\200-\377_]
95+
dolq_cont [A-Za-z\200-\377_0-9]
96+
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
97+
dolqinside [^$]+
98+
8799
%%
88100
/* ----------
89101
* Local variables in scanner to remember where
@@ -97,7 +109,7 @@ space [ \t\n\r\f]
97109
* Reset the state when entering the scanner
98110
* ----------
99111
*/
100-
BEGIN INITIAL;
112+
BEGIN(INITIAL);
101113
plpgsql_SpaceScanned = 0;
102114

103115
/* ----------
@@ -247,9 +259,9 @@ dump { return O_DUMP; }
247259
--[^\r\n]* ;
248260

249261
\/\* { start_lineno = plpgsql_scanner_lineno();
250-
BEGIN IN_COMMENT;
262+
BEGIN(IN_COMMENT);
251263
}
252-
<IN_COMMENT>\*\/ { BEGIN INITIAL; plpgsql_SpaceScanned = 1; }
264+
<IN_COMMENT>\*\/ { BEGIN(INITIAL); plpgsql_SpaceScanned = 1; }
253265
<IN_COMMENT>\n ;
254266
<IN_COMMENT>. ;
255267
<IN_COMMENT><<EOF>> {
@@ -260,7 +272,7 @@ dump { return O_DUMP; }
260272
}
261273

262274
/* ----------
263-
* Collect anything inside of ''s and return one STRING
275+
* Collect anything inside of ''s and return one STRING token
264276
*
265277
* Hacking yytext/yyleng here lets us avoid using yymore(), which is
266278
* a win for performance. It's safe because we know the underlying
@@ -270,15 +282,18 @@ dump { return O_DUMP; }
270282
' {
271283
start_lineno = plpgsql_scanner_lineno();
272284
start_charpos = yytext;
273-
BEGIN IN_STRING;
285+
BEGIN(IN_STRING);
274286
}
275287
<IN_STRING>\\. { }
276288
<IN_STRING>\\ { /* can only happen with \ at EOF */ }
277289
<IN_STRING>'' { }
278290
<IN_STRING>' {
279-
yyleng -= (yytext - start_charpos);
291+
/* tell plpgsql_get_string_value it's not a dollar quote */
292+
dolqlen = 0;
293+
/* adjust yytext/yyleng to describe whole string token */
294+
yyleng += (yytext - start_charpos);
280295
yytext = start_charpos;
281-
BEGIN INITIAL;
296+
BEGIN(INITIAL);
282297
return T_STRING;
283298
}
284299
<IN_STRING>[^'\\]+ { }
@@ -289,6 +304,43 @@ dump { return O_DUMP; }
289304
errmsg("unterminated string")));
290305
}
291306

307+
{dolqdelim} {
308+
start_lineno = plpgsql_scanner_lineno();
309+
start_charpos = yytext;
310+
dolqstart = pstrdup(yytext);
311+
BEGIN(IN_DOLLARQUOTE);
312+
}
313+
<IN_DOLLARQUOTE>{dolqdelim} {
314+
if (strcmp(yytext, dolqstart) == 0)
315+
{
316+
pfree(dolqstart);
317+
/* tell plpgsql_get_string_value it is a dollar quote */
318+
dolqlen = yyleng;
319+
/* adjust yytext/yyleng to describe whole string token */
320+
yyleng += (yytext - start_charpos);
321+
yytext = start_charpos;
322+
BEGIN(INITIAL);
323+
return T_STRING;
324+
}
325+
else
326+
{
327+
/*
328+
* When we fail to match $...$ to dolqstart, transfer
329+
* the $... part to the output, but put back the final
330+
* $ for rescanning. Consider $delim$...$junk$delim$
331+
*/
332+
yyless(yyleng-1);
333+
}
334+
}
335+
<IN_DOLLARQUOTE>{dolqinside} { }
336+
<IN_DOLLARQUOTE>. { /* needed for $ inside the quoted text */ }
337+
<IN_DOLLARQUOTE><<EOF>> {
338+
plpgsql_error_lineno = start_lineno;
339+
ereport(ERROR,
340+
(errcode(ERRCODE_DATATYPE_MISMATCH),
341+
errmsg("unterminated dollar-quoted string")));
342+
}
343+
292344
/* ----------
293345
* Any unmatched character is returned as is
294346
* ----------
@@ -429,7 +481,6 @@ plpgsql_scanner_init(const char *str, int functype)
429481
BEGIN(INITIAL);
430482
}
431483

432-
433484
/*
434485
* Called after parsing is done to clean up after plpgsql_scanner_init()
435486
*/
@@ -439,3 +490,54 @@ plpgsql_scanner_finish(void)
439490
yy_delete_buffer(scanbufhandle);
440491
pfree(scanbuf);
441492
}
493+
494+
/*
495+
* Called after a T_STRING token is read to get the string literal's value
496+
* as a malloc'd string. (We make this a separate call because in many
497+
* scenarios there's no need to get the decoded value.)
498+
*
499+
* Note: we expect the literal to be the most recently lexed token. This
500+
* would not work well if we supported multiple-token pushback or if
501+
* plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
502+
*/
503+
char *
504+
plpgsql_get_string_value(void)
505+
{
506+
char *result;
507+
const char *cp;
508+
int len;
509+
510+
if (dolqlen > 0)
511+
{
512+
/* Token is a $foo$...$foo$ string */
513+
len = yyleng - 2 * dolqlen;
514+
Assert(len >= 0);
515+
result = (char *) malloc(len + 1);
516+
memcpy(result, yytext + dolqlen, len);
517+
result[len] = '\0';
518+
}
519+
else
520+
{
521+
/* Token is a '...' string */
522+
result = (char *) malloc(yyleng + 1); /* more than enough room */
523+
len = 0;
524+
for (cp = yytext; *cp; cp++)
525+
{
526+
if (*cp == '\'')
527+
{
528+
if (cp[1] == '\'')
529+
result[len++] = *cp++;
530+
/* else it must be string start or end quote */
531+
}
532+
else if (*cp == '\\')
533+
{
534+
if (cp[1] != '\0') /* just a paranoid check */
535+
result[len++] = *(++cp);
536+
}
537+
else
538+
result[len++] = *cp;
539+
}
540+
result[len] = '\0';
541+
}
542+
return result;
543+
}

0 commit comments

Comments
 (0)