Skip to content

Commit 3f6187a

Browse files
committed
merge revision(s) 1bc8838: [Backport #19750]
Handle unterminated unicode escapes in regexps

This fixes an infinite loop possible after ec35422. For \u{} escapes in regexps, skip validation in the parser, and rely on the regexp code to handle validation. This is necessary so that invalid unicode escapes in comments in extended regexps are allowed.

Fixes [Bug #19750]

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
---
 parse.y                 | 97 ++++++++++++++++++++++++++++++++-----------------
 test/ruby/test_parse.rb | 16 ++++++++
 2 files changed, 79 insertions(+), 34 deletions(-)
1 parent aef5316 commit 3f6187a

File tree

3 files changed

+82
-37
lines changed

3 files changed

+82
-37
lines changed

parse.y

Lines changed: 65 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7260,6 +7260,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
72607260
return TRUE;
72617261
}
72627262

7263+
static int tokadd_mbchar(struct parser_params *p, int c);
7264+
72637265
/* return value is for ?\u3042 */
72647266
static void
72657267
tokadd_utf8(struct parser_params *p, rb_encoding **encp,
@@ -7277,44 +7279,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
72777279
if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
72787280

72797281
if (peek(p, open_brace)) { /* handle \u{...} form */
7280-
const char *second = NULL;
7281-
int c, last = nextc(p);
7282-
if (p->lex.pcur >= p->lex.pend) goto unterminated;
7283-
while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
7284-
while (c != close_brace) {
7285-
if (c == term) goto unterminated;
7286-
if (second == multiple_codepoints)
7287-
second = p->lex.pcur;
7288-
if (regexp_literal) tokadd(p, last);
7289-
if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
7290-
break;
7291-
}
7292-
while (ISSPACE(c = *p->lex.pcur)) {
7293-
if (++p->lex.pcur >= p->lex.pend) goto unterminated;
7294-
last = c;
7295-
}
7296-
if (term == -1 && !second)
7297-
second = multiple_codepoints;
7298-
}
7282+
if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) {
7283+
/*
7284+
* Skip parsing validation code and copy bytes as-is until term or
7285+
* closing brace, in order to correctly handle extended regexps where
7286+
* invalid unicode escapes are allowed in comments. The regexp parser
7287+
* does its own validation and will catch any issues.
7288+
*/
7289+
int c = *p->lex.pcur;
7290+
tokadd(p, c);
7291+
for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) {
7292+
if (c == close_brace) {
7293+
tokadd(p, c);
7294+
++p->lex.pcur;
7295+
break;
7296+
}
7297+
else if (c == term) {
7298+
break;
7299+
}
7300+
if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) {
7301+
tokadd(p, c);
7302+
c = *++p->lex.pcur;
7303+
}
7304+
tokadd_mbchar(p, c);
7305+
}
7306+
}
7307+
else {
7308+
const char *second = NULL;
7309+
int c, last = nextc(p);
7310+
if (p->lex.pcur >= p->lex.pend) goto unterminated;
7311+
while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
7312+
while (c != close_brace) {
7313+
if (c == term) goto unterminated;
7314+
if (second == multiple_codepoints)
7315+
second = p->lex.pcur;
7316+
if (regexp_literal) tokadd(p, last);
7317+
if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
7318+
break;
7319+
}
7320+
while (ISSPACE(c = *p->lex.pcur)) {
7321+
if (++p->lex.pcur >= p->lex.pend) goto unterminated;
7322+
last = c;
7323+
}
7324+
if (term == -1 && !second)
7325+
second = multiple_codepoints;
7326+
}
72997327

7300-
if (c != close_brace) {
7301-
unterminated:
7302-
token_flush(p);
7303-
yyerror0("unterminated Unicode escape");
7304-
return;
7305-
}
7306-
if (second && second != multiple_codepoints) {
7307-
const char *pcur = p->lex.pcur;
7308-
p->lex.pcur = second;
7309-
dispatch_scan_event(p, tSTRING_CONTENT);
7310-
token_flush(p);
7311-
p->lex.pcur = pcur;
7312-
yyerror0(multiple_codepoints);
7313-
token_flush(p);
7314-
}
7328+
if (c != close_brace) {
7329+
unterminated:
7330+
token_flush(p);
7331+
yyerror0("unterminated Unicode escape");
7332+
return;
7333+
}
7334+
if (second && second != multiple_codepoints) {
7335+
const char *pcur = p->lex.pcur;
7336+
p->lex.pcur = second;
7337+
dispatch_scan_event(p, tSTRING_CONTENT);
7338+
token_flush(p);
7339+
p->lex.pcur = pcur;
7340+
yyerror0(multiple_codepoints);
7341+
token_flush(p);
7342+
}
73157343

7316-
if (regexp_literal) tokadd(p, close_brace);
7317-
nextc(p);
7344+
if (regexp_literal) tokadd(p, close_brace);
7345+
nextc(p);
7346+
}
73187347
}
73197348
else { /* handle \uxxxx form */
73207349
if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {

test/ruby/test_parse.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,6 +1041,22 @@ def test_yyerror_at_eol
10411041
assert_syntax_error(" 0b\n", /\^/)
10421042
end
10431043

1044+
def test_unclosed_unicode_escape_at_eol_bug_19750
1045+
assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}")
1046+
begin;
1047+
assert_syntax_error("/\\u", /too short escape sequence/)
1048+
assert_syntax_error("/\\u{", /unterminated regexp meets end of file/)
1049+
assert_syntax_error("/\\u{\\n", /invalid Unicode list/)
1050+
assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/)
1051+
re = eval("/a#\\u{\n$/x")
1052+
assert_match(re, 'a')
1053+
assert_not_match(re, 'a#')
1054+
re = eval("/a#\\u\n$/x")
1055+
assert_match(re, 'a')
1056+
assert_not_match(re, 'a#')
1057+
end;
1058+
end
1059+
10441060
def test_error_def_in_argument
10451061
assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}")
10461062
begin;

version.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
1212
#define RUBY_VERSION_TEENY 2
1313
#define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
14-
#define RUBY_PATCHLEVEL 92
14+
#define RUBY_PATCHLEVEL 93
1515

1616
#include "ruby/version.h"
1717
#include "ruby/internal/abi.h"

0 commit comments

Comments
 (0)