@@ -7260,6 +7260,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
7260
7260
return TRUE ;
7261
7261
}
7262
7262
7263
+ static int tokadd_mbchar (struct parser_params *p, int c); /* forward declaration: the regexp fast path in tokadd_utf8 below calls this; its definition is not shown in this hunk */
7264
+
7263
7265
/* return value is for ?\u3042 */
7264
7266
static void
7265
7267
tokadd_utf8 (struct parser_params *p, rb_encoding **encp,
@@ -7277,44 +7279,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
7277
7279
if (regexp_literal) { tokadd (p, ' \\ ' ); tokadd (p, ' u' ); }
7278
7280
7279
7281
if (peek (p, open_brace)) { /* handle \u{...} form */
7280
- const char *second = NULL ;
7281
- int c, last = nextc (p);
7282
- if (p->lex .pcur >= p->lex .pend ) goto unterminated;
7283
- while (ISSPACE (c = *p->lex .pcur ) && ++p->lex .pcur < p->lex .pend );
7284
- while (c != close_brace) {
7285
- if (c == term) goto unterminated;
7286
- if (second == multiple_codepoints)
7287
- second = p->lex .pcur ;
7288
- if (regexp_literal) tokadd (p, last);
7289
- if (!tokadd_codepoint (p, encp, regexp_literal, TRUE )) {
7290
- break ;
7291
- }
7292
- while (ISSPACE (c = *p->lex .pcur )) {
7293
- if (++p->lex .pcur >= p->lex .pend ) goto unterminated;
7294
- last = c;
7295
- }
7296
- if (term == -1 && !second)
7297
- second = multiple_codepoints;
7298
- }
7282
+ if (regexp_literal && p->lex .strterm ->u .literal .u1 .func == str_regexp) {
7283
+ /*
7284
+ * Skip parsing validation code and copy bytes as-is until term or
7285
+ * closing brace, in order to correctly handle extended regexps where
7286
+ * invalid unicode escapes are allowed in comments. The regexp parser
7287
+ * does its own validation and will catch any issues.
7288
+ */
7289
+ int c = *p->lex .pcur ;
7290
+ tokadd (p, c);
7291
+ for (c = *++p->lex .pcur ; p->lex .pcur < p->lex .pend ; c = *++p->lex .pcur ) {
7292
+ if (c == close_brace) {
7293
+ tokadd (p, c);
7294
+ ++p->lex .pcur ;
7295
+ break ;
7296
+ }
7297
+ else if (c == term) {
7298
+ break ;
7299
+ }
7300
+ if (c == ' \\ ' && p->lex .pcur + 1 < p->lex .pend ) {
7301
+ tokadd (p, c);
7302
+ c = *++p->lex .pcur ;
7303
+ }
7304
+ tokadd_mbchar (p, c);
7305
+ }
7306
+ }
7307
+ else {
7308
+ const char *second = NULL ;
7309
+ int c, last = nextc (p);
7310
+ if (p->lex .pcur >= p->lex .pend ) goto unterminated;
7311
+ while (ISSPACE (c = *p->lex .pcur ) && ++p->lex .pcur < p->lex .pend );
7312
+ while (c != close_brace) {
7313
+ if (c == term) goto unterminated;
7314
+ if (second == multiple_codepoints)
7315
+ second = p->lex .pcur ;
7316
+ if (regexp_literal) tokadd (p, last);
7317
+ if (!tokadd_codepoint (p, encp, regexp_literal, TRUE )) {
7318
+ break ;
7319
+ }
7320
+ while (ISSPACE (c = *p->lex .pcur )) {
7321
+ if (++p->lex .pcur >= p->lex .pend ) goto unterminated;
7322
+ last = c;
7323
+ }
7324
+ if (term == -1 && !second)
7325
+ second = multiple_codepoints;
7326
+ }
7299
7327
7300
- if (c != close_brace) {
7301
- unterminated:
7302
- token_flush (p);
7303
- yyerror0 (" unterminated Unicode escape" );
7304
- return ;
7305
- }
7306
- if (second && second != multiple_codepoints) {
7307
- const char *pcur = p->lex .pcur ;
7308
- p->lex .pcur = second;
7309
- dispatch_scan_event (p, tSTRING_CONTENT);
7310
- token_flush (p);
7311
- p->lex .pcur = pcur;
7312
- yyerror0 (multiple_codepoints);
7313
- token_flush (p);
7314
- }
7328
+ if (c != close_brace) {
7329
+ unterminated:
7330
+ token_flush (p);
7331
+ yyerror0 (" unterminated Unicode escape" );
7332
+ return ;
7333
+ }
7334
+ if (second && second != multiple_codepoints) {
7335
+ const char *pcur = p->lex .pcur ;
7336
+ p->lex .pcur = second;
7337
+ dispatch_scan_event (p, tSTRING_CONTENT);
7338
+ token_flush (p);
7339
+ p->lex .pcur = pcur;
7340
+ yyerror0 (multiple_codepoints);
7341
+ token_flush (p);
7342
+ }
7315
7343
7316
- if (regexp_literal) tokadd (p, close_brace);
7317
- nextc (p);
7344
+ if (regexp_literal) tokadd (p, close_brace);
7345
+ nextc (p);
7346
+ }
7318
7347
}
7319
7348
else { /* handle \uxxxx form */
7320
7349
if (!tokadd_codepoint (p, encp, regexp_literal, FALSE )) {
0 commit comments