Skip to content

Commit ed418be

Browse files
committed
Add support for '\q{}' escape sequence in regular expressions.
1 parent d162acf commit ed418be

File tree

5 files changed

+203
-267
lines changed

5 files changed

+203
-267
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package com.semmle.js.ast.regexp;
2+
3+
import com.semmle.js.ast.SourceLocation;
4+
5+
/**
6+
* A '\q{}' escape sequence in a regular expression, which is a special extension
7+
* to standard regular expressions.
8+
*/
9+
public class CharacterClassQuotedString extends RegExpTerm {
10+
// The single term wrapped by this escape sequence; the parser passes in the
// term it read between "q{" and the closing brace.
private final RegExpTerm term;
11+
12+
/**
 * Creates a quoted-string node.
 *
 * @param loc the source location of the escape sequence
 * @param term the term wrapped by the escape sequence
 */
public CharacterClassQuotedString(SourceLocation loc, RegExpTerm term) {
13+
// NOTE(review): the type name is the same string that RegExpExtractor registers
// in its termkinds table; presumably the two must stay in sync -- confirm.
super(loc, "CharacterClassQuotedString");
14+
this.term = term;
15+
}
16+
17+
/** Returns the term wrapped by this escape sequence, as passed to the constructor. */
public RegExpTerm getTerm() {
18+
return term;
19+
}
20+
21+
/** Dispatches to the visitor's overload for this node type. */
@Override
22+
public void accept(Visitor v) {
23+
v.visit(this);
24+
}
25+
}

javascript/extractor/src/com/semmle/js/ast/regexp/Visitor.java

+2
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,6 @@ public interface Visitor {
6161
public void visit(ZeroWidthNegativeLookbehind nd);
6262

6363
public void visit(UnicodePropertyEscape nd);
64+
65+
public void visit(CharacterClassQuotedString nd);
6466
}

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

+8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import com.semmle.js.ast.regexp.Caret;
1111
import com.semmle.js.ast.regexp.CharacterClass;
1212
import com.semmle.js.ast.regexp.CharacterClassEscape;
13+
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
1314
import com.semmle.js.ast.regexp.CharacterClassRange;
1415
import com.semmle.js.ast.regexp.Constant;
1516
import com.semmle.js.ast.regexp.ControlEscape;
@@ -92,6 +93,7 @@ public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
9293
termkinds.put("ZeroWidthPositiveLookbehind", 25);
9394
termkinds.put("ZeroWidthNegativeLookbehind", 26);
9495
termkinds.put("UnicodePropertyEscape", 27);
96+
termkinds.put("CharacterClassQuotedString", 28);
9597
}
9698

9799
private static final String[] errmsgs =
@@ -344,6 +346,12 @@ public void visit(CharacterClassRange nd) {
344346
visit(nd.getLeft(), lbl, 0);
345347
visit(nd.getRight(), lbl, 1);
346348
}
349+
350+
/**
 * Extracts a '\q{}' quoted-string term and then its wrapped term.
 *
 * <p>The node itself is emitted through extractTerm using the surrounding
 * visitor's current parent and idx (declared outside this excerpt; presumably
 * the label and child index the node is attached to -- confirm). The wrapped
 * term is then visited as child 0 under the label returned by extractTerm.
 */
@Override
351+
public void visit(CharacterClassQuotedString nd) {
352+
Label lbl = extractTerm(nd, parent, idx);
353+
// The quoted string has exactly one child: the term returned by getTerm().
visit(nd.getTerm(), lbl, 0);
354+
}
347355
}
348356

349357
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {

javascript/extractor/src/com/semmle/js/parser/RegExpParser.java

+46
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.semmle.js.ast.regexp.Caret;
77
import com.semmle.js.ast.regexp.CharacterClass;
88
import com.semmle.js.ast.regexp.CharacterClassEscape;
9+
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
910
import com.semmle.js.ast.regexp.CharacterClassRange;
1011
import com.semmle.js.ast.regexp.Constant;
1112
import com.semmle.js.ast.regexp.ControlEscape;
@@ -283,6 +284,45 @@ private RegExpTerm parseTerm() {
283284
return this.finishTerm(this.parseQuantifierOpt(loc, this.parseAtom()));
284285
}
285286

287+
/**
 * Parses the body of a '\q{}' escape sequence: one or more alternatives
 * separated by '|'.
 *
 * <p>The caller is expected to have consumed the opening "q{" already; this
 * method does not consume the closing brace.
 *
 * @return the sole alternative when no '|' was found, otherwise a Disjunction
 *     over all alternatives in source order
 */
private RegExpTerm parseDisjunctionInsideQuotedString() {
288+
// Start location of the whole disjunction, captured before the first alternative.
SourceLocation loc = new SourceLocation(pos());
289+
List<RegExpTerm> disjuncts = new ArrayList<>();
290+
// There is always at least one alternative, even if it is empty.
disjuncts.add(this.parseAlternativeInsideQuotedString());
291+
// Each successfully matched "|" introduces a further alternative.
while (this.match("|")) {
292+
disjuncts.add(this.parseAlternativeInsideQuotedString());
293+
}
294+
// A single alternative is returned directly rather than wrapped in a Disjunction.
if (disjuncts.size() == 1) return disjuncts.get(0);
295+
return this.finishTerm(new Disjunction(loc, disjuncts));
296+
}
297+
298+
/**
 * Parses one alternative inside a '\q{}' escape sequence as raw text.
 *
 * <p>Characters are consumed until an unescaped '|' or '}' is seen, which is
 * left unconsumed for the caller, or until the end of input, in which case
 * Error.UNEXPECTED_EOS is reported. Backslashes are kept in the collected
 * text, so escape sequences appear verbatim in the result.
 *
 * @return a Constant whose value and source text are both the collected
 *     characters (possibly the empty string)
 */
private RegExpTerm parseAlternativeInsideQuotedString() {
299+
SourceLocation loc = new SourceLocation(pos());
300+
StringBuilder sb = new StringBuilder();
301+
// True when the previously consumed character was a backslash that was not
// itself escaped, i.e. the next character is escaped.
boolean escaped = false;
302+
while (true) {
303+
// If we're at the end of the string, something went wrong.
304+
if (this.atEOS()) {
305+
this.error(Error.UNEXPECTED_EOS);
306+
break;
307+
}
308+
// We can end parsing if we're not escaped and we see a `|` which would mean Alternation
309+
// or `}` which would mean the end of the Quoted String.
// NOTE(review): the null argument presumably makes lookahead match end of input;
// that case is already handled by the atEOS() check above -- confirm.
310+
if(!escaped && this.lookahead(null, "|", "}")){
311+
break;
312+
}
313+
char c = this.nextChar();
314+
// Track whether the character is an escape character.
// A backslash that is itself escaped does not start a new escape.
315+
escaped = !escaped && (c == '\\');
316+
sb.append(c);
317+
}
318+
319+
String literal = sb.toString();
320+
// The location ends at the current position, just before the terminating
// `|` or `}` (or at the end of input).
loc.setEnd(pos());
321+
loc.setSource(literal);
322+
323+
return new Constant(loc, literal);
324+
}
325+
286326
private RegExpTerm parseQuantifierOpt(SourceLocation loc, RegExpTerm atom) {
287327
if (this.match("*")) return this.finishTerm(new Star(loc, atom, !this.match("?")));
288328
if (this.match("+")) return this.finishTerm(new Plus(loc, atom, !this.match("?")));
@@ -427,6 +467,12 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
427467
return this.finishTerm(new NamedBackReference(loc, name, "\\k<" + name + ">"));
428468
}
429469

470+
if (this.match("q{")) {
471+
RegExpTerm term = parseDisjunctionInsideQuotedString();
472+
this.expectRBrace();
473+
return this.finishTerm(new CharacterClassQuotedString(loc, term));
474+
}
475+
430476
if (this.match("p{", "P{")) {
431477
String name = this.readIdentifier();
432478
if (this.match("=")) {

0 commit comments

Comments
 (0)