Skip to content

Commit ed71c83

Browse files
committed
Fix #1201: Correct regex support.
Previously, `java.util.regex.Pattern` was implemented without much concern for correctness wrt. the Java semantics of regular expressions. Patterns were passed through to the native `RegExp` with minimal preprocessing. This could cause several kinds of incompatibilities: - Throwing `ParseError`s for features not supported by JS regexes, - Or worse, silently compile with different semantics. In this commit, we correctly implement virtually all the features of Java regular expressions by compiling Java patterns down to JS patterns with the same semantics. This change introduces a significant code size regression for code bases that were already using `Pattern` and/or Scala's `Regex`. Therefore, we went to great lengths to optimize the compiler for code size, in particular in the default ES 2015 configuration. This means that some code is not as idiomatic as it could be. The impact of this commit on a typical output is approximately 65 KB for fastOpt and 12 KB for fullOpt. The `README.md` file contains an extensive explanation of the design of the compiler, and of how it compiles each kind of Java pattern. In addition to fixing the mega-issue #1201, this commit fixes the smaller issues #105, #1677, #1847, #2082 and #3959, which had been closed as duplicates of #1201.
1 parent df81197 commit ed71c83

File tree

41 files changed

+5441
-343
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+5441
-343
lines changed

javalib/src/main/scala/java/util/regex/GroupStartMapper.scala

Lines changed: 107 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,38 @@ import scala.scalajs.js
3838
* - It computes the start of every group thanks to the groups before it
3939
* - It builds and returns the mapping of previous group number -> start
4040
*
41+
* The `pattern` that is parsed by `GroupStartMapper` is the *compiled* JS
42+
* pattern produced by `PatternCompiler`, not the original Java pattern. This
43+
* means that we can simplify a number of things with the knowledge that:
44+
*
45+
* - the pattern is well-formed,
46+
* - it contains no named group or named back references, and
47+
* - a '\' is always followed by an ASCII character that is:
48+
* - a digit, for a back reference,
49+
* - one of `^ $ \ . * + ? ( ) [ ] { } |`, for an escape,
50+
* - 'b' or 'B' for a word boundary,
51+
* - 'd' or 'D' for a digit character class (used in `[\d\D]` for any code point), or
52+
* - 'p' or 'P' followed by a `{}`-enclosed name that contains only ASCII word characters.
53+
*
4154
* @author Mikaël Mayer
4255
*/
4356
private[regex] class GroupStartMapper private (pattern: String, flags: String,
4457
node: GroupStartMapper.Node, groupCount: Int,
45-
allMatchingRegex: js.RegExp) {
58+
jsRegExpForFind: js.RegExp, jsRegExpForMatches: js.RegExp) {
4659

4760
import GroupStartMapper._
4861

49-
def apply(string: String, start: Int): js.Array[Int] = {
50-
allMatchingRegex.lastIndex = start
51-
val allMatchResult = allMatchingRegex.exec(string)
52-
if (allMatchResult == null) {
62+
def apply(forMatches: Boolean, string: String, index: Int): js.Array[Int] = {
63+
val regExp =
64+
if (forMatches) jsRegExpForMatches
65+
else jsRegExpForFind
66+
67+
regExp.lastIndex = index
68+
val allMatchResult = regExp.exec(string)
69+
if (allMatchResult == null || allMatchResult.index != index) {
5370
throw new AssertionError(
54-
s"[Internal error] Executed '$allMatchingRegex' on " +
55-
s"'$string' at position $start, got an error.\n" +
71+
s"[Internal error] Executed '$regExp' on " +
72+
s"'$string' at position $index, got an error.\n" +
5673
s"Original pattern '$pattern' with flags '$flags' did match however.")
5774
}
5875

@@ -65,7 +82,7 @@ private[regex] class GroupStartMapper private (pattern: String, flags: String,
6582
i += 1
6683
}
6784

68-
node.propagateFromStart(allMatchResult, groupStartMap, start)
85+
node.propagateFromStart(allMatchResult, groupStartMap, index)
6986

7087
groupStartMap
7188
}
@@ -76,10 +93,12 @@ private[regex] object GroupStartMapper {
7693
val parser = new Parser(pattern)
7794
val node = parser.parseTopLevel()
7895
node.setNewGroup(1)
79-
val allMatchingRegex =
80-
new js.RegExp(node.buildRegex(parser.groupNodeMap), flags)
96+
val allMatchingPattern = node.buildRegex(parser.groupNodeMap)
97+
val jsRegExpForFind = new js.RegExp(allMatchingPattern, flags + "g")
98+
val jsRegExpForMatches =
99+
new js.RegExp(Pattern.wrapJSPatternForMatches(allMatchingPattern), flags)
81100
new GroupStartMapper(pattern, flags, node, parser.parsedGroupCount,
82-
allMatchingRegex)
101+
jsRegExpForFind, jsRegExpForMatches)
83102
}
84103

85104
/** Node of the regex tree. */
@@ -98,48 +117,36 @@ private[regex] object GroupStartMapper {
98117

99118
def buildRegex(groupNodeMap: js.Array[Node]): String
100119

101-
/* When assigning group positions. I could not choose between assigning
102-
* group numbers from left to right or from right to left, because there
103-
* both failed in one case each. Normally, both directions give the same
104-
* result. But there are corner cases.
105-
*
106-
* Consider the following regex matching `abbbbbbc`
107-
*
108-
* (?=ab*(c))ab
109-
*
110-
* After conversion, this becomes:
111-
*
112-
* (?=(ab*)(c))(ab)
113-
*
114-
* To recover the position of the group (c), we cannot do anything but
115-
* compute it from the length of (ab*), that is, propagate the start,
116-
* compute the length, and return the end, and this, recursively. This is
117-
* what we need to do in a forward-matching regex.
118-
*
119-
* However, consider the following regex matching `abcbdbe`
120-
*
121-
* a(b.)*
120+
/* The overall algorithm consists in, given known start and end positions
121+
* of a parent node, determine the positions of its children. This is done
122+
* in the main polymorphic method `propagate`, which each node implements.
122123
*
123-
* After conversion, it is transformed to:
124+
* For some kinds of parent nodes, even when we know both their start and
125+
* end positions, we can only determine one side of their children.
126+
* Obvious examples are look-around nodes. Since they are zero-length,
127+
* their start and end are always equal, but correspond to different sides
128+
* of their children:
124129
*
125-
* (a)((b.)*)
130+
* - For look-ahead nodes (?=X) and (?!X), they correspond to the *start* of X.
131+
* - For look-behind nodes (?<=X) and (?<!X), they correspond to the *end* of X.
126132
*
127-
* The semantics of group matching under a star are that the last match is
128-
* kept. Therefore, we cannot obtain the start position of (b.) from
129-
* propagating lengths from left to right. What we first do is to get the
130-
* start, then the end, of the group `((b.)*)`, and then we propagate this
131-
* end to the inner group.
133+
* Because of this, we have `propagateFromStart` and `propagateFromEnd`.
134+
* In either case, since we know the length of the child, we can compute
135+
* the position of the other side, which then allows to call `propagate`.
132136
*
133-
* Note that when JavaScript will support back-matching `(?<= )` and
134-
* `(?<! )` (hopefully one day), we will be able to implement the length
135-
* transfer using the `propagateFromEnd` method, because we have no clue on
136-
* where the match started (this is different from the `start` position
137-
* because it can extend before it).
137+
* In addition to look-around nodes, repeated nodes also have a constraint
138+
* on the side on which they operate. In X+ and X*, the groups inside X
139+
* correspond to the *last* iteration of the repetition. When we know the
140+
* start and end of X*, we only know the *end* of the last iteration of X.
138141
*
139-
* Zero-length test nodes of the type `(?= )` or `(?! )` do not
140-
* count to propagate the length on the right or on the left.
142+
* Sequence nodes can choose either direction. We choose to process their
143+
* children from start to end, and therefore thread the start of the
144+
* sequence through the children, using the computed end of each child as
145+
* the start of its next sibling.
141146
*
142-
* `RepeatedNode`s use the semantics of propagating the end to the start.
147+
* Other nodes such as alternatives or capturing groups have both the same
148+
* start and end position as their children, so they can directly call
149+
* `propagate`.
143150
*/
144151

145152
/** Propagates the start or end position of this node to its descendants.
@@ -152,21 +159,18 @@ private[regex] object GroupStartMapper {
152159

153160
/** Propagates the appropriate positions to the descendants of this node
154161
* from its end position.
155-
*
156-
* @return the start position of this node
157162
*/
158163
final def propagateFromEnd(matchResult: js.RegExp.ExecResult,
159-
groupStartMap: js.Array[Int], end: Int): Int = {
164+
groupStartMap: js.Array[Int], end: Int): Unit = {
160165

161166
val start = matchResult(newGroup).fold(-1)(matched => end - matched.length)
162167
propagate(matchResult, groupStartMap, start, end)
163-
start
164168
}
165169

166170
/** Propagates the appropriate positions to the descendants of this node
167171
* from its start position.
168172
*
169-
* @return the end position of this node
173+
* @return the end position of this node, as a convenience for `SequenceNode.propagate`
170174
*/
171175
final def propagateFromStart(matchResult: js.RegExp.ExecResult,
172176
groupStartMap: js.Array[Int], start: Int): Int = {
@@ -194,12 +198,16 @@ private[regex] object GroupStartMapper {
194198
*/
195199
if (matchResult(newGroup).isDefined)
196200
groupStartMap(number) = start
197-
inner.propagateFromStart(matchResult, groupStartMap, start)
201+
inner.propagate(matchResult, groupStartMap, start, end)
198202
}
199203
}
200204

201-
/** A zero-length test of the form `(?= )` or `(?! )`. */
202-
private final class ZeroLengthTestNode(val indicator: String, val inner: Node)
205+
/** A look-around group of the form `(?= )`, `(?! )`, `(?<= )` or `(?<! )`.
206+
*
207+
* Look-aheads propagate from left to right, while look-behinds propagate
208+
* from right to left.
209+
*/
210+
private final class LookAroundNode(isLookBehind: Boolean, indicator: String, inner: Node)
203211
extends Node {
204212

205213
override def setNewGroup(newGroupIndex: Int): Int =
@@ -210,7 +218,10 @@ private[regex] object GroupStartMapper {
210218

211219
def propagate(matchResult: js.RegExp.ExecResult,
212220
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
213-
inner.propagateFromStart(matchResult, groupStartMap, start)
221+
if (isLookBehind)
222+
inner.propagateFromEnd(matchResult, groupStartMap, end)
223+
else
224+
inner.propagateFromStart(matchResult, groupStartMap, start)
214225
}
215226
}
216227

@@ -326,7 +337,7 @@ private[regex] object GroupStartMapper {
326337
val len = alternatives.length
327338
var i = 0
328339
while (i != len) {
329-
alternatives(i).propagateFromStart(matchResult, groupStartMap, start)
340+
alternatives(i).propagate(matchResult, groupStartMap, start, end)
330341
i += 1
331342
}
332343
}
@@ -362,7 +373,15 @@ private[regex] object GroupStartMapper {
362373
}
363374

364375
while (true) {
365-
val baseNode = (pattern.charAt(pIndex): @switch) match {
376+
/* Parse the pattern by code points if RegExp supports the 'u' flag,
377+
* in which case PatternCompiler always uses it, or by chars if it
378+
* doesn't. This distinction is important for repeated surrogate pairs.
379+
*/
380+
val dispatchCP =
381+
if (PatternCompiler.Support.supportsUnicode) pattern.codePointAt(pIndex)
382+
else pattern.charAt(pIndex).toInt
383+
384+
val baseNode = (dispatchCP: @switch) match {
366385
case '|' =>
367386
// Complete one alternative
368387
alternatives.push(completeSequence(sequence))
@@ -384,15 +403,25 @@ private[regex] object GroupStartMapper {
384403
case '(' =>
385404
val indicator = pattern.substring(pIndex + 1, pIndex + 3)
386405
if (indicator == "?=" || indicator == "?!") {
387-
// Non-capturing test group
406+
// Look-ahead group
388407
pIndex += 3
389408
val inner = parseInsideParensAndClosingParen()
390-
new ZeroLengthTestNode(indicator, inner)
409+
new LookAroundNode(isLookBehind = false, indicator, inner)
410+
} else if (indicator == "?<") {
411+
// Look-behind group, which must be ?<= or ?<!
412+
val fullIndicator = pattern.substring(pIndex + 1, pIndex + 4)
413+
pIndex += 4
414+
val inner = parseInsideParensAndClosingParen()
415+
new LookAroundNode(isLookBehind = true, fullIndicator, inner)
391416
} else if (indicator == "?:") {
392417
// Non-capturing group
393418
pIndex += 3
394419
val inner = parseInsideParensAndClosingParen()
395-
inner
420+
// Wrap LeafRegexNode's so that they do not merge with their neighbors
421+
if (inner.isInstanceOf[LeafRegexNode])
422+
new SequenceNode(js.Array(inner))
423+
else
424+
inner
396425
} else {
397426
// Capturing group
398427
pIndex += 1
@@ -408,26 +437,31 @@ private[regex] object GroupStartMapper {
408437
@inline
409438
def isDigit(c: Char): Boolean = c >= '0' && c <= '9'
410439

411-
if (isDigit(pattern.charAt(pIndex + 1))) {
440+
val startIndex = pIndex
441+
val c = pattern.charAt(startIndex + 1)
442+
pIndex += 2
443+
444+
if (isDigit(c)) {
412445
// it is a back reference; parse all following digits
413-
val startIndex = pIndex
414-
pIndex += 2
415446
while (isDigit(pattern.charAt(pIndex)))
416447
pIndex += 1
417448
new BackReferenceNode(
418449
Integer.parseInt(pattern.substring(startIndex + 1, pIndex)))
419450
} else {
420-
// it is a character escape
421-
val e = pattern.substring(pIndex, pIndex + 2)
422-
pIndex += 2
423-
new LeafRegexNode(e)
451+
// it is a character escape, or one of \b, \B, \d, \D, \p{...} or \P{...}
452+
if (c == 'p' || c == 'P') {
453+
while (pattern.charAt(pIndex) != '}')
454+
pIndex += 1
455+
pIndex += 1
456+
}
457+
new LeafRegexNode(pattern.substring(startIndex, pIndex))
424458
}
425459

426460
case '[' =>
427-
// parse until the corresponding ']'
461+
// parse until the corresponding ']' (here surrogate pairs don't matter)
428462
@tailrec def loop(pIndex: Int): Int = {
429463
pattern.charAt(pIndex) match {
430-
case '\\' => loop(pIndex + 2)
464+
case '\\' => loop(pIndex + 2) // this is also fine for \p{...} and \P{...}
431465
case ']' => pIndex + 1
432466
case _ => loop(pIndex + 1)
433467
}
@@ -439,9 +473,9 @@ private[regex] object GroupStartMapper {
439473
new LeafRegexNode(regex)
440474

441475
case _ =>
442-
val e = pattern.substring(pIndex, pIndex + 1)
443-
pIndex += 1
444-
new LeafRegexNode(e)
476+
val start = pIndex
477+
pIndex += Character.charCount(dispatchCP)
478+
new LeafRegexNode(pattern.substring(start, pIndex))
445479
}
446480

447481
if (baseNode ne null) { // null if we just completed an alternative

0 commit comments

Comments
 (0)