@@ -38,21 +38,38 @@ import scala.scalajs.js
38
38
* - It computes the start of every group thanks to the groups before it
39
39
* - It builds and returns the mapping of previous group number -> start
40
40
*
41
+ * The `pattern` that is parsed by `GroupStartMapper` is the *compiled* JS
42
+ * pattern produced by `PatternCompiler`, not the original Java pattern. This
43
+ * means that we can simplify a number of things with the knowledge that:
44
+ *
45
+ * - the pattern is well-formed,
46
+ * - it contains no named group or named back references, and
47
+ * - a '\' is always followed by an ASCII character that is:
48
+ * - a digit, for a back reference,
49
+ * - one of `^ $ \ . * + ? ( ) [ ] { } |`, for an escape,
50
+ * - 'b' or 'B' for a word boundary,
51
+ * - 'd' or 'D' for a digit character class (used in `[\d\D]` for any code point), or
52
+ * - 'p' or 'P' followed by a `{}`-enclosed name that contains only ASCII word characters.
53
+ *
41
54
* @author Mikaël Mayer
42
55
*/
43
56
private [regex] class GroupStartMapper private (pattern : String , flags : String ,
44
57
node : GroupStartMapper .Node , groupCount : Int ,
45
- allMatchingRegex : js.RegExp ) {
58
+ jsRegExpForFind : js. RegExp , jsRegExpForMatches : js.RegExp ) {
46
59
47
60
import GroupStartMapper ._
48
61
49
- def apply (string : String , start : Int ): js.Array [Int ] = {
50
- allMatchingRegex.lastIndex = start
51
- val allMatchResult = allMatchingRegex.exec(string)
52
- if (allMatchResult == null ) {
62
+ def apply (forMatches : Boolean , string : String , index : Int ): js.Array [Int ] = {
63
+ val regExp =
64
+ if (forMatches) jsRegExpForMatches
65
+ else jsRegExpForFind
66
+
67
+ regExp.lastIndex = index
68
+ val allMatchResult = regExp.exec(string)
69
+ if (allMatchResult == null || allMatchResult.index != index) {
53
70
throw new AssertionError (
54
- s " [Internal error] Executed ' $allMatchingRegex ' on " +
55
- s " ' $string' at position $start , got an error. \n " +
71
+ s " [Internal error] Executed ' $regExp ' on " +
72
+ s " ' $string' at position $index , got an error. \n " +
56
73
s " Original pattern ' $pattern' with flags ' $flags' did match however. " )
57
74
}
58
75
@@ -65,7 +82,7 @@ private[regex] class GroupStartMapper private (pattern: String, flags: String,
65
82
i += 1
66
83
}
67
84
68
- node.propagateFromStart(allMatchResult, groupStartMap, start )
85
+ node.propagateFromStart(allMatchResult, groupStartMap, index )
69
86
70
87
groupStartMap
71
88
}
@@ -76,10 +93,12 @@ private[regex] object GroupStartMapper {
76
93
val parser = new Parser (pattern)
77
94
val node = parser.parseTopLevel()
78
95
node.setNewGroup(1 )
79
- val allMatchingRegex =
80
- new js.RegExp (node.buildRegex(parser.groupNodeMap), flags)
96
+ val allMatchingPattern = node.buildRegex(parser.groupNodeMap)
97
+ val jsRegExpForFind = new js.RegExp (allMatchingPattern, flags + " g" )
98
+ val jsRegExpForMatches =
99
+ new js.RegExp (Pattern .wrapJSPatternForMatches(allMatchingPattern), flags)
81
100
new GroupStartMapper (pattern, flags, node, parser.parsedGroupCount,
82
- allMatchingRegex )
101
+ jsRegExpForFind, jsRegExpForMatches )
83
102
}
84
103
85
104
/** Node of the regex tree. */
@@ -98,48 +117,36 @@ private[regex] object GroupStartMapper {
98
117
99
118
def buildRegex (groupNodeMap : js.Array [Node ]): String
100
119
101
- /* When assigning group positions. I could not choose between assigning
102
- * group numbers from left to right or from right to left, because there
103
- * both failed in one case each. Normally, both directions give the same
104
- * result. But there are corner cases.
105
- *
106
- * Consider the following regex matching `abbbbbbc`
107
- *
108
- * (?=ab*(c))ab
109
- *
110
- * After conversion, this becomes:
111
- *
112
- * (?=(ab*)(c))(ab)
113
- *
114
- * To recover the position of the group (c), we cannot do anything but
115
- * compute it from the length of (ab*), that is, propagate the start,
116
- * compute the length, and return the end, and this, recursively. This is
117
- * what we need to do in a forward-matching regex.
118
- *
119
- * However, consider the following regex matching `abcbdbe`
120
- *
121
- * a(b.)*
120
+ /* The overall algorithm consists in, given known start and end positions
121
+ * of a parent node, determine the positions of its children. This is done
122
+ * in the main polymorphic method `propagate`, which each node implements.
122
123
*
123
- * After conversion, it is transformed to:
124
+ * For some kinds of parent nodes, even when we know both their start and
125
+ * end positions, we can only determine one side of their children.
126
+ * Obvious examples are look-around nodes. Since they are zero-length,
127
+ * their start and end are always equal, but correspond to different sides
128
+ * of their children:
124
129
*
125
- * (a)((b.)*)
130
+ * - For look-ahead nodes (?=X) and (?!X), they correspond to the *start* of X.
131
+ * - For look-behind nodes (?<=X) and (?<!X), they correspond to the *end* of X.
126
132
*
127
- * The semantics of group matching under a star are that the last match is
128
- * kept. Therefore, we cannot obtain the start position of (b.) from
129
- * propagating lengths from left to right. What we first do is to get the
130
- * start, then the end, of the group `((b.)*)`, and then we propagate this
131
- * end to the inner group.
133
+ * Because of this, we have `propagateFromStart` and `propagateFromEnd`.
134
+ * In either case, since we know the length of the child, we can compute
135
+ * the position of the other side, which then allows to call `propagate`.
132
136
*
133
- * Note that when JavaScript will support back-matching `(?<= )` and
134
- * `(?<! )` (hopefully one day), we will be able to implement the length
135
- * transfer using the `propagateFromEnd` method, because we have no clue on
136
- * where the match started (this is different from the `start` position
137
- * because it can extend before it).
137
+ * In addition to look-around nodes, repeated nodes also have a constraint
138
+ * on the side on which they operate. In X+ and X*, the groups inside X
139
+ * correspond to the *last* iteration of the repetition. When we know the
140
+ * start and end of X*, we only know the *end* of the last iteration of X.
138
141
*
139
- * Zero-length test nodes of the type `(?= )` or `(?! )` do not
140
- * count to propagate the length on the right or on the left.
142
+ * Sequence nodes can choose either direction. We choose to process their
143
+ * children from start to end, and therefore thread the start of the
144
+ * sequence through the children, using the computed end of each child as
145
+ * the start of its next sibling.
141
146
*
142
- * `RepeatedNode`s use the semantics of propagating the end to the start.
147
+ * Other nodes such as alternatives or capturing groups have both the same
148
+ * start and end position as their children, so they can directly call
149
+ * `propagate`.
143
150
*/
144
151
145
152
/** Propagates the start or end position of this node to its descendants.
@@ -152,21 +159,18 @@ private[regex] object GroupStartMapper {
152
159
153
160
/** Propagates the appropriate positions to the descendants of this node
154
161
* from its end position.
155
- *
156
- * @return the start position of this node
157
162
*/
158
163
final def propagateFromEnd (matchResult : js.RegExp .ExecResult ,
159
- groupStartMap : js.Array [Int ], end : Int ): Int = {
164
+ groupStartMap : js.Array [Int ], end : Int ): Unit = {
160
165
161
166
val start = matchResult(newGroup).fold(- 1 )(matched => end - matched.length)
162
167
propagate(matchResult, groupStartMap, start, end)
163
- start
164
168
}
165
169
166
170
/** Propagates the appropriate positions to the descendants of this node
167
171
* from its start position.
168
172
*
169
- * @return the end position of this node
173
+ * @return the end position of this node, as a convenience for `SequenceNode.propagate`
170
174
*/
171
175
final def propagateFromStart (matchResult : js.RegExp .ExecResult ,
172
176
groupStartMap : js.Array [Int ], start : Int ): Int = {
@@ -194,12 +198,16 @@ private[regex] object GroupStartMapper {
194
198
*/
195
199
if (matchResult(newGroup).isDefined)
196
200
groupStartMap(number) = start
197
- inner.propagateFromStart (matchResult, groupStartMap, start)
201
+ inner.propagate (matchResult, groupStartMap, start, end )
198
202
}
199
203
}
200
204
201
- /** A zero-length test of the form `(?= )` or `(?! )`. */
202
- private final class ZeroLengthTestNode (val indicator : String , val inner : Node )
205
+ /** A look-around group of the form `(?= )`, `(?! )`, `(?<= )` or `(?<! )`.
206
+ *
207
+ * Look-aheads propagate from left to right, while look-behinds propagate
208
+ * from right to left.
209
+ */
210
+ private final class LookAroundNode (isLookBehind : Boolean , indicator : String , inner : Node )
203
211
extends Node {
204
212
205
213
override def setNewGroup (newGroupIndex : Int ): Int =
@@ -210,7 +218,10 @@ private[regex] object GroupStartMapper {
210
218
211
219
def propagate (matchResult : js.RegExp .ExecResult ,
212
220
groupStartMap : js.Array [Int ], start : Int , end : Int ): Unit = {
213
- inner.propagateFromStart(matchResult, groupStartMap, start)
221
+ if (isLookBehind)
222
+ inner.propagateFromEnd(matchResult, groupStartMap, end)
223
+ else
224
+ inner.propagateFromStart(matchResult, groupStartMap, start)
214
225
}
215
226
}
216
227
@@ -326,7 +337,7 @@ private[regex] object GroupStartMapper {
326
337
val len = alternatives.length
327
338
var i = 0
328
339
while (i != len) {
329
- alternatives(i).propagateFromStart (matchResult, groupStartMap, start)
340
+ alternatives(i).propagate (matchResult, groupStartMap, start, end )
330
341
i += 1
331
342
}
332
343
}
@@ -362,7 +373,15 @@ private[regex] object GroupStartMapper {
362
373
}
363
374
364
375
while (true ) {
365
- val baseNode = (pattern.charAt(pIndex): @ switch) match {
376
+ /* Parse the pattern by code points if RegExp supports the 'u' flag,
377
+ * in which case PatternCompiler always uses it, or by chars if it
378
+ * doesn't. This distinction is important for repeated surrogate pairs.
379
+ */
380
+ val dispatchCP =
381
+ if (PatternCompiler .Support .supportsUnicode) pattern.codePointAt(pIndex)
382
+ else pattern.charAt(pIndex).toInt
383
+
384
+ val baseNode = (dispatchCP : @ switch) match {
366
385
case '|' =>
367
386
// Complete one alternative
368
387
alternatives.push(completeSequence(sequence))
@@ -384,15 +403,25 @@ private[regex] object GroupStartMapper {
384
403
case '(' =>
385
404
val indicator = pattern.substring(pIndex + 1 , pIndex + 3 )
386
405
if (indicator == " ?=" || indicator == " ?!" ) {
387
- // Non-capturing test group
406
+ // Look-ahead group
388
407
pIndex += 3
389
408
val inner = parseInsideParensAndClosingParen()
390
- new ZeroLengthTestNode (indicator, inner)
409
+ new LookAroundNode (isLookBehind = false , indicator, inner)
410
+ } else if (indicator == " ?<" ) {
411
+ // Look-behind group, which must be ?<= or ?<!
412
+ val fullIndicator = pattern.substring(pIndex + 1 , pIndex + 4 )
413
+ pIndex += 4
414
+ val inner = parseInsideParensAndClosingParen()
415
+ new LookAroundNode (isLookBehind = true , fullIndicator, inner)
391
416
} else if (indicator == " ?:" ) {
392
417
// Non-capturing group
393
418
pIndex += 3
394
419
val inner = parseInsideParensAndClosingParen()
395
- inner
420
+ // Wrap LeafRegexNode's so that they do not merge with their neighbors
421
+ if (inner.isInstanceOf [LeafRegexNode ])
422
+ new SequenceNode (js.Array (inner))
423
+ else
424
+ inner
396
425
} else {
397
426
// Capturing group
398
427
pIndex += 1
@@ -408,26 +437,31 @@ private[regex] object GroupStartMapper {
408
437
@ inline
409
438
def isDigit (c : Char ): Boolean = c >= '0' && c <= '9'
410
439
411
- if (isDigit(pattern.charAt(pIndex + 1 ))) {
440
+ val startIndex = pIndex
441
+ val c = pattern.charAt(startIndex + 1 )
442
+ pIndex += 2
443
+
444
+ if (isDigit(c)) {
412
445
// it is a back reference; parse all following digits
413
- val startIndex = pIndex
414
- pIndex += 2
415
446
while (isDigit(pattern.charAt(pIndex)))
416
447
pIndex += 1
417
448
new BackReferenceNode (
418
449
Integer .parseInt(pattern.substring(startIndex + 1 , pIndex)))
419
450
} else {
420
- // it is a character escape
421
- val e = pattern.substring(pIndex, pIndex + 2 )
422
- pIndex += 2
423
- new LeafRegexNode (e)
451
+ // it is a character escape, or one of \b, \B, \d, \D, \p{...} or \P{...}
452
+ if (c == 'p' || c == 'P' ) {
453
+ while (pattern.charAt(pIndex) != '}' )
454
+ pIndex += 1
455
+ pIndex += 1
456
+ }
457
+ new LeafRegexNode (pattern.substring(startIndex, pIndex))
424
458
}
425
459
426
460
case '[' =>
427
- // parse until the corresponding ']'
461
+ // parse until the corresponding ']' (here surrogate pairs don't matter)
428
462
@ tailrec def loop (pIndex : Int ): Int = {
429
463
pattern.charAt(pIndex) match {
430
- case '\\ ' => loop(pIndex + 2 )
464
+ case '\\ ' => loop(pIndex + 2 ) // this is also fine for \p{...} and \P{...}
431
465
case ']' => pIndex + 1
432
466
case _ => loop(pIndex + 1 )
433
467
}
@@ -439,9 +473,9 @@ private[regex] object GroupStartMapper {
439
473
new LeafRegexNode (regex)
440
474
441
475
case _ =>
442
- val e = pattern.substring( pIndex, pIndex + 1 )
443
- pIndex += 1
444
- new LeafRegexNode (e )
476
+ val start = pIndex
477
+ pIndex += Character .charCount(dispatchCP)
478
+ new LeafRegexNode (pattern.substring(start, pIndex) )
445
479
}
446
480
447
481
if (baseNode ne null ) { // null if we just completed an alternative
0 commit comments