Skip to content

Commit ca4e64e

Browse files
committed
Regex: Store start and end for the indices of matched groups.
And `undefined` for unmatched groups. In addition, store the start and end of the whole match at index 0. The indices array is cached directly as a field `indices` on the `MatchResult`. This allows the cache to be better shared between a `Matcher` and a `MatchResult`. This corresponds to the format of the `indices` field of the `regexp-match-indices` ECMAScript proposal.
1 parent 85b9d6f commit ca4e64e

File tree

3 files changed

+77
-97
lines changed

3 files changed

+77
-97
lines changed

javalib/src/main/scala/java/util/regex/GroupStartMapper.scala renamed to javalib/src/main/scala/java/util/regex/IndicesBuilder.scala

Lines changed: 48 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,18 @@ package java.util.regex
1414

1515
import scala.annotation.{tailrec, switch}
1616

17-
import java.util.HashMap
18-
1917
import scala.scalajs.js
2018

21-
/** The goal of a `GroupStartMapper` is to retrieve the start position of each
22-
* group of a matching regular expression where only the strings of the
23-
* matched groups are known.
19+
import Pattern.IndicesArray
20+
21+
/** The goal of a `IndicesBuilder` is to retrieve the start and end positions
22+
* of each group of a matching regular expression.
23+
*
24+
* This is essentially a polyfill for the 'd' flag of `js.RegExp`, which is
25+
* a Stage 4 proposal scheduled for inclusion in ECMAScript 2022. Without that
26+
* flag, `js.RegExp` only provides the substrings matched by capturing groups,
27+
* but not their positions. We implement the positions on top of that.
28+
*
2429
* For that, we use the following observation:
2530
* If the regex /A(B)\1/ matches a string at a given index,
2631
* then /(A)(B)\2/ matches the same string at the same index.
@@ -38,7 +43,7 @@ import scala.scalajs.js
3843
* - It computes the start of every group thanks to the groups before it
3944
* - It builds and returns the mapping of previous group number -> start
4045
*
41-
* The `pattern` that is parsed by `GroupStartMapper` is the *compiled* JS
46+
* The `pattern` that is parsed by `IndicesBuilder` is the *compiled* JS
4247
* pattern produced by `PatternCompiler`, not the original Java pattern. This
4348
* means that we can simplify a number of things with the knowledge that:
4449
*
@@ -53,13 +58,13 @@ import scala.scalajs.js
5358
*
5459
* @author Mikaël Mayer
5560
*/
56-
private[regex] class GroupStartMapper private (pattern: String, flags: String,
57-
node: GroupStartMapper.Node, groupCount: Int,
61+
private[regex] class IndicesBuilder private (pattern: String, flags: String,
62+
node: IndicesBuilder.Node, groupCount: Int,
5863
jsRegExpForFind: js.RegExp, jsRegExpForMatches: js.RegExp) {
5964

60-
import GroupStartMapper._
65+
import IndicesBuilder._
6166

62-
def apply(forMatches: Boolean, string: String, index: Int): js.Array[Int] = {
67+
def apply(forMatches: Boolean, string: String, index: Int): IndicesArray = {
6368
val regExp =
6469
if (forMatches) jsRegExpForMatches
6570
else jsRegExpForFind
@@ -73,31 +78,35 @@ private[regex] class GroupStartMapper private (pattern: String, flags: String,
7378
s"Original pattern '$pattern' with flags '$flags' did match however.")
7479
}
7580

76-
// Prepare a `groupStartMap` array with the correct length filled with -1
77-
val len = groupCount + 1 // index 0 is not used
78-
val groupStartMap = new js.Array[Int](len)
79-
var i = 0
81+
val start = index // by definition
82+
val end = start + allMatchResult(0).get.length()
83+
84+
// Prepare an `indices` array with the correct length filled with undefined
85+
val len = groupCount + 1 // index 0 is the whole match
86+
val indices = new IndicesArray(len)
87+
indices(0) = js.Tuple2(start, end)
88+
var i = 1
8089
while (i != len) {
81-
groupStartMap(i) = -1
90+
indices(i) = js.undefined
8291
i += 1
8392
}
8493

85-
node.propagateFromStart(allMatchResult, groupStartMap, index)
94+
node.propagate(allMatchResult, indices, start, end)
8695

87-
groupStartMap
96+
indices
8897
}
8998
}
9099

91-
private[regex] object GroupStartMapper {
92-
def apply(pattern: String, flags: String): GroupStartMapper = {
100+
private[regex] object IndicesBuilder {
101+
def apply(pattern: String, flags: String): IndicesBuilder = {
93102
val parser = new Parser(pattern)
94103
val node = parser.parseTopLevel()
95104
node.setNewGroup(1)
96105
val allMatchingPattern = node.buildRegex(parser.groupNodeMap)
97106
val jsRegExpForFind = new js.RegExp(allMatchingPattern, flags + "g")
98107
val jsRegExpForMatches =
99108
new js.RegExp(Pattern.wrapJSPatternForMatches(allMatchingPattern), flags)
100-
new GroupStartMapper(pattern, flags, node, parser.parsedGroupCount,
109+
new IndicesBuilder(pattern, flags, node, parser.parsedGroupCount,
101110
jsRegExpForFind, jsRegExpForMatches)
102111
}
103112

@@ -155,16 +164,16 @@ private[regex] object GroupStartMapper {
155164
* `end`, while other nodes propagate the `start`.
156165
*/
157166
def propagate(matchResult: js.RegExp.ExecResult,
158-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit
167+
indices: IndicesArray, start: Int, end: Int): Unit
159168

160169
/** Propagates the appropriate positions to the descendants of this node
161170
* from its end position.
162171
*/
163172
final def propagateFromEnd(matchResult: js.RegExp.ExecResult,
164-
groupStartMap: js.Array[Int], end: Int): Unit = {
173+
indices: IndicesArray, end: Int): Unit = {
165174

166175
val start = matchResult(newGroup).fold(-1)(matched => end - matched.length)
167-
propagate(matchResult, groupStartMap, start, end)
176+
propagate(matchResult, indices, start, end)
168177
}
169178

170179
/** Propagates the appropriate positions to the descendants of this node
@@ -173,10 +182,10 @@ private[regex] object GroupStartMapper {
173182
* @return the end position of this node, as a convenience for `SequenceNode.propagate`
174183
*/
175184
final def propagateFromStart(matchResult: js.RegExp.ExecResult,
176-
groupStartMap: js.Array[Int], start: Int): Int = {
185+
indices: IndicesArray, start: Int): Int = {
177186

178187
val end = matchResult(newGroup).fold(-1)(matched => start + matched.length)
179-
propagate(matchResult, groupStartMap, start, end)
188+
propagate(matchResult, indices, start, end)
180189
end
181190
}
182191
}
@@ -190,15 +199,15 @@ private[regex] object GroupStartMapper {
190199
"(" + inner.buildRegex(groupNodeMap) + ")"
191200

192201
def propagate(matchResult: js.RegExp.ExecResult,
193-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
202+
indices: IndicesArray, start: Int, end: Int): Unit = {
194203
/* #3901: A GroupNode within a negative look-ahead node may receive
195204
* `start != -1` from above, yet not match anything itself. We must
196205
* always keep the default `-1` if this group node does not match
197206
* anything.
198207
*/
199208
if (matchResult(newGroup).isDefined)
200-
groupStartMap(number) = start
201-
inner.propagate(matchResult, groupStartMap, start, end)
209+
indices(number) = js.Tuple2(start, end)
210+
inner.propagate(matchResult, indices, start, end)
202211
}
203212
}
204213

@@ -217,11 +226,11 @@ private[regex] object GroupStartMapper {
217226
"((" + indicator + inner.buildRegex(groupNodeMap) + "))"
218227

219228
def propagate(matchResult: js.RegExp.ExecResult,
220-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
229+
indices: IndicesArray, start: Int, end: Int): Unit = {
221230
if (isLookBehind)
222-
inner.propagateFromEnd(matchResult, groupStartMap, end)
231+
inner.propagateFromEnd(matchResult, indices, end)
223232
else
224-
inner.propagateFromStart(matchResult, groupStartMap, start)
233+
inner.propagateFromStart(matchResult, indices, start)
225234
}
226235
}
227236

@@ -236,8 +245,8 @@ private[regex] object GroupStartMapper {
236245
"(" + inner.buildRegex(groupNodeMap) + repeater + ")"
237246

238247
def propagate(matchResult: js.RegExp.ExecResult,
239-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
240-
inner.propagateFromEnd(matchResult, groupStartMap, end)
248+
indices: IndicesArray, start: Int, end: Int): Unit = {
249+
inner.propagateFromEnd(matchResult, indices, end)
241250
}
242251
}
243252

@@ -247,7 +256,7 @@ private[regex] object GroupStartMapper {
247256
"(" + regex + ")"
248257

249258
def propagate(matchResult: js.RegExp.ExecResult,
250-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
259+
indices: IndicesArray, start: Int, end: Int): Unit = {
251260
// nothing to do
252261
}
253262
}
@@ -262,7 +271,7 @@ private[regex] object GroupStartMapper {
262271
}
263272

264273
def propagate(matchResult: js.RegExp.ExecResult,
265-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
274+
indices: IndicesArray, start: Int, end: Int): Unit = {
266275
// nothing to do
267276
}
268277
}
@@ -292,13 +301,13 @@ private[regex] object GroupStartMapper {
292301
}
293302

294303
def propagate(matchResult: js.RegExp.ExecResult,
295-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
304+
indices: IndicesArray, start: Int, end: Int): Unit = {
296305
val len = sequence.length
297306
var i = 0
298307
var nextStart = start
299308
while (i != len) {
300309
nextStart =
301-
sequence(i).propagateFromStart(matchResult, groupStartMap, nextStart)
310+
sequence(i).propagateFromStart(matchResult, indices, nextStart)
302311
i += 1
303312
}
304313
}
@@ -333,11 +342,11 @@ private[regex] object GroupStartMapper {
333342
}
334343

335344
def propagate(matchResult: js.RegExp.ExecResult,
336-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
345+
indices: IndicesArray, start: Int, end: Int): Unit = {
337346
val len = alternatives.length
338347
var i = 0
339348
while (i != len) {
340-
alternatives(i).propagate(matchResult, groupStartMap, start, end)
349+
alternatives(i).propagate(matchResult, indices, start, end)
341350
i += 1
342351
}
343352
}

javalib/src/main/scala/java/util/regex/Matcher.scala

Lines changed: 18 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212

1313
package java.util.regex
1414

15-
import scala.language.implicitConversions
16-
1715
import scala.annotation.switch
1816

1917
import scala.scalajs.js
2018

19+
import Pattern.IndicesArray
20+
2121
final class Matcher private[regex] (
2222
private var pattern0: Pattern, private var input0: String)
2323
extends AnyRef with MatchResult {
@@ -67,7 +67,6 @@ final class Matcher private[regex] (
6767
position = inputstr.length() + 1
6868
}
6969
lastMatchIsForMatches = false
70-
startOfGroupCache = null
7170
lastMatch ne null
7271
}
7372

@@ -151,7 +150,6 @@ final class Matcher private[regex] (
151150
position = 0
152151
lastMatch = null
153152
appendPos = 0
154-
startOfGroupCache = null
155153
this
156154
}
157155

@@ -172,7 +170,6 @@ final class Matcher private[regex] (
172170
// note that `position` and `appendPos` are left unchanged
173171
pattern0 = pattern
174172
lastMatch = null
175-
startOfGroupCache = null
176173
this
177174
}
178175

@@ -190,11 +187,11 @@ final class Matcher private[regex] (
190187
def end(): Int = start() + group().length
191188
def group(): String = ensureLastMatch(0).get
192189

193-
private def startInternal(compiledGroup: Int): Int = {
194-
val s = startOfGroup(compiledGroup)
195-
if (s == -1) -1
196-
else s + regionStart()
197-
}
190+
private def indices: IndicesArray =
191+
pattern().getIndices(ensureLastMatch, lastMatchIsForMatches)
192+
193+
private def startInternal(compiledGroup: Int): Int =
194+
indices(compiledGroup).fold(-1)(_._1 + regionStart())
198195

199196
def start(group: Int): Int = {
200197
if (group == 0) start()
@@ -204,11 +201,8 @@ final class Matcher private[regex] (
204201
def start(name: String): Int =
205202
startInternal(pattern().namedGroup(name))
206203

207-
private def endInternal(compiledGroup: Int): Int = {
208-
val s = startOfGroup(compiledGroup)
209-
if (s == -1) -1
210-
else s + ensureLastMatch(compiledGroup).get.length + regionStart()
211-
}
204+
private def endInternal(compiledGroup: Int): Int =
205+
indices(compiledGroup).fold(-1)(_._2 + regionStart())
212206

213207
def end(group: Int): Int =
214208
if (group == 0) end()
@@ -225,10 +219,8 @@ final class Matcher private[regex] (
225219

226220
// Seal the state
227221

228-
def toMatchResult(): MatchResult = {
229-
new SealedResult(inputstr, lastMatch, lastMatchIsForMatches, pattern(),
230-
regionStart(), startOfGroupCache)
231-
}
222+
def toMatchResult(): MatchResult =
223+
new SealedResult(lastMatch, lastMatchIsForMatches, pattern(), regionStart())
232224

233225
// Other query state methods
234226

@@ -255,16 +247,6 @@ final class Matcher private[regex] (
255247

256248
def hasAnchoringBounds(): Boolean = true
257249
//def useAnchoringBounds(b: Boolean): Matcher
258-
259-
// Lazily computed by `startOfGroup`, reset every time `lastMatch` changes
260-
private var startOfGroupCache: js.Array[Int] = _
261-
262-
/** Returns a mapping from the group number to the respective start position. */
263-
private def startOfGroup: js.Array[Int] = {
264-
if (startOfGroupCache eq null)
265-
startOfGroupCache = pattern0.groupStartMapper(lastMatchIsForMatches, inputstr, ensureLastMatch.index)
266-
startOfGroupCache
267-
}
268250
}
269251

270252
object Matcher {
@@ -282,10 +264,8 @@ object Matcher {
282264
result
283265
}
284266

285-
private final class SealedResult(inputstr: String,
286-
lastMatch: js.RegExp.ExecResult, lastMatchIsForMatches: Boolean,
287-
pattern: Pattern, regionStart: Int,
288-
private var startOfGroupCache: js.Array[Int])
267+
private final class SealedResult(lastMatch: js.RegExp.ExecResult,
268+
lastMatchIsForMatches: Boolean, pattern: Pattern, regionStart: Int)
289269
extends MatchResult {
290270

291271
def groupCount(): Int = pattern.groupCount
@@ -294,36 +274,18 @@ object Matcher {
294274
def end(): Int = start() + group().length
295275
def group(): String = ensureLastMatch(0).get
296276

297-
private def startOfGroup: js.Array[Int] = {
298-
if (startOfGroupCache eq null)
299-
startOfGroupCache = pattern.groupStartMapper(lastMatchIsForMatches, inputstr, ensureLastMatch.index)
300-
startOfGroupCache
301-
}
277+
private def indices: IndicesArray =
278+
pattern.getIndices(ensureLastMatch, lastMatchIsForMatches)
302279

303280
/* Note that MatchResult does *not* define the named versions of `group`,
304281
* `start` and `end`, so we don't have them here either.
305282
*/
306283

307-
private def startInternal(compiledGroup: Int): Int = {
308-
val s = startOfGroup(compiledGroup)
309-
if (s == -1) -1
310-
else s + regionStart
311-
}
312-
313-
def start(group: Int): Int = {
314-
if (group == 0) start()
315-
else startInternal(pattern.numberedGroup(group))
316-
}
317-
318-
private def endInternal(compiledGroup: Int): Int = {
319-
val s = startOfGroup(compiledGroup)
320-
if (s == -1) -1
321-
else s + ensureLastMatch(compiledGroup).get.length + regionStart
322-
}
284+
def start(group: Int): Int =
285+
indices(pattern.numberedGroup(group)).fold(-1)(_._1 + regionStart)
323286

324287
def end(group: Int): Int =
325-
if (group == 0) end()
326-
else endInternal(pattern.numberedGroup(group))
288+
indices(pattern.numberedGroup(group)).fold(-1)(_._2 + regionStart)
327289

328290
def group(group: Int): String =
329291
ensureLastMatch(pattern.numberedGroup(group)).orNull

0 commit comments

Comments
 (0)