Skip to content

Commit deb2479

Browse files
committed
Regex: Store start and end for the indices of matched groups.
And `undefined` for unmatched groups. In addition, store the start and end of the whole match at index 0. The indices array is cached directly as a field `indices` on the `MatchResult`. This allows the cache to be better shared between a `Matcher` and a `MatchResult`. This corresponds to the format of the `indices` field of the `regexp-match-indices` ECMAScript proposal.
1 parent d47b40e commit deb2479

File tree

3 files changed

+77
-97
lines changed

3 files changed

+77
-97
lines changed

javalib/src/main/scala/java/util/regex/GroupStartMapper.scala renamed to javalib/src/main/scala/java/util/regex/IndicesBuilder.scala

Lines changed: 48 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,18 @@ package java.util.regex
1414

1515
import scala.annotation.{tailrec, switch}
1616

17-
import java.util.HashMap
18-
1917
import scala.scalajs.js
2018

21-
/** The goal of a `GroupStartMapper` is to retrieve the start position of each
22-
* group of a matching regular expression where only the strings of the
23-
* matched groups are known.
19+
import Pattern.IndicesArray
20+
21+
/** The goal of a `IndicesBuilder` is to retrieve the start and end positions
22+
* of each group of a matching regular expression.
23+
*
24+
* This is essentially a polyfill for the 'd' flag of `js.RegExp`, which
25+
* should make its way into ECMAScript 2021. Without that flag, `js.RegExp`
26+
* only provides the substrings matched by capturing groups, but not their
27+
* positions. We implement the positions on top of that.
28+
*
2429
* For that, we use the following observation:
2530
* If the regex /A(B)\1/ matches a string at a given index,
2631
* then /(A)(B)\2/ matches the same string at the same index.
@@ -38,7 +43,7 @@ import scala.scalajs.js
3843
* - It computes the start of every group thanks to the groups before it
3944
* - It builds and returns the mapping of previous group number -> start
4045
*
41-
* The `pattern` that is parsed by `GroupStartMapper` is the *compiled* JS
46+
* The `pattern` that is parsed by `IndicesBuilder` is the *compiled* JS
4247
* pattern produced by `PatternCompiler`, not the original Java pattern. This
4348
* means that we can simplify a number of things with the knowledge that:
4449
*
@@ -53,13 +58,13 @@ import scala.scalajs.js
5358
*
5459
* @author Mikaël Mayer
5560
*/
56-
private[regex] class GroupStartMapper private (pattern: String, flags: String,
57-
node: GroupStartMapper.Node, groupCount: Int,
61+
private[regex] class IndicesBuilder private (pattern: String, flags: String,
62+
node: IndicesBuilder.Node, groupCount: Int,
5863
jsRegExpForFind: js.RegExp, jsRegExpForMatches: js.RegExp) {
5964

60-
import GroupStartMapper._
65+
import IndicesBuilder._
6166

62-
def apply(forMatches: Boolean, string: String, index: Int): js.Array[Int] = {
67+
def apply(forMatches: Boolean, string: String, index: Int): IndicesArray = {
6368
val regExp =
6469
if (forMatches) jsRegExpForMatches
6570
else jsRegExpForFind
@@ -73,30 +78,34 @@ private[regex] class GroupStartMapper private (pattern: String, flags: String,
7378
s"Original pattern '$pattern' with flags '$flags' did match however.")
7479
}
7580

76-
// Prepare a `groupStartMap` array with the correct length filled with -1
77-
val len = groupCount + 1 // index 0 is not used
78-
val groupStartMap = new js.Array[Int](len)
79-
var i = 0
81+
val start = index // by definition
82+
val end = start + allMatchResult(0).get.length()
83+
84+
// Prepare an `indices` array with the correct length filled with undefined
85+
val len = groupCount + 1 // index 0 is the whole match
86+
val indices = new IndicesArray(len)
87+
indices(0) = js.Tuple2(start, end)
88+
var i = 1
8089
while (i != len) {
81-
groupStartMap(i) = -1
90+
indices(i) = js.undefined
8291
i += 1
8392
}
8493

85-
node.propagateFromStart(allMatchResult, groupStartMap, index)
94+
node.propagate(allMatchResult, indices, start, end)
8695

87-
groupStartMap
96+
indices
8897
}
8998
}
9099

91-
private[regex] object GroupStartMapper {
92-
def apply(pattern: String, flags: String): GroupStartMapper = {
100+
private[regex] object IndicesBuilder {
101+
def apply(pattern: String, flags: String): IndicesBuilder = {
93102
val parser = new Parser(pattern)
94103
val node = parser.parseTopLevel()
95104
node.setNewGroup(1)
96105
val allMatchingPattern = node.buildRegex(parser.groupNodeMap)
97106
val jsRegExpForFind = new js.RegExp(allMatchingPattern, flags + "g")
98107
val jsRegExpForMatches = new js.RegExp("^" + allMatchingPattern + "$", flags)
99-
new GroupStartMapper(pattern, flags, node, parser.parsedGroupCount,
108+
new IndicesBuilder(pattern, flags, node, parser.parsedGroupCount,
100109
jsRegExpForFind, jsRegExpForMatches)
101110
}
102111

@@ -154,16 +163,16 @@ private[regex] object GroupStartMapper {
154163
* `end`, while other nodes propagate the `start`.
155164
*/
156165
def propagate(matchResult: js.RegExp.ExecResult,
157-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit
166+
indices: IndicesArray, start: Int, end: Int): Unit
158167

159168
/** Propagates the appropriate positions to the descendants of this node
160169
* from its end position.
161170
*/
162171
final def propagateFromEnd(matchResult: js.RegExp.ExecResult,
163-
groupStartMap: js.Array[Int], end: Int): Unit = {
172+
indices: IndicesArray, end: Int): Unit = {
164173

165174
val start = matchResult(newGroup).fold(-1)(matched => end - matched.length)
166-
propagate(matchResult, groupStartMap, start, end)
175+
propagate(matchResult, indices, start, end)
167176
}
168177

169178
/** Propagates the appropriate positions to the descendants of this node
@@ -172,10 +181,10 @@ private[regex] object GroupStartMapper {
172181
* @return the end position of this node, as a convenience for `SequenceNode.propagate`
173182
*/
174183
final def propagateFromStart(matchResult: js.RegExp.ExecResult,
175-
groupStartMap: js.Array[Int], start: Int): Int = {
184+
indices: IndicesArray, start: Int): Int = {
176185

177186
val end = matchResult(newGroup).fold(-1)(matched => start + matched.length)
178-
propagate(matchResult, groupStartMap, start, end)
187+
propagate(matchResult, indices, start, end)
179188
end
180189
}
181190
}
@@ -189,15 +198,15 @@ private[regex] object GroupStartMapper {
189198
"(" + inner.buildRegex(groupNodeMap) + ")"
190199

191200
def propagate(matchResult: js.RegExp.ExecResult,
192-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
201+
indices: IndicesArray, start: Int, end: Int): Unit = {
193202
/* #3901: A GroupNode within a negative look-ahead node may receive
194203
* `start != -1` from above, yet not match anything itself. We must
195204
* always keep the default `-1` if this group node does not match
196205
* anything.
197206
*/
198207
if (matchResult(newGroup).isDefined)
199-
groupStartMap(number) = start
200-
inner.propagate(matchResult, groupStartMap, start, end)
208+
indices(number) = js.Tuple2(start, end)
209+
inner.propagate(matchResult, indices, start, end)
201210
}
202211
}
203212

@@ -216,11 +225,11 @@ private[regex] object GroupStartMapper {
216225
"((" + indicator + inner.buildRegex(groupNodeMap) + "))"
217226

218227
def propagate(matchResult: js.RegExp.ExecResult,
219-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
228+
indices: IndicesArray, start: Int, end: Int): Unit = {
220229
if (isLookBehind)
221-
inner.propagateFromEnd(matchResult, groupStartMap, end)
230+
inner.propagateFromEnd(matchResult, indices, end)
222231
else
223-
inner.propagateFromStart(matchResult, groupStartMap, start)
232+
inner.propagateFromStart(matchResult, indices, start)
224233
}
225234
}
226235

@@ -235,8 +244,8 @@ private[regex] object GroupStartMapper {
235244
"(" + inner.buildRegex(groupNodeMap) + repeater + ")"
236245

237246
def propagate(matchResult: js.RegExp.ExecResult,
238-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
239-
inner.propagateFromEnd(matchResult, groupStartMap, end)
247+
indices: IndicesArray, start: Int, end: Int): Unit = {
248+
inner.propagateFromEnd(matchResult, indices, end)
240249
}
241250
}
242251

@@ -246,7 +255,7 @@ private[regex] object GroupStartMapper {
246255
"(" + regex + ")"
247256

248257
def propagate(matchResult: js.RegExp.ExecResult,
249-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
258+
indices: IndicesArray, start: Int, end: Int): Unit = {
250259
// nothing to do
251260
}
252261
}
@@ -261,7 +270,7 @@ private[regex] object GroupStartMapper {
261270
}
262271

263272
def propagate(matchResult: js.RegExp.ExecResult,
264-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
273+
indices: IndicesArray, start: Int, end: Int): Unit = {
265274
// nothing to do
266275
}
267276
}
@@ -291,13 +300,13 @@ private[regex] object GroupStartMapper {
291300
}
292301

293302
def propagate(matchResult: js.RegExp.ExecResult,
294-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
303+
indices: IndicesArray, start: Int, end: Int): Unit = {
295304
val len = sequence.length
296305
var i = 0
297306
var nextStart = start
298307
while (i != len) {
299308
nextStart =
300-
sequence(i).propagateFromStart(matchResult, groupStartMap, nextStart)
309+
sequence(i).propagateFromStart(matchResult, indices, nextStart)
301310
i += 1
302311
}
303312
}
@@ -332,11 +341,11 @@ private[regex] object GroupStartMapper {
332341
}
333342

334343
def propagate(matchResult: js.RegExp.ExecResult,
335-
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
344+
indices: IndicesArray, start: Int, end: Int): Unit = {
336345
val len = alternatives.length
337346
var i = 0
338347
while (i != len) {
339-
alternatives(i).propagate(matchResult, groupStartMap, start, end)
348+
alternatives(i).propagate(matchResult, indices, start, end)
340349
i += 1
341350
}
342351
}

javalib/src/main/scala/java/util/regex/Matcher.scala

Lines changed: 18 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212

1313
package java.util.regex
1414

15-
import scala.language.implicitConversions
16-
1715
import scala.annotation.switch
1816

1917
import scala.scalajs.js
2018

19+
import Pattern.IndicesArray
20+
2121
final class Matcher private[regex] (
2222
private var pattern0: Pattern, private var input0: String)
2323
extends AnyRef with MatchResult {
@@ -67,7 +67,6 @@ final class Matcher private[regex] (
6767
position = inputstr.length() + 1
6868
}
6969
lastMatchIsForMatches = false
70-
startOfGroupCache = null
7170
lastMatch ne null
7271
}
7372

@@ -151,7 +150,6 @@ final class Matcher private[regex] (
151150
position = 0
152151
lastMatch = null
153152
appendPos = 0
154-
startOfGroupCache = null
155153
this
156154
}
157155

@@ -172,7 +170,6 @@ final class Matcher private[regex] (
172170
// note that `position` and `appendPos` are left unchanged
173171
pattern0 = pattern
174172
lastMatch = null
175-
startOfGroupCache = null
176173
this
177174
}
178175

@@ -190,11 +187,11 @@ final class Matcher private[regex] (
190187
def end(): Int = start() + group().length
191188
def group(): String = ensureLastMatch(0).get
192189

193-
private def startInternal(compiledGroup: Int): Int = {
194-
val s = startOfGroup(compiledGroup)
195-
if (s == -1) -1
196-
else s + regionStart()
197-
}
190+
private def indices: IndicesArray =
191+
pattern().getIndices(ensureLastMatch, lastMatchIsForMatches)
192+
193+
private def startInternal(compiledGroup: Int): Int =
194+
indices(compiledGroup).fold(-1)(_._1 + regionStart())
198195

199196
def start(group: Int): Int = {
200197
if (group == 0) start()
@@ -204,11 +201,8 @@ final class Matcher private[regex] (
204201
def start(name: String): Int =
205202
startInternal(pattern().namedGroup(name))
206203

207-
private def endInternal(compiledGroup: Int): Int = {
208-
val s = startOfGroup(compiledGroup)
209-
if (s == -1) -1
210-
else s + ensureLastMatch(compiledGroup).get.length + regionStart()
211-
}
204+
private def endInternal(compiledGroup: Int): Int =
205+
indices(compiledGroup).fold(-1)(_._2 + regionStart())
212206

213207
def end(group: Int): Int =
214208
if (group == 0) end()
@@ -225,10 +219,8 @@ final class Matcher private[regex] (
225219

226220
// Seal the state
227221

228-
def toMatchResult(): MatchResult = {
229-
new SealedResult(inputstr, lastMatch, lastMatchIsForMatches, pattern(),
230-
regionStart(), startOfGroupCache)
231-
}
222+
def toMatchResult(): MatchResult =
223+
new SealedResult(lastMatch, lastMatchIsForMatches, pattern(), regionStart())
232224

233225
// Other query state methods
234226

@@ -255,16 +247,6 @@ final class Matcher private[regex] (
255247

256248
def hasAnchoringBounds(): Boolean = true
257249
//def useAnchoringBounds(b: Boolean): Matcher
258-
259-
// Lazily computed by `startOfGroup`, reset every time `lastMatch` changes
260-
private var startOfGroupCache: js.Array[Int] = _
261-
262-
/** Returns a mapping from the group number to the respective start position. */
263-
private def startOfGroup: js.Array[Int] = {
264-
if (startOfGroupCache eq null)
265-
startOfGroupCache = pattern0.groupStartMapper(lastMatchIsForMatches, inputstr, ensureLastMatch.index)
266-
startOfGroupCache
267-
}
268250
}
269251

270252
object Matcher {
@@ -282,10 +264,8 @@ object Matcher {
282264
result
283265
}
284266

285-
private final class SealedResult(inputstr: String,
286-
lastMatch: js.RegExp.ExecResult, lastMatchIsForMatches: Boolean,
287-
pattern: Pattern, regionStart: Int,
288-
private var startOfGroupCache: js.Array[Int])
267+
private final class SealedResult(lastMatch: js.RegExp.ExecResult,
268+
lastMatchIsForMatches: Boolean, pattern: Pattern, regionStart: Int)
289269
extends MatchResult {
290270

291271
def groupCount(): Int = pattern.groupCount
@@ -294,36 +274,18 @@ object Matcher {
294274
def end(): Int = start() + group().length
295275
def group(): String = ensureLastMatch(0).get
296276

297-
private def startOfGroup: js.Array[Int] = {
298-
if (startOfGroupCache eq null)
299-
startOfGroupCache = pattern.groupStartMapper(lastMatchIsForMatches, inputstr, ensureLastMatch.index)
300-
startOfGroupCache
301-
}
277+
private def indices: IndicesArray =
278+
pattern.getIndices(ensureLastMatch, lastMatchIsForMatches)
302279

303280
/* Note that MatchResult does *not* define the named versions of `group`,
304281
* `start` and `end`, so we don't have them here either.
305282
*/
306283

307-
private def startInternal(compiledGroup: Int): Int = {
308-
val s = startOfGroup(compiledGroup)
309-
if (s == -1) -1
310-
else s + regionStart
311-
}
312-
313-
def start(group: Int): Int = {
314-
if (group == 0) start()
315-
else startInternal(pattern.numberedGroup(group))
316-
}
317-
318-
private def endInternal(compiledGroup: Int): Int = {
319-
val s = startOfGroup(compiledGroup)
320-
if (s == -1) -1
321-
else s + ensureLastMatch(compiledGroup).get.length + regionStart
322-
}
284+
def start(group: Int): Int =
285+
indices(pattern.numberedGroup(group)).fold(-1)(_._1 + regionStart)
323286

324287
def end(group: Int): Int =
325-
if (group == 0) end()
326-
else endInternal(pattern.numberedGroup(group))
288+
indices(pattern.numberedGroup(group)).fold(-1)(_._2 + regionStart)
327289

328290
def group(group: Int): String =
329291
ensureLastMatch(pattern.numberedGroup(group)).orNull

0 commit comments

Comments
 (0)