Skip to content

Commit 762c51d

Browse files
committed
Regex: Reuse the same RegExp for all the Matcher's of a Pattern.
Previously, each `Matcher` had its own copy of a "blueprint" `RegExp`, to isolate its `lastIndex` state. We now explicitly store that state as a separate `position` field in `Matcher`, which we assign to `lastIndex` before `exec`, and read from `lastIndex` after `exec`. This allows to reuse the same `RegExp` for all matchers of a given `Pattern`, and to confine its use to `Pattern.execFind()`.
1 parent 3c58b2c commit 762c51d

File tree

4 files changed

+69
-59
lines changed

4 files changed

+69
-59
lines changed

javalib/src/main/scala/java/util/regex/Matcher.scala

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,12 @@ final class Matcher private[regex] (
2828
def pattern(): Pattern = pattern0
2929

3030
// Configuration (updated manually)
31-
private var regexp = pattern0.newJSRegExp()
3231
private var inputstr = input0.subSequence(regionStart0, regionEnd0).toString
3332

3433
// Match result (updated by successful matches)
34+
private var position: Int = 0 // within `inputstr`, not `input0`
3535
private var lastMatch: js.RegExp.ExecResult = null
3636
private var lastMatchIsForMatches = false
37-
private var canStillFind = true
3837

3938
// Append state (updated by replacement methods)
4039
private var appendPos: Int = 0
@@ -57,22 +56,20 @@ final class Matcher private[regex] (
5756
lastMatch ne null
5857
}
5958

60-
def find(): Boolean = if (canStillFind) {
61-
lastMatch = pattern().execFind(regexp, inputstr)
62-
if (lastMatch ne null) {
63-
if (lastMatch(0).get.isEmpty)
64-
regexp.lastIndex += 1
65-
} else {
66-
canStillFind = false
67-
}
59+
def find(): Boolean = {
60+
val (mtch, end) = pattern().execFind(inputstr, position)
61+
position =
62+
if (mtch ne null) (if (end == mtch.index) end + 1 else end)
63+
else inputstr.length() + 1 // cannot find anymore
64+
lastMatch = mtch
6865
lastMatchIsForMatches = false
6966
startOfGroupCache = null
70-
lastMatch ne null
71-
} else false
67+
mtch ne null
68+
}
7269

7370
def find(start: Int): Boolean = {
7471
reset()
75-
regexp.lastIndex = start
72+
position = start
7673
find()
7774
}
7875

@@ -147,9 +144,8 @@ final class Matcher private[regex] (
147144
// Reset methods
148145

149146
private def resetMatch(): Matcher = {
150-
regexp.lastIndex = 0
147+
position = 0
151148
lastMatch = null
152-
canStillFind = true
153149
appendPos = 0
154150
startOfGroupCache = null
155151
this
@@ -164,10 +160,8 @@ final class Matcher private[regex] (
164160
}
165161

166162
def usePattern(pattern: Pattern): Matcher = {
167-
val prevLastIndex = regexp.lastIndex
163+
// note that `position` and `appendPos` are left unchanged
168164
pattern0 = pattern
169-
regexp = pattern.newJSRegExp()
170-
regexp.lastIndex = prevLastIndex
171165
lastMatch = null
172166
startOfGroupCache = null
173167
this

javalib/src/main/scala/java/util/regex/Pattern.scala

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,18 @@ final class Pattern private[regex] (
3434
@inline private def jsFlagsForFind: String =
3535
jsFlags + (if (sticky && supportsSticky) "gy" else "g")
3636

37-
/** Compile the native RegExp once.
37+
/** The RegExp that is used for `Matcher.find()`.
3838
*
39-
* In `newJSRegExp()`, we clone that native RegExp using
40-
* `new js.RegExp(jsRegExpBlueprint)`, which the JS engine hopefully
41-
* optimizes by reusing the compiled internal representation of the RegExp.
42-
* Otherwise, well, there's not much we can do about it.
39+
* It receives the 'g' flag so that `lastIndex` is taken into acount.
40+
*
41+
* It also receives the 'y' flag if this pattern is sticky and it is
42+
* supported. If it is not supported, its behavior is polyfilled in
43+
* `execFind()`.
44+
*
45+
* Since that RegExp is only used locally within `execFind()`, we can
46+
* always reuse the same instance.
4347
*/
44-
private[this] val jsRegExpBlueprint =
48+
private[this] val jsRegExpForFind =
4549
new js.RegExp(jsPattern, jsFlagsForFind)
4650

4751
/** Another version of the RegExp that is used by `Matcher.matches()`.
@@ -59,33 +63,28 @@ final class Pattern private[regex] (
5963
private[regex] lazy val groupStartMapper: GroupStartMapper =
6064
GroupStartMapper(jsPattern, jsFlags)
6165

62-
private[regex] def newJSRegExp(): js.RegExp = {
63-
val r = new js.RegExp(jsRegExpBlueprint)
64-
if (r ne jsRegExpBlueprint) {
65-
r
66-
} else {
67-
/* Workaround for the PhantomJS 1.x bug
68-
* https://github.com/ariya/phantomjs/issues/11494
69-
* which causes new js.RegExp(jsRegExpBlueprint) to return the same
70-
* object, rather than a new one.
71-
* In that case, we reconstruct a new js.RegExp from scratch.
72-
*/
73-
new js.RegExp(jsPattern, jsFlagsForFind)
74-
}
75-
}
76-
7766
private[regex] def execMatches(input: String): js.RegExp.ExecResult =
7867
jsRegExpForMatches.exec(input)
7968

80-
private[regex] def execFind(regexp: js.RegExp, input: String): js.RegExp.ExecResult = {
69+
@inline // to stack-allocate the tuple
70+
private[regex] def execFind(input: String, start: Int): (js.RegExp.ExecResult, Int) = {
71+
val mtch = execFindInternal(input, start)
72+
val end = jsRegExpForFind.lastIndex
73+
(mtch, end)
74+
}
75+
76+
private def execFindInternal(input: String, start: Int): js.RegExp.ExecResult = {
77+
val regexp = jsRegExpForFind
78+
8179
if (!supportsSticky && sticky) {
82-
val start = regexp.lastIndex
80+
regexp.lastIndex = start
8381
val mtch = regexp.exec(input)
8482
if (mtch == null || mtch.index > start)
8583
null
8684
else
8785
mtch
8886
} else if (supportsUnicode) {
87+
regexp.lastIndex = start
8988
regexp.exec(input)
9089
} else {
9190
/* When the native RegExp does not support the 'u' flag (introduced in
@@ -101,8 +100,8 @@ final class Pattern private[regex] (
101100
* matches that start in the middle of a surrogate pair.
102101
*/
103102
@tailrec
104-
def loop(): js.RegExp.ExecResult = {
105-
val start = regexp.lastIndex
103+
def loop(start: Int): js.RegExp.ExecResult = {
104+
regexp.lastIndex = start
106105
val mtch = regexp.exec(input)
107106
if (mtch == null) {
108107
null
@@ -111,14 +110,13 @@ final class Pattern private[regex] (
111110
if (index > start && index < input.length() &&
112111
Character.isLowSurrogate(input.charAt(index)) &&
113112
Character.isHighSurrogate(input.charAt(index - 1))) {
114-
regexp.lastIndex = index + 1
115-
loop()
113+
loop(index + 1)
116114
} else {
117115
mtch
118116
}
119117
}
120118
}
121-
loop()
119+
loop(start)
122120
}
123121
}
124122

linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,9 @@ class LibrarySizeTest {
7070
)
7171

7272
testLinkedSizes(
73-
expectedFastLinkSize = 188076,
74-
expectedFullLinkSizeWithoutClosure = 175219,
75-
expectedFullLinkSizeWithClosure = 32036,
73+
expectedFastLinkSize = 187486,
74+
expectedFullLinkSizeWithoutClosure = 174649,
75+
expectedFullLinkSizeWithClosure = 31827,
7676
classDefs,
7777
moduleInitializers = MainTestModuleInitializers
7878
)

test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexMatcherTest.scala

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -465,18 +465,36 @@ class RegexMatcherTest {
465465
}
466466

467467
@Test def zeroLengthMatches(): Unit = {
468-
val pat = Pattern.compile("a*?")
469-
val mat = pat.matcher("aaaaa")
470-
for (i <- 0 to 5) {
471-
assertTrue(mat.find())
472-
assertEquals(i, mat.start)
473-
assertEquals(i, mat.end)
474-
}
468+
@noinline
469+
def assertZeroLengthMatches(pattern: String, input: String, jvmBug: Boolean, positions: Int*): Unit = {
470+
val m = Pattern.compile(pattern).matcher(input)
471+
for (pos <- positions) {
472+
val msg = s"$pattern in $input at $pos"
473+
assertTrue(msg, m.find())
474+
assertEquals(msg, pos, m.start())
475+
assertEquals(msg, pos, m.end())
476+
}
477+
478+
/* Check that subsequent `find()`s return `false`.
479+
* Do it 3 times because internal conditions vary the first two times.
480+
*/
481+
val msg = s"$pattern in $input at end"
482+
assertFalse(msg, m.find())
475483

476-
// Make sure we don't suddenly re-match
477-
for (i <- 0 to 5) {
478-
assertFalse(mat.find())
484+
if (!(jvmBug && executingInJVM)) {
485+
assertFalse(msg, m.find())
486+
assertFalse(msg, m.find())
487+
}
479488
}
489+
490+
assertZeroLengthMatches("a*?", "aaaaa", false, (0 to 5): _*)
491+
assertZeroLengthMatches("(?!b)", "abcdba", false, 0, 2, 3, 5, 6)
492+
493+
/* Unexplicably, on the following inputs, the JVM alternates between
494+
* `false` and `true` after the end is reached.
495+
*/
496+
assertZeroLengthMatches("(?=b)", "abcdbab", true, 1, 4, 6)
497+
assertZeroLengthMatches("(?=b)", "abcdba", true, 1, 4)
480498
}
481499

482500
@Test def inPatternFlags_Issue997(): Unit = {

0 commit comments

Comments
 (0)