From 762c51d2823b0c415858177e0d5455afffc9bb85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Fri, 23 Apr 2021 10:00:40 +0200 Subject: [PATCH 1/5] Regex: Reuse the same RegExp for all the Matcher's of a Pattern. Previously, each `Matcher` had its own copy of a "blueprint" `RegExp`, to isolate its `lastIndex` state. We now explicitly store that state as a separate `position` field in `Matcher`, which we assign to `lastIndex` before `exec`, and read from `lastIndex` after `exec`. This allows to reuse the same `RegExp` for all matchers of a given `Pattern`, and to confine its use to `Pattern.execFind()`. --- .../main/scala/java/util/regex/Matcher.scala | 30 +++++------ .../main/scala/java/util/regex/Pattern.scala | 54 +++++++++---------- .../org/scalajs/linker/LibrarySizeTest.scala | 6 +-- .../javalib/util/regex/RegexMatcherTest.scala | 38 +++++++++---- 4 files changed, 69 insertions(+), 59 deletions(-) diff --git a/javalib/src/main/scala/java/util/regex/Matcher.scala b/javalib/src/main/scala/java/util/regex/Matcher.scala index 106ba6c3d2..88f1f90a65 100644 --- a/javalib/src/main/scala/java/util/regex/Matcher.scala +++ b/javalib/src/main/scala/java/util/regex/Matcher.scala @@ -28,13 +28,12 @@ final class Matcher private[regex] ( def pattern(): Pattern = pattern0 // Configuration (updated manually) - private var regexp = pattern0.newJSRegExp() private var inputstr = input0.subSequence(regionStart0, regionEnd0).toString // Match result (updated by successful matches) + private var position: Int = 0 // within `inputstr`, not `input0` private var lastMatch: js.RegExp.ExecResult = null private var lastMatchIsForMatches = false - private var canStillFind = true // Append state (updated by replacement methods) private var appendPos: Int = 0 @@ -57,22 +56,20 @@ final class Matcher private[regex] ( lastMatch ne null } - def find(): Boolean = if (canStillFind) { - lastMatch = pattern().execFind(regexp, inputstr) - if (lastMatch ne null) { - if (lastMatch(0).get.isEmpty) - regexp.lastIndex += 1 - } else { - canStillFind = false - } + def find(): Boolean = { + val (mtch, end) = pattern().execFind(inputstr, position) + position = + if (mtch ne null) (if (end == mtch.index) end + 1 else end) + else inputstr.length() + 1 // cannot find anymore + lastMatch = mtch lastMatchIsForMatches = false startOfGroupCache = null - lastMatch ne null - } else false + mtch ne null + } def find(start: Int): Boolean = { reset() - regexp.lastIndex = start + position = start find() } @@ -147,9 +144,8 @@ final class Matcher private[regex] ( // Reset methods private def resetMatch(): Matcher = { - regexp.lastIndex = 0 + position = 0 lastMatch = null - canStillFind = true appendPos = 0 startOfGroupCache = null this @@ -164,10 +160,8 @@ final class Matcher private[regex] ( } def usePattern(pattern: Pattern): Matcher = { - val prevLastIndex = regexp.lastIndex + // note that `position` and `appendPos` are left unchanged pattern0 = pattern - regexp = pattern.newJSRegExp() - regexp.lastIndex = prevLastIndex lastMatch = null startOfGroupCache = null this diff --git a/javalib/src/main/scala/java/util/regex/Pattern.scala b/javalib/src/main/scala/java/util/regex/Pattern.scala index 7c1afbaf0d..70b7910ee2 100644 --- a/javalib/src/main/scala/java/util/regex/Pattern.scala +++ b/javalib/src/main/scala/java/util/regex/Pattern.scala @@ -34,14 +34,18 @@ final class Pattern private[regex] ( @inline private def jsFlagsForFind: String = jsFlags + (if (sticky && supportsSticky) "gy" else "g") - /** Compile the native RegExp once. + /** The RegExp that is used for `Matcher.find()`. * - * In `newJSRegExp()`, we clone that native RegExp using - * `new js.RegExp(jsRegExpBlueprint)`, which the JS engine hopefully - * optimizes by reusing the compiled internal representation of the RegExp. - * Otherwise, well, there's not much we can do about it. + * It receives the 'g' flag so that `lastIndex` is taken into acount. + * + * It also receives the 'y' flag if this pattern is sticky and it is + * supported. If it is not supported, its behavior is polyfilled in + * `execFind()`. + * + * Since that RegExp is only used locally within `execFind()`, we can + * always reuse the same instance. */ - private[this] val jsRegExpBlueprint = + private[this] val jsRegExpForFind = new js.RegExp(jsPattern, jsFlagsForFind) /** Another version of the RegExp that is used by `Matcher.matches()`. @@ -59,33 +63,28 @@ final class Pattern private[regex] ( private[regex] lazy val groupStartMapper: GroupStartMapper = GroupStartMapper(jsPattern, jsFlags) - private[regex] def newJSRegExp(): js.RegExp = { - val r = new js.RegExp(jsRegExpBlueprint) - if (r ne jsRegExpBlueprint) { - r - } else { - /* Workaround for the PhantomJS 1.x bug - * https://github.com/ariya/phantomjs/issues/11494 - * which causes new js.RegExp(jsRegExpBlueprint) to return the same - * object, rather than a new one. - * In that case, we reconstruct a new js.RegExp from scratch. - */ - new js.RegExp(jsPattern, jsFlagsForFind) - } - } - private[regex] def execMatches(input: String): js.RegExp.ExecResult = jsRegExpForMatches.exec(input) - private[regex] def execFind(regexp: js.RegExp, input: String): js.RegExp.ExecResult = { + @inline // to stack-allocate the tuple + private[regex] def execFind(input: String, start: Int): (js.RegExp.ExecResult, Int) = { + val mtch = execFindInternal(input, start) + val end = jsRegExpForFind.lastIndex + (mtch, end) + } + + private def execFindInternal(input: String, start: Int): js.RegExp.ExecResult = { + val regexp = jsRegExpForFind + if (!supportsSticky && sticky) { - val start = regexp.lastIndex + regexp.lastIndex = start val mtch = regexp.exec(input) if (mtch == null || mtch.index > start) null else mtch } else if (supportsUnicode) { + regexp.lastIndex = start regexp.exec(input) } else { /* When the native RegExp does not support the 'u' flag (introduced in @@ -101,8 +100,8 @@ final class Pattern private[regex] ( * matches that start in the middle of a surrogate pair. */ @tailrec - def loop(): js.RegExp.ExecResult = { - val start = regexp.lastIndex + def loop(start: Int): js.RegExp.ExecResult = { + regexp.lastIndex = start val mtch = regexp.exec(input) if (mtch == null) { null @@ -111,14 +110,13 @@ final class Pattern private[regex] ( if (index > start && index < input.length() && Character.isLowSurrogate(input.charAt(index)) && Character.isHighSurrogate(input.charAt(index - 1))) { - regexp.lastIndex = index + 1 - loop() + loop(index + 1) } else { mtch } } } - loop() + loop(start) } } diff --git a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala index d95543fd80..ea5b06073d 100644 --- a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala +++ b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala @@ -70,9 +70,9 @@ class LibrarySizeTest { ) testLinkedSizes( - expectedFastLinkSize = 188076, - expectedFullLinkSizeWithoutClosure = 175219, - expectedFullLinkSizeWithClosure = 32036, + expectedFastLinkSize = 187486, + expectedFullLinkSizeWithoutClosure = 174649, + expectedFullLinkSizeWithClosure = 31827, classDefs, moduleInitializers = MainTestModuleInitializers ) diff --git a/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexMatcherTest.scala b/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexMatcherTest.scala index dde2b04330..f9ebd775ed 100644 --- a/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexMatcherTest.scala +++ b/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexMatcherTest.scala @@ -465,18 +465,36 @@ class RegexMatcherTest { } @Test def zeroLengthMatches(): Unit = { - val pat = Pattern.compile("a*?") - val mat = pat.matcher("aaaaa") - for (i <- 0 to 5) { - assertTrue(mat.find()) - assertEquals(i, mat.start) - assertEquals(i, mat.end) - } + @noinline + def assertZeroLengthMatches(pattern: String, input: String, jvmBug: Boolean, positions: Int*): Unit = { + val m = Pattern.compile(pattern).matcher(input) + for (pos <- positions) { + val msg = s"$pattern in $input at $pos" + assertTrue(msg, m.find()) + assertEquals(msg, pos, m.start()) + assertEquals(msg, pos, m.end()) + } + + /* Check that subsequent `find()`s return `false`. + * Do it 3 times because internal conditions vary the first two times. + */ + val msg = s"$pattern in $input at end" + assertFalse(msg, m.find()) - // Make sure we don't suddenly re-match - for (i <- 0 to 5) { - assertFalse(mat.find()) + if (!(jvmBug && executingInJVM)) { + assertFalse(msg, m.find()) + assertFalse(msg, m.find()) + } } + + assertZeroLengthMatches("a*?", "aaaaa", false, (0 to 5): _*) + assertZeroLengthMatches("(?!b)", "abcdba", false, 0, 2, 3, 5, 6) + + /* Unexplicably, on the following inputs, the JVM alternates between + * `false` and `true` after the end is reached. + */ + assertZeroLengthMatches("(?=b)", "abcdbab", true, 1, 4, 6) + assertZeroLengthMatches("(?=b)", "abcdba", true, 1, 4) } @Test def inPatternFlags_Issue997(): Unit = { From 0896ff4ef0dc15d4466b06150c0ab2555aef1d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Sat, 22 May 2021 19:07:30 +0200 Subject: [PATCH 2/5] Regex: Only use subSequence in the `region()` method. Previously, the constructor of `Matcher` accepted region params, but the only call site always used the full string. This commit confines the use of `subSequence` to the `region()` method. --- .../main/scala/java/util/regex/Matcher.scala | 19 ++++++++++++------- .../main/scala/java/util/regex/Pattern.scala | 2 +- .../org/scalajs/linker/LibrarySizeTest.scala | 6 +++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/javalib/src/main/scala/java/util/regex/Matcher.scala b/javalib/src/main/scala/java/util/regex/Matcher.scala index 88f1f90a65..d66ba712c8 100644 --- a/javalib/src/main/scala/java/util/regex/Matcher.scala +++ b/javalib/src/main/scala/java/util/regex/Matcher.scala @@ -19,16 +19,17 @@ import scala.annotation.switch import scala.scalajs.js final class Matcher private[regex] ( - private var pattern0: Pattern, private var input0: CharSequence, - private var regionStart0: Int, private var regionEnd0: Int) + private var pattern0: Pattern, private var input0: CharSequence) extends AnyRef with MatchResult { import Matcher._ def pattern(): Pattern = pattern0 - // Configuration (updated manually) - private var inputstr = input0.subSequence(regionStart0, regionEnd0).toString + // Region configuration (updated by reset() and region()) + private var regionStart0 = 0 + private var regionEnd0 = input0.length() + private var inputstr = input0.toString() // Match result (updated by successful matches) private var position: Int = 0 // within `inputstr`, not `input0` @@ -151,8 +152,12 @@ final class Matcher private[regex] ( this } - def reset(): Matcher = - region(0, input0.length()) + def reset(): Matcher = { + regionStart0 = 0 + regionEnd0 = input0.length() + inputstr = input0.toString() + resetMatch() + } def reset(input: CharSequence): Matcher = { input0 = input @@ -229,7 +234,7 @@ final class Matcher private[regex] ( // Similar difficulties as with hitEnd() //def requireEnd(): Boolean - // Stub methods for region management + // Region management def regionStart(): Int = regionStart0 def regionEnd(): Int = regionEnd0 diff --git a/javalib/src/main/scala/java/util/regex/Pattern.scala b/javalib/src/main/scala/java/util/regex/Pattern.scala index 70b7910ee2..32e7045997 100644 --- a/javalib/src/main/scala/java/util/regex/Pattern.scala +++ b/javalib/src/main/scala/java/util/regex/Pattern.scala @@ -140,7 +140,7 @@ final class Pattern private[regex] ( override def toString(): String = pattern() def matcher(input: CharSequence): Matcher = - new Matcher(this, input, 0, input.length) + new Matcher(this, input) def split(input: CharSequence): Array[String] = split(input, 0) diff --git a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala index ea5b06073d..678124a20d 100644 --- a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala +++ b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala @@ -70,9 +70,9 @@ class LibrarySizeTest { ) testLinkedSizes( - expectedFastLinkSize = 187486, - expectedFullLinkSizeWithoutClosure = 174649, - expectedFullLinkSizeWithClosure = 31827, + expectedFastLinkSize = 186816, + expectedFullLinkSizeWithoutClosure = 173972, + expectedFullLinkSizeWithClosure = 31629, classDefs, moduleInitializers = MainTestModuleInitializers ) From 72d5ed96f0116516d5b01cb4e4c787d7015e4337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Fri, 23 Apr 2021 11:49:58 +0200 Subject: [PATCH 3/5] Regex: Turn CharSequence into Strings upon entry in our implementation. Since we have to convert the input `CharSequence`s to `String`s anyway to be able to give them to `js.RegExp`, it is better to do that upon entry into our implementation. We can force the `toString()`s to be at the call site, which can then be inlined and removed when the arguments are already Strings, which should be the vast majority of use cases. --- .../src/main/scala/java/util/regex/Matcher.scala | 11 ++++++----- .../src/main/scala/java/util/regex/Pattern.scala | 14 +++++++++++--- .../scala/org/scalajs/linker/LibrarySizeTest.scala | 6 +++--- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/javalib/src/main/scala/java/util/regex/Matcher.scala b/javalib/src/main/scala/java/util/regex/Matcher.scala index d66ba712c8..0bdf7bbde3 100644 --- a/javalib/src/main/scala/java/util/regex/Matcher.scala +++ b/javalib/src/main/scala/java/util/regex/Matcher.scala @@ -19,7 +19,7 @@ import scala.annotation.switch import scala.scalajs.js final class Matcher private[regex] ( - private var pattern0: Pattern, private var input0: CharSequence) + private var pattern0: Pattern, private var input0: String) extends AnyRef with MatchResult { import Matcher._ @@ -29,7 +29,7 @@ final class Matcher private[regex] ( // Region configuration (updated by reset() and region()) private var regionStart0 = 0 private var regionEnd0 = input0.length() - private var inputstr = input0.toString() + private var inputstr = input0 // Match result (updated by successful matches) private var position: Int = 0 // within `inputstr`, not `input0` @@ -155,12 +155,13 @@ final class Matcher private[regex] ( def reset(): Matcher = { regionStart0 = 0 regionEnd0 = input0.length() - inputstr = input0.toString() + inputstr = input0 resetMatch() } + @inline // `input` is almost certainly a String at call site def reset(input: CharSequence): Matcher = { - input0 = input + input0 = input.toString() reset() } @@ -242,7 +243,7 @@ final class Matcher private[regex] ( def region(start: Int, end: Int): Matcher = { regionStart0 = start regionEnd0 = end - inputstr = input0.subSequence(regionStart0, regionEnd0).toString + inputstr = input0.substring(start, end) resetMatch() } diff --git a/javalib/src/main/scala/java/util/regex/Pattern.scala b/javalib/src/main/scala/java/util/regex/Pattern.scala index 32e7045997..6658dacf83 100644 --- a/javalib/src/main/scala/java/util/regex/Pattern.scala +++ b/javalib/src/main/scala/java/util/regex/Pattern.scala @@ -139,15 +139,19 @@ final class Pattern private[regex] ( override def toString(): String = pattern() + @inline // `input` is almost certainly a String at call site def matcher(input: CharSequence): Matcher = - new Matcher(this, input) + new Matcher(this, input.toString()) + @inline // `input` is almost certainly a String at call site def split(input: CharSequence): Array[String] = split(input, 0) - def split(input: CharSequence, limit: Int): Array[String] = { - val inputStr = input.toString + @inline // `input` is almost certainly a String at call site + def split(input: CharSequence, limit: Int): Array[String] = + split(input.toString(), limit) + private def split(inputStr: String, limit: Int): Array[String] = { // If the input string is empty, always return Array("") - #987, #2592 if (inputStr == "") { Array("") @@ -210,7 +214,11 @@ object Pattern { def compile(regex: String): Pattern = compile(regex, 0) + @inline // `input` is almost certainly a String at call site def matches(regex: String, input: CharSequence): Boolean = + matches(regex, input.toString()) + + private def matches(regex: String, input: String): Boolean = compile(regex).matcher(input).matches() def quote(s: String): String = { diff --git a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala index 678124a20d..c43e72d0f3 100644 --- a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala +++ b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala @@ -70,9 +70,9 @@ class LibrarySizeTest { ) testLinkedSizes( - expectedFastLinkSize = 186816, - expectedFullLinkSizeWithoutClosure = 173972, - expectedFullLinkSizeWithClosure = 31629, + expectedFastLinkSize = 186693, + expectedFullLinkSizeWithoutClosure = 173849, + expectedFullLinkSizeWithClosure = 31572, classDefs, moduleInitializers = MainTestModuleInitializers ) From e65aac5c129aa6de745c921a113b70dff38a8206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Mon, 19 Apr 2021 14:01:44 +0200 Subject: [PATCH 4/5] Regex: Store start and end for the indices of matched groups. And `undefined` for unmatched groups. In addition, store the start and end of the whole match at index 0. The indices array is cached directly as a field `indices` on the `MatchResult`. This allows the cache to be better shared between a `Matcher` and a `MatchResult`. This corresponds to the format of the `indices` field of the `regexp-match-indices` ECMAScript proposal, scheduled for inclusion in ECMAScript 2022. --- ...StartMapper.scala => IndicesBuilder.scala} | 94 +++++++++++-------- .../main/scala/java/util/regex/Matcher.scala | 83 +++++----------- .../main/scala/java/util/regex/Pattern.scala | 13 ++- .../org/scalajs/linker/LibrarySizeTest.scala | 6 +- 4 files changed, 90 insertions(+), 106 deletions(-) rename javalib/src/main/scala/java/util/regex/{GroupStartMapper.scala => IndicesBuilder.scala} (86%) diff --git a/javalib/src/main/scala/java/util/regex/GroupStartMapper.scala b/javalib/src/main/scala/java/util/regex/IndicesBuilder.scala similarity index 86% rename from javalib/src/main/scala/java/util/regex/GroupStartMapper.scala rename to javalib/src/main/scala/java/util/regex/IndicesBuilder.scala index d32d890ffc..4b866920b0 100644 --- a/javalib/src/main/scala/java/util/regex/GroupStartMapper.scala +++ b/javalib/src/main/scala/java/util/regex/IndicesBuilder.scala @@ -14,13 +14,18 @@ package java.util.regex import scala.annotation.{tailrec, switch} -import java.util.HashMap - import scala.scalajs.js -/** The goal of a `GroupStartMapper` is to retrieve the start position of each - * group of a matching regular expression where only the strings of the - * matched groups are known. +import Pattern.IndicesArray + +/** The goal of an `IndicesBuilder` is to retrieve the start and end positions + * of each group of a matching regular expression. + * + * This is essentially a polyfill for the 'd' flag of `js.RegExp`, which is + * a Stage 4 proposal scheduled for inclusion in ECMAScript 2022. Without that + * flag, `js.RegExp` only provides the substrings matched by capturing groups, + * but not their positions. We implement the positions on top of that. + * * For that, we use the following observation: * If the regex /A(B)\1/ matches a string at a given index, * then /(A)(B)\2/ matches the same string at the same index. @@ -38,7 +43,7 @@ import scala.scalajs.js * - It computes the start of every group thanks to the groups before it * - It builds and returns the mapping of previous group number -> start * - * The `pattern` that is parsed by `GroupStartMapper` is the *compiled* JS + * The `pattern` that is parsed by `IndicesBuilder` is the *compiled* JS * pattern produced by `PatternCompiler`, not the original Java pattern. This * means that we can simplify a number of things with the knowledge that: * @@ -53,13 +58,13 @@ import scala.scalajs.js * * @author Mikaƫl Mayer */ -private[regex] class GroupStartMapper private (pattern: String, flags: String, - node: GroupStartMapper.Node, groupCount: Int, +private[regex] class IndicesBuilder private (pattern: String, flags: String, + node: IndicesBuilder.Node, groupCount: Int, jsRegExpForFind: js.RegExp, jsRegExpForMatches: js.RegExp) { - import GroupStartMapper._ + import IndicesBuilder._ - def apply(forMatches: Boolean, string: String, index: Int): js.Array[Int] = { + def apply(forMatches: Boolean, string: String, index: Int): IndicesArray = { val regExp = if (forMatches) jsRegExpForMatches else jsRegExpForFind @@ -73,23 +78,34 @@ private[regex] class GroupStartMapper private (pattern: String, flags: String, s"Original pattern '$pattern' with flags '$flags' did match however.") } - // Prepare a `groupStartMap` array with the correct length filled with -1 - val len = groupCount + 1 // index 0 is not used - val groupStartMap = new js.Array[Int](len) - var i = 0 + val start = index // by definition + val end = start + allMatchResult(0).get.length() + + /* Initialize the `indices` array with: + * - `[start, end]` at index 0, which represents the whole match, and + * - `undefined` in the other slots. + * + * We explicitly store `undefined` in the other slots to prevent the array + * from containing *empty* slots. That would make it a sparse array, which + * is less efficient. + */ + val len = groupCount + 1 + val indices = new IndicesArray(len) + indices(0) = js.Tuple2(start, end) + var i = 1 while (i != len) { - groupStartMap(i) = -1 + indices(i) = js.undefined i += 1 } - node.propagateFromStart(allMatchResult, groupStartMap, index) + node.propagate(allMatchResult, indices, start, end) - groupStartMap + indices } } -private[regex] object GroupStartMapper { - def apply(pattern: String, flags: String): GroupStartMapper = { +private[regex] object IndicesBuilder { + def apply(pattern: String, flags: String): IndicesBuilder = { val parser = new Parser(pattern) val node = parser.parseTopLevel() node.setNewGroup(1) @@ -97,7 +113,7 @@ private[regex] object GroupStartMapper { val jsRegExpForFind = new js.RegExp(allMatchingPattern, flags + "g") val jsRegExpForMatches = new js.RegExp(Pattern.wrapJSPatternForMatches(allMatchingPattern), flags) - new GroupStartMapper(pattern, flags, node, parser.parsedGroupCount, + new IndicesBuilder(pattern, flags, node, parser.parsedGroupCount, jsRegExpForFind, jsRegExpForMatches) } @@ -155,16 +171,16 @@ private[regex] object GroupStartMapper { * `end`, while other nodes propagate the `start`. */ def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit + indices: IndicesArray, start: Int, end: Int): Unit /** Propagates the appropriate positions to the descendants of this node * from its end position. */ final def propagateFromEnd(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], end: Int): Unit = { + indices: IndicesArray, end: Int): Unit = { val start = matchResult(newGroup).fold(-1)(matched => end - matched.length) - propagate(matchResult, groupStartMap, start, end) + propagate(matchResult, indices, start, end) } /** Propagates the appropriate positions to the descendants of this node @@ -173,10 +189,10 @@ private[regex] object GroupStartMapper { * @return the end position of this node, as a convenience for `SequenceNode.propagate` */ final def propagateFromStart(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int): Int = { + indices: IndicesArray, start: Int): Int = { val end = matchResult(newGroup).fold(-1)(matched => start + matched.length) - propagate(matchResult, groupStartMap, start, end) + propagate(matchResult, indices, start, end) end } } @@ -190,15 +206,15 @@ private[regex] object GroupStartMapper { "(" + inner.buildRegex(groupNodeMap) + ")" def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { + indices: IndicesArray, start: Int, end: Int): Unit = { /* #3901: A GroupNode within a negative look-ahead node may receive * `start != -1` from above, yet not match anything itself. We must * always keep the default `-1` if this group node does not match * anything. */ if (matchResult(newGroup).isDefined) - groupStartMap(number) = start - inner.propagate(matchResult, groupStartMap, start, end) + indices(number) = js.Tuple2(start, end) + inner.propagate(matchResult, indices, start, end) } } @@ -217,11 +233,11 @@ private[regex] object GroupStartMapper { "((" + indicator + inner.buildRegex(groupNodeMap) + "))" def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { + indices: IndicesArray, start: Int, end: Int): Unit = { if (isLookBehind) - inner.propagateFromEnd(matchResult, groupStartMap, end) + inner.propagateFromEnd(matchResult, indices, end) else - inner.propagateFromStart(matchResult, groupStartMap, start) + inner.propagateFromStart(matchResult, indices, start) } } @@ -236,8 +252,8 @@ private[regex] object GroupStartMapper { "(" + inner.buildRegex(groupNodeMap) + repeater + ")" def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { - inner.propagateFromEnd(matchResult, groupStartMap, end) + indices: IndicesArray, start: Int, end: Int): Unit = { + inner.propagateFromEnd(matchResult, indices, end) } } @@ -247,7 +263,7 @@ private[regex] object GroupStartMapper { "(" + regex + ")" def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { + indices: IndicesArray, start: Int, end: Int): Unit = { // nothing to do } } @@ -262,7 +278,7 @@ private[regex] object GroupStartMapper { } def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { + indices: IndicesArray, start: Int, end: Int): Unit = { // nothing to do } } @@ -292,13 +308,13 @@ private[regex] object GroupStartMapper { } def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { + indices: IndicesArray, start: Int, end: Int): Unit = { val len = sequence.length var i = 0 var nextStart = start while (i != len) { nextStart = - sequence(i).propagateFromStart(matchResult, groupStartMap, nextStart) + sequence(i).propagateFromStart(matchResult, indices, nextStart) i += 1 } } @@ -333,11 +349,11 @@ private[regex] object GroupStartMapper { } def propagate(matchResult: js.RegExp.ExecResult, - groupStartMap: js.Array[Int], start: Int, end: Int): Unit = { + indices: IndicesArray, start: Int, end: Int): Unit = { val len = alternatives.length var i = 0 while (i != len) { - alternatives(i).propagate(matchResult, groupStartMap, start, end) + alternatives(i).propagate(matchResult, indices, start, end) i += 1 } } diff --git a/javalib/src/main/scala/java/util/regex/Matcher.scala b/javalib/src/main/scala/java/util/regex/Matcher.scala index 0bdf7bbde3..4effe7de81 100644 --- a/javalib/src/main/scala/java/util/regex/Matcher.scala +++ b/javalib/src/main/scala/java/util/regex/Matcher.scala @@ -12,12 +12,12 @@ package java.util.regex -import scala.language.implicitConversions - import scala.annotation.switch import scala.scalajs.js +import Pattern.IndicesArray + final class Matcher private[regex] ( private var pattern0: Pattern, private var input0: String) extends AnyRef with MatchResult { @@ -64,7 +64,6 @@ final class Matcher private[regex] ( else inputstr.length() + 1 // cannot find anymore lastMatch = mtch lastMatchIsForMatches = false - startOfGroupCache = null mtch ne null } @@ -148,7 +147,6 @@ final class Matcher private[regex] ( position = 0 lastMatch = null appendPos = 0 - startOfGroupCache = null this } @@ -169,7 +167,6 @@ final class Matcher private[regex] ( // note that `position` and `appendPos` are left unchanged pattern0 = pattern lastMatch = null - startOfGroupCache = null this } @@ -187,29 +184,23 @@ final class Matcher private[regex] ( def end(): Int = start() + group().length def group(): String = ensureLastMatch(0).get - private def startInternal(compiledGroup: Int): Int = { - val s = startOfGroup(compiledGroup) - if (s == -1) -1 - else s + regionStart() - } + private def indices: IndicesArray = + pattern().getIndices(ensureLastMatch, lastMatchIsForMatches) - def start(group: Int): Int = { - if (group == 0) start() - else startInternal(pattern().numberedGroup(group)) - } + private def startInternal(compiledGroup: Int): Int = + indices(compiledGroup).fold(-1)(_._1 + regionStart()) + + def start(group: Int): Int = + startInternal(pattern().numberedGroup(group)) def start(name: String): Int = startInternal(pattern().namedGroup(name)) - private def endInternal(compiledGroup: Int): Int = { - val s = startOfGroup(compiledGroup) - if (s == -1) -1 - else s + ensureLastMatch(compiledGroup).get.length + regionStart() - } + private def endInternal(compiledGroup: Int): Int = + indices(compiledGroup).fold(-1)(_._2 + regionStart()) def end(group: Int): Int = - if (group == 0) end() - else endInternal(pattern().numberedGroup(group)) + endInternal(pattern().numberedGroup(group)) def end(name: String): Int = endInternal(pattern().namedGroup(name)) @@ -222,10 +213,8 @@ final class Matcher private[regex] ( // Seal the state - def toMatchResult(): MatchResult = { - new SealedResult(inputstr, lastMatch, lastMatchIsForMatches, pattern(), - regionStart(), startOfGroupCache) - } + def toMatchResult(): MatchResult = + new SealedResult(lastMatch, lastMatchIsForMatches, pattern(), regionStart()) // Other query state methods @@ -252,16 +241,6 @@ final class Matcher private[regex] ( def hasAnchoringBounds(): Boolean = true //def useAnchoringBounds(b: Boolean): Matcher - - // Lazily computed by `startOfGroup`, reset every time `lastMatch` changes - private var startOfGroupCache: js.Array[Int] = _ - - /** Returns a mapping from the group number to the respective start position. */ - private def startOfGroup: js.Array[Int] = { - if (startOfGroupCache eq null) - startOfGroupCache = pattern0.groupStartMapper(lastMatchIsForMatches, inputstr, ensureLastMatch.index) - startOfGroupCache - } } object Matcher { @@ -279,10 +258,8 @@ object Matcher { result } - private final class SealedResult(inputstr: String, - lastMatch: js.RegExp.ExecResult, lastMatchIsForMatches: Boolean, - pattern: Pattern, regionStart: Int, - private var startOfGroupCache: js.Array[Int]) + private final class SealedResult(lastMatch: js.RegExp.ExecResult, + lastMatchIsForMatches: Boolean, pattern: Pattern, regionStart: Int) extends MatchResult { def groupCount(): Int = pattern.groupCount @@ -291,36 +268,18 @@ object Matcher { def end(): Int = start() + group().length def group(): String = ensureLastMatch(0).get - private def startOfGroup: js.Array[Int] = { - if (startOfGroupCache eq null) - startOfGroupCache = pattern.groupStartMapper(lastMatchIsForMatches, inputstr, ensureLastMatch.index) - startOfGroupCache - } + private def indices: IndicesArray = + pattern.getIndices(ensureLastMatch, lastMatchIsForMatches) /* Note that MatchResult does *not* define the named versions of `group`, * `start` and `end`, so we don't have them here either. */ - private def startInternal(compiledGroup: Int): Int = { - val s = startOfGroup(compiledGroup) - if (s == -1) -1 - else s + regionStart - } - - def start(group: Int): Int = { - if (group == 0) start() - else startInternal(pattern.numberedGroup(group)) - } - - private def endInternal(compiledGroup: Int): Int = { - val s = startOfGroup(compiledGroup) - if (s == -1) -1 - else s + ensureLastMatch(compiledGroup).get.length + regionStart - } + def start(group: Int): Int = + indices(pattern.numberedGroup(group)).fold(-1)(_._1 + regionStart) def end(group: Int): Int = - if (group == 0) end() - else endInternal(pattern.numberedGroup(group)) + indices(pattern.numberedGroup(group)).fold(-1)(_._2 + regionStart) def group(group: Int): String = ensureLastMatch(pattern.numberedGroup(group)).orNull diff --git a/javalib/src/main/scala/java/util/regex/Pattern.scala b/javalib/src/main/scala/java/util/regex/Pattern.scala index 6658dacf83..016ab7357a 100644 --- a/javalib/src/main/scala/java/util/regex/Pattern.scala +++ b/javalib/src/main/scala/java/util/regex/Pattern.scala @@ -60,8 +60,8 @@ final class Pattern private[regex] ( private[this] val jsRegExpForMatches: js.RegExp = new js.RegExp(wrapJSPatternForMatches(jsPattern), jsFlags) - private[regex] lazy val groupStartMapper: GroupStartMapper = - GroupStartMapper(jsPattern, jsFlags) + private lazy val indicesBuilder: IndicesBuilder = + IndicesBuilder(jsPattern, jsFlags) private[regex] def execMatches(input: String): js.RegExp.ExecResult = jsRegExpForMatches.exec(input) @@ -132,6 +132,13 @@ final class Pattern private[regex] ( })) } + private[regex] def getIndices(lastMatch: js.RegExp.ExecResult, forMatches: Boolean): IndicesArray = { + val lastMatchDyn = lastMatch.asInstanceOf[js.Dynamic] + if (js.isUndefined(lastMatchDyn.indices)) + lastMatchDyn.indices = indicesBuilder(forMatches, lastMatch.input, lastMatch.index) + lastMatchDyn.indices.asInstanceOf[IndicesArray] + } + // Public API --------------------------------------------------------------- def pattern(): String = _pattern @@ -198,6 +205,8 @@ final class Pattern private[regex] ( } object Pattern { + private[regex] type IndicesArray = js.Array[js.UndefOr[js.Tuple2[Int, Int]]] + final val UNIX_LINES = 0x01 final val CASE_INSENSITIVE = 0x02 final val COMMENTS = 0x04 diff --git a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala index c43e72d0f3..b0133f48df 100644 --- a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala +++ b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala @@ -70,9 +70,9 @@ class LibrarySizeTest { ) testLinkedSizes( - expectedFastLinkSize = 186693, - expectedFullLinkSizeWithoutClosure = 173849, - expectedFullLinkSizeWithClosure = 31572, + expectedFastLinkSize = 186584, + expectedFullLinkSizeWithoutClosure = 173740, + expectedFullLinkSizeWithClosure = 31549, classDefs, moduleInitializers = MainTestModuleInitializers ) From 5c76e1930c5f925a438bc4b4550becd703f21431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Mon, 19 Apr 2021 15:32:45 +0200 Subject: [PATCH 5/5] Regex: Use the native 'd' flag to compute indices when available. That feature is enabled by a flag, and not by default, because of performance concerns. JS engine implementers have shown that enabling the feature has a significant performance impact. Therefore, we only enable it the first time group indices are required for any given `Pattern` instance. This means that the first match for which group indices are required will be run twice, but this is probably the better trade-off. --- .../main/scala/java/util/regex/Pattern.scala | 23 +++++++++++++++---- .../java/util/regex/PatternCompiler.scala | 9 ++++++++ .../org/scalajs/linker/LibrarySizeTest.scala | 6 ++--- project/NodeJSEnvForcePolyfills.scala | 8 ++++++- 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/javalib/src/main/scala/java/util/regex/Pattern.scala b/javalib/src/main/scala/java/util/regex/Pattern.scala index 016ab7357a..75089d46d5 100644 --- a/javalib/src/main/scala/java/util/regex/Pattern.scala +++ b/javalib/src/main/scala/java/util/regex/Pattern.scala @@ -34,6 +34,9 @@ final class Pattern private[regex] ( @inline private def jsFlagsForFind: String = jsFlags + (if (sticky && supportsSticky) "gy" else "g") + /** Whether we already added the 'd' flag to the native RegExp's. */ + private var enabledNativeIndices: Boolean = false + /** The RegExp that is used for `Matcher.find()`. * * It receives the 'g' flag so that `lastIndex` is taken into acount. @@ -45,7 +48,7 @@ final class Pattern private[regex] ( * Since that RegExp is only used locally within `execFind()`, we can * always reuse the same instance. */ - private[this] val jsRegExpForFind = + private[this] var jsRegExpForFind = new js.RegExp(jsPattern, jsFlagsForFind) /** Another version of the RegExp that is used by `Matcher.matches()`. @@ -57,7 +60,7 @@ final class Pattern private[regex] ( * Since that RegExp is only used locally within `execMatches()`, we can * always reuse the same instance. */ - private[this] val jsRegExpForMatches: js.RegExp = + private[this] var jsRegExpForMatches: js.RegExp = new js.RegExp(wrapJSPatternForMatches(jsPattern), jsFlags) private lazy val indicesBuilder: IndicesBuilder = @@ -134,8 +137,20 @@ final class Pattern private[regex] ( private[regex] def getIndices(lastMatch: js.RegExp.ExecResult, forMatches: Boolean): IndicesArray = { val lastMatchDyn = lastMatch.asInstanceOf[js.Dynamic] - if (js.isUndefined(lastMatchDyn.indices)) - lastMatchDyn.indices = indicesBuilder(forMatches, lastMatch.input, lastMatch.index) + if (js.isUndefined(lastMatchDyn.indices)) { + if (supportsIndices) { + if (!enabledNativeIndices) { + jsRegExpForFind = new js.RegExp(jsPattern, jsFlagsForFind + "d") + jsRegExpForMatches = new js.RegExp(wrapJSPatternForMatches(jsPattern), jsFlags + "d") + enabledNativeIndices = true + } + val regexp = if (forMatches) jsRegExpForMatches else jsRegExpForFind + regexp.lastIndex = lastMatch.index + lastMatchDyn.indices = regexp.exec(lastMatch.input).asInstanceOf[js.Dynamic].indices + } else { + lastMatchDyn.indices = indicesBuilder(forMatches, lastMatch.input, lastMatch.index) + } + } lastMatchDyn.indices.asInstanceOf[IndicesArray] } diff --git a/javalib/src/main/scala/java/util/regex/PatternCompiler.scala b/javalib/src/main/scala/java/util/regex/PatternCompiler.scala index c9c2717b6c..d5ea0b3c23 100644 --- a/javalib/src/main/scala/java/util/regex/PatternCompiler.scala +++ b/javalib/src/main/scala/java/util/regex/PatternCompiler.scala @@ -90,6 +90,10 @@ private[regex] object PatternCompiler { private val _supportsDotAll = (esVersion >= ESVersion.ES2018) || featureTest("us") + /** Cache for `Support.supportsIndices`. */ + private val _supportsIndices = + featureTest("d") + /** Feature-test methods. * * They are located in a separate object so that the methods can be fully @@ -112,6 +116,11 @@ private[regex] object PatternCompiler { def supportsDotAll: Boolean = (esVersion >= ESVersion.ES2018) || _supportsDotAll + /** Tests whether the underlying JS RegExp supports the 'd' flag. */ + @inline + def supportsIndices: Boolean = + _supportsIndices + /** Tests whether features requiring support for the 'u' flag are enabled. * * They are enabled if and only if the project is configured to rely on diff --git a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala index b0133f48df..35378c2401 100644 --- a/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala +++ b/linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala @@ -70,9 +70,9 @@ class LibrarySizeTest { ) testLinkedSizes( - expectedFastLinkSize = 186584, - expectedFullLinkSizeWithoutClosure = 173740, - expectedFullLinkSizeWithClosure = 31549, + expectedFastLinkSize = 186941, + expectedFullLinkSizeWithoutClosure = 174097, + expectedFullLinkSizeWithClosure = 31577, classDefs, moduleInitializers = MainTestModuleInitializers ) diff --git a/project/NodeJSEnvForcePolyfills.scala b/project/NodeJSEnvForcePolyfills.scala index 36fc5b3252..6859a3aa7b 100644 --- a/project/NodeJSEnvForcePolyfills.scala +++ b/project/NodeJSEnvForcePolyfills.scala @@ -85,7 +85,7 @@ final class NodeJSEnvForcePolyfills(esVersion: ESVersion, config: NodeJSEnv.Conf """.stripMargin } - if (esVersion < ES2018) { + if (true) { // esVersion < ES2022 ('d' flag) script += s""" |global.RegExp = (function(OrigRegExp) { | return function RegExp(pattern, flags) { @@ -96,10 +96,15 @@ final class NodeJSEnvForcePolyfills(esVersion: ESVersion, config: NodeJSEnv.Conf | if (flags.indexOf('y') >= 0) | throw new SyntaxError("unsupported flag 'y'"); |""".stripMargin)} + |${cond(ES2018, """ | if (flags.indexOf('s') >= 0) | throw new SyntaxError("unsupported flag 's'"); + |""".stripMargin)} + | if (flags.indexOf('d') >= 0) + | throw new SyntaxError("unsupported flag 'd'"); | } | + |${cond(ES2018, """ | if (typeof pattern === 'string') { | if (pattern.indexOf('(?<=') >= 0 || pattern.indexOf('(?= 0) | throw new SyntaxError("unsupported look-behinds"); @@ -108,6 +113,7 @@ final class NodeJSEnvForcePolyfills(esVersion: ESVersion, config: NodeJSEnv.Conf | if (pattern.indexOf('\\\\p{') >= 0 || pattern.indexOf('\\\\P{') >= 0) | throw new SyntaxError("unsupported Unicode character classes"); | } + |""".stripMargin)} | | return new OrigRegExp(pattern, flags); | }