Skip to content

Clean up j.u.regex.* and use the native 'd' flag when available #4470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,18 @@ package java.util.regex

import scala.annotation.{tailrec, switch}

import java.util.HashMap

import scala.scalajs.js

/** The goal of a `GroupStartMapper` is to retrieve the start position of each
* group of a matching regular expression where only the strings of the
* matched groups are known.
import Pattern.IndicesArray

/** The goal of an `IndicesBuilder` is to retrieve the start and end positions
* of each group of a matching regular expression.
*
* This is essentially a polyfill for the 'd' flag of `js.RegExp`, which is
* a Stage 4 proposal scheduled for inclusion in ECMAScript 2022. Without that
* flag, `js.RegExp` only provides the substrings matched by capturing groups,
* but not their positions. We implement the positions on top of that.
*
* For that, we use the following observation:
* If the regex /A(B)\1/ matches a string at a given index,
* then /(A)(B)\2/ matches the same string at the same index.
Expand All @@ -38,7 +43,7 @@ import scala.scalajs.js
* - It computes the start of every group thanks to the groups before it
* - It builds and returns the mapping of previous group number -> start
*
* The `pattern` that is parsed by `GroupStartMapper` is the *compiled* JS
* The `pattern` that is parsed by `IndicesBuilder` is the *compiled* JS
* pattern produced by `PatternCompiler`, not the original Java pattern. This
* means that we can simplify a number of things with the knowledge that:
*
Expand All @@ -53,13 +58,13 @@ import scala.scalajs.js
*
* @author Mikaël Mayer
*/
private[regex] class GroupStartMapper private (pattern: String, flags: String,
node: GroupStartMapper.Node, groupCount: Int,
private[regex] class IndicesBuilder private (pattern: String, flags: String,
node: IndicesBuilder.Node, groupCount: Int,
jsRegExpForFind: js.RegExp, jsRegExpForMatches: js.RegExp) {

import GroupStartMapper._
import IndicesBuilder._

def apply(forMatches: Boolean, string: String, index: Int): js.Array[Int] = {
def apply(forMatches: Boolean, string: String, index: Int): IndicesArray = {
val regExp =
if (forMatches) jsRegExpForMatches
else jsRegExpForFind
Expand All @@ -73,31 +78,42 @@ private[regex] class GroupStartMapper private (pattern: String, flags: String,
s"Original pattern '$pattern' with flags '$flags' did match however.")
}

// Prepare a `groupStartMap` array with the correct length filled with -1
val len = groupCount + 1 // index 0 is not used
val groupStartMap = new js.Array[Int](len)
var i = 0
val start = index // by definition
val end = start + allMatchResult(0).get.length()

/* Initialize the `indices` array with:
* - `[start, end]` at index 0, which represents the whole match, and
* - `undefined` in the other slots.
*
* We explicitly store `undefined` in the other slots to prevent the array
* from containing *empty* slots. That would make it a sparse array, which
* is less efficient.
*/
val len = groupCount + 1
val indices = new IndicesArray(len)
indices(0) = js.Tuple2(start, end)
var i = 1
while (i != len) {
groupStartMap(i) = -1
indices(i) = js.undefined
i += 1
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't a js.Array contain js.undefined by default?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really. It creates an array of empty slots, not of slots with actual undefined values.. That causes the array to become a sparse array, which is less efficient.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added a comment about that.


node.propagateFromStart(allMatchResult, groupStartMap, index)
node.propagate(allMatchResult, indices, start, end)

groupStartMap
indices
}
}

private[regex] object GroupStartMapper {
def apply(pattern: String, flags: String): GroupStartMapper = {
private[regex] object IndicesBuilder {
def apply(pattern: String, flags: String): IndicesBuilder = {
val parser = new Parser(pattern)
val node = parser.parseTopLevel()
node.setNewGroup(1)
val allMatchingPattern = node.buildRegex(parser.groupNodeMap)
val jsRegExpForFind = new js.RegExp(allMatchingPattern, flags + "g")
val jsRegExpForMatches =
new js.RegExp(Pattern.wrapJSPatternForMatches(allMatchingPattern), flags)
new GroupStartMapper(pattern, flags, node, parser.parsedGroupCount,
new IndicesBuilder(pattern, flags, node, parser.parsedGroupCount,
jsRegExpForFind, jsRegExpForMatches)
}

Expand Down Expand Up @@ -155,16 +171,16 @@ private[regex] object GroupStartMapper {
* `end`, while other nodes propagate the `start`.
*/
def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit
indices: IndicesArray, start: Int, end: Int): Unit

/** Propagates the appropriate positions to the descendants of this node
* from its end position.
*/
final def propagateFromEnd(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], end: Int): Unit = {
indices: IndicesArray, end: Int): Unit = {

val start = matchResult(newGroup).fold(-1)(matched => end - matched.length)
propagate(matchResult, groupStartMap, start, end)
propagate(matchResult, indices, start, end)
}

/** Propagates the appropriate positions to the descendants of this node
Expand All @@ -173,10 +189,10 @@ private[regex] object GroupStartMapper {
* @return the end position of this node, as a convenience for `SequenceNode.propagate`
*/
final def propagateFromStart(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int): Int = {
indices: IndicesArray, start: Int): Int = {

val end = matchResult(newGroup).fold(-1)(matched => start + matched.length)
propagate(matchResult, groupStartMap, start, end)
propagate(matchResult, indices, start, end)
end
}
}
Expand All @@ -190,15 +206,15 @@ private[regex] object GroupStartMapper {
"(" + inner.buildRegex(groupNodeMap) + ")"

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
indices: IndicesArray, start: Int, end: Int): Unit = {
/* #3901: A GroupNode within a negative look-ahead node may receive
* `start != -1` from above, yet not match anything itself. We must
* always keep the default `-1` if this group node does not match
* anything.
*/
if (matchResult(newGroup).isDefined)
groupStartMap(number) = start
inner.propagate(matchResult, groupStartMap, start, end)
indices(number) = js.Tuple2(start, end)
inner.propagate(matchResult, indices, start, end)
}
}

Expand All @@ -217,11 +233,11 @@ private[regex] object GroupStartMapper {
"((" + indicator + inner.buildRegex(groupNodeMap) + "))"

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
indices: IndicesArray, start: Int, end: Int): Unit = {
if (isLookBehind)
inner.propagateFromEnd(matchResult, groupStartMap, end)
inner.propagateFromEnd(matchResult, indices, end)
else
inner.propagateFromStart(matchResult, groupStartMap, start)
inner.propagateFromStart(matchResult, indices, start)
}
}

Expand All @@ -236,8 +252,8 @@ private[regex] object GroupStartMapper {
"(" + inner.buildRegex(groupNodeMap) + repeater + ")"

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
inner.propagateFromEnd(matchResult, groupStartMap, end)
indices: IndicesArray, start: Int, end: Int): Unit = {
inner.propagateFromEnd(matchResult, indices, end)
}
}

Expand All @@ -247,7 +263,7 @@ private[regex] object GroupStartMapper {
"(" + regex + ")"

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
indices: IndicesArray, start: Int, end: Int): Unit = {
// nothing to do
}
}
Expand All @@ -262,7 +278,7 @@ private[regex] object GroupStartMapper {
}

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
indices: IndicesArray, start: Int, end: Int): Unit = {
// nothing to do
}
}
Expand Down Expand Up @@ -292,13 +308,13 @@ private[regex] object GroupStartMapper {
}

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
indices: IndicesArray, start: Int, end: Int): Unit = {
val len = sequence.length
var i = 0
var nextStart = start
while (i != len) {
nextStart =
sequence(i).propagateFromStart(matchResult, groupStartMap, nextStart)
sequence(i).propagateFromStart(matchResult, indices, nextStart)
i += 1
}
}
Expand Down Expand Up @@ -333,11 +349,11 @@ private[regex] object GroupStartMapper {
}

def propagate(matchResult: js.RegExp.ExecResult,
groupStartMap: js.Array[Int], start: Int, end: Int): Unit = {
indices: IndicesArray, start: Int, end: Int): Unit = {
val len = alternatives.length
var i = 0
while (i != len) {
alternatives(i).propagate(matchResult, groupStartMap, start, end)
alternatives(i).propagate(matchResult, indices, start, end)
i += 1
}
}
Expand Down
Loading