Skip to content

Commit 55909f6

Browse files
committed
WiP: Correct java.util.regex support
1 parent d09ce38 commit 55909f6

File tree

23 files changed

+2899
-190
lines changed

23 files changed

+2899
-190
lines changed

javalib/src/main/scala/java/util/regex/GroupStartMapper.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,10 +384,16 @@ private[regex] object GroupStartMapper {
384384
case '(' =>
385385
val indicator = pattern.substring(pIndex + 1, pIndex + 3)
386386
if (indicator == "?=" || indicator == "?!") {
387-
// Non-capturing test group
387+
// Look-ahead group
388388
pIndex += 3
389389
val inner = parseInsideParensAndClosingParen()
390390
new ZeroLengthTestNode(indicator, inner)
391+
} else if (indicator == "?<") {
392+
// Look-behind group: the 3-character indicator is expected to be ?<= or ?<! (it is not validated here)
393+
val fullIndicator = pattern.substring(pIndex + 1, pIndex + 4)
394+
pIndex += 4
395+
val inner = parseInsideParensAndClosingParen()
396+
new ZeroLengthTestNode(fullIndicator, inner)
391397
} else if (indicator == "?:") {
392398
// Non-capturing group
393399
pIndex += 3

javalib/src/main/scala/java/util/regex/Matcher.scala

Lines changed: 73 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ package java.util.regex
1414

1515
import scala.language.implicitConversions
1616

17-
import scala.annotation.switch
17+
import scala.annotation.{switch, tailrec}
1818

1919
import scala.scalajs.js
2020

@@ -43,13 +43,8 @@ final class Matcher private[regex] (
4343

4444
def matches(): Boolean = {
4545
resetMatch()
46-
find()
47-
// TODO this check is wrong with non-greedy patterns
48-
// Further, it might be wrong to just use ^$ delimiters for two reasons:
49-
// - They might already be there
50-
// - They might not behave as expected when newline characters are present
51-
if ((lastMatch ne null) && (ensureLastMatch.index != 0 || group().length() != inputstr.length()))
52-
resetMatch()
46+
47+
lastMatch = pattern().jsRegExpForMatches.exec(inputstr)
5348
lastMatch ne null
5449
}
5550

@@ -61,18 +56,59 @@ final class Matcher private[regex] (
6156
lastMatch ne null
6257
}
6358

64-
def find(): Boolean = if (canStillFind) {
65-
lastMatchIsValid = true
66-
lastMatch = regexp.exec(inputstr)
67-
if (lastMatch ne null) {
68-
if (lastMatch(0).get.isEmpty)
69-
regexp.lastIndex += 1
59+
def find(): Boolean = {
60+
if (canStillFind) {
61+
lastMatchIsValid = true
62+
63+
def advanceOnce(): Boolean = {
64+
lastMatch = regexp.exec(inputstr)
65+
if (lastMatch ne null) {
66+
if (lastMatch(0).get.isEmpty)
67+
regexp.lastIndex += 1
68+
true
69+
} else {
70+
canStillFind = false
71+
false
72+
}
73+
}
74+
75+
if (PatternCompiler.Support.supportsUnicode) {
76+
advanceOnce()
77+
} else {
78+
/* When the native RegExp does not support the 'u' flag (introduced in
79+
* ECMAScript 2015), it can find a match starting in the middle of a
80+
* surrogate pair. This can happen if the pattern can match a substring
81+
* starting with a lone low surrogate. However, that is not valid,
82+
* because surrogate pairs must always stick together.
83+
*
84+
* In all the other situations, the `PatternCompiler` makes sure that
85+
* surrogate pairs are always matched together or not at all, but it
86+
* cannot avoid this specific situation because there is no look-behind
87+
* support in that case either. So we take care of it now by skipping
88+
* matches that start in the middle of a surrogate pair.
89+
*/
90+
@tailrec
91+
def loop(): Unit = {
92+
val start = regexp.lastIndex
93+
if (advanceOnce()) {
94+
val index = lastMatch.index
95+
if (index > start && index < inputstr.length() &&
96+
Character.isLowSurrogate(inputstr.charAt(index)) &&
97+
Character.isHighSurrogate(inputstr.charAt(index - 1))) {
98+
regexp.lastIndex = index + 1
99+
loop()
100+
}
101+
}
102+
}
103+
loop()
104+
}
105+
106+
startOfGroupCache = null
107+
lastMatch ne null
70108
} else {
71-
canStillFind = false
109+
false
72110
}
73-
startOfGroupCache = null
74-
lastMatch ne null
75-
} else false
111+
}
76112

77113
def find(start: Int): Boolean = {
78114
reset()
@@ -186,7 +222,7 @@ final class Matcher private[regex] (
186222
lastMatch
187223
}
188224

189-
def groupCount(): Int = Matcher.getGroupCount(lastMatch, pattern())
225+
def groupCount(): Int = pattern().groupCount
190226

191227
def start(): Int = ensureLastMatch.index + regionStart()
192228
def end(): Int = start() + group().length
@@ -197,18 +233,22 @@ final class Matcher private[regex] (
197233
else startOfGroup(group)
198234
}
199235

236+
def start(name: String): Int =
237+
start(pattern().namedGroup(name))
238+
200239
def end(group: Int): Int = {
201240
val s = start(group)
202241
if (s == -1) -1
203242
else s + this.group(group).length
204243
}
205244

245+
def end(name: String): Int =
246+
end(pattern().namedGroup(name))
247+
206248
def group(group: Int): String = ensureLastMatch(group).orNull
207249

208-
def group(name: String): String = {
209-
ensureLastMatch
210-
throw new IllegalArgumentException
211-
}
250+
def group(name: String): String =
251+
group(pattern().namedGroup(name))
212252

213253
// Seal the state
214254

@@ -267,21 +307,12 @@ object Matcher {
267307
result
268308
}
269309

270-
private def getGroupCount(lastMatch: js.RegExp.ExecResult,
271-
pattern: Pattern): Int = {
272-
/* `pattern.groupCount` has the answer, but it can require some
273-
* computation to get it, so try and use lastMatch's group count if we can.
274-
*/
275-
if (lastMatch != null) lastMatch.length - 1
276-
else pattern.groupCount
277-
}
278-
279310
private final class SealedResult(inputstr: String,
280311
lastMatch: js.RegExp.ExecResult, pattern: Pattern,
281312
regionStart: Int, private var startOfGroupCache: js.Array[Int])
282313
extends MatchResult {
283314

284-
def groupCount(): Int = getGroupCount(lastMatch, pattern)
315+
def groupCount(): Int = pattern.groupCount
285316

286317
def start(): Int = ensureLastMatch.index + regionStart
287318
def end(): Int = start() + group().length
@@ -298,14 +329,23 @@ object Matcher {
298329
else startOfGroup(group)
299330
}
300331

332+
def start(name: String): Int =
333+
start(pattern.namedGroup(name))
334+
301335
def end(group: Int): Int = {
302336
val s = start(group)
303337
if (s == -1) -1
304338
else s + this.group(group).length
305339
}
306340

341+
def end(name: String): Int =
342+
end(pattern.namedGroup(name))
343+
307344
def group(group: Int): String = ensureLastMatch(group).orNull
308345

346+
def group(name: String): String =
347+
group(pattern.namedGroup(name))
348+
309349
private def ensureLastMatch: js.RegExp.ExecResult = {
310350
if (lastMatch == null)
311351
throw new IllegalStateException("No match available")

javalib/src/main/scala/java/util/regex/Pattern.scala

Lines changed: 52 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -12,52 +12,69 @@
1212

1313
package java.util.regex
1414

15-
import scala.annotation.switch
16-
1715
import scala.scalajs.js
1816

19-
import java.util.ScalaOps._
20-
21-
final class Pattern private (jsRegExp: js.RegExp, _pattern: String, _flags: Int)
22-
extends Serializable {
17+
final class Pattern private[regex] (
18+
_pattern: String,
19+
_flags: Int,
20+
jsPattern: String,
21+
jsFlags: String,
22+
private[regex] val groupCount: Int,
23+
namedGroups: js.Dictionary[Int]
24+
) extends Serializable {
2325

2426
import Pattern._
2527

28+
/** Compile the native RegExp once.
29+
*
30+
* In `newJSRegExp()`, we clone that native RegExp using
31+
* `new js.RegExp(jsRegExpBlueprint)`, which the JS engine hopefully
32+
* optimizes by reusing the compiled internal representation of the RegExp.
33+
* Otherwise, well, there's not much we can do about it.
34+
*/
35+
private[this] val jsRegExpBlueprint = new js.RegExp(jsPattern, jsFlags + "g")
36+
37+
/** Another version of the RegExp that is used by `Matcher.matches()`.
38+
*
39+
* It forces `^` and `$` at the beginning and end of the pattern so that
40+
* only entire inputs are matched. In addition, it does not have the 'g'
41+
* flag, so that it can be repeatedly used without managing `lastIndex`.
42+
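*
* NOTE(review): `jsPattern` is concatenated between `^` and `$` without a
* wrapping group. If it can contain a top-level alternation such as `a|b`,
* the result `^a|b$` anchors each alternative on one side only, and partial
* inputs would match. Confirm that `jsPattern` never has a top-level
* alternation, or wrap it as `^(?:...)$`.
*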
43+
* Since that RegExp is only used locally within `matches()`, and not stored
44+
* in the `Matcher`, we can always reuse the same instance.
45+
*/
46+
private[regex] lazy val jsRegExpForMatches: js.RegExp =
47+
new js.RegExp("^" + jsPattern + "$", jsFlags)
48+
2649
def pattern(): String = _pattern
2750
def flags(): Int = _flags
2851

29-
private def jsPattern: String = jsRegExp.source
30-
31-
private def jsFlags: String = {
32-
(if (jsRegExp.global) "g" else "") +
33-
(if (jsRegExp.ignoreCase) "i" else "") +
34-
(if (jsRegExp.multiline) "m" else "")
35-
}
36-
37-
private[regex] lazy val groupCount: Int =
38-
new js.RegExp("|" + jsPattern).exec("").length - 1
39-
4052
private[regex] lazy val groupStartMapper: GroupStartMapper =
41-
GroupStartMapper(jsPattern, jsFlags)
53+
GroupStartMapper(jsPattern, jsFlags + "g")
4254

4355
override def toString(): String = pattern()
4456

4557
private[regex] def newJSRegExp(): js.RegExp = {
46-
val r = new js.RegExp(jsRegExp)
47-
if (r ne jsRegExp) {
58+
val r = new js.RegExp(jsRegExpBlueprint)
59+
if (r ne jsRegExpBlueprint) {
4860
r
4961
} else {
5062
/* Workaround for the PhantomJS 1.x bug
5163
* https://github.com/ariya/phantomjs/issues/11494
52-
* which causes new js.RegExp(jsRegExp) to return the same object,
53-
* rather than a new one.
54-
* We therefore reconstruct the pattern and flags used to create
55-
* jsRegExp and create a new one from there.
64+
* which causes new js.RegExp(jsRegExpBlueprint) to return the same
65+
* object, rather than a new one.
66+
* In that case, we reconstruct a new js.RegExp from scratch.
5667
*/
57-
new js.RegExp(jsPattern, jsFlags)
68+
new js.RegExp(jsPattern, jsFlags + "g")
5869
}
5970
}
6071

72+
private[regex] def namedGroup(name: String): Int = {
73+
namedGroups.getOrElse(name, {
74+
throw new IllegalArgumentException(s"No group with name <$name>")
75+
})
76+
}
77+
6178
def matcher(input: CharSequence): Matcher =
6279
new Matcher(this, input, 0, input.length)
6380

@@ -123,27 +140,8 @@ object Pattern {
123140
final val CANON_EQ = 0x80
124141
final val UNICODE_CHARACTER_CLASS = 0x100
125142

126-
def compile(regex: String, flags: Int): Pattern = {
127-
val (jsPattern, flags1) = {
128-
if ((flags & LITERAL) != 0) {
129-
(quote(regex), flags)
130-
} else {
131-
trySplitHack(regex, flags) orElse
132-
tryFlagHack(regex, flags) getOrElse
133-
(regex, flags)
134-
}
135-
}
136-
137-
val jsFlags = {
138-
"g" +
139-
(if ((flags1 & CASE_INSENSITIVE) != 0) "i" else "") +
140-
(if ((flags1 & MULTILINE) != 0) "m" else "")
141-
}
142-
143-
val jsRegExp = new js.RegExp(jsPattern, jsFlags)
144-
145-
new Pattern(jsRegExp, regex, flags1)
146-
}
143+
def compile(regex: String, flags: Int): Pattern =
144+
PatternCompiler.compile(regex, flags)
147145

148146
def compile(regex: String): Pattern =
149147
compile(regex, 0)
@@ -152,66 +150,14 @@ object Pattern {
152150
compile(regex).matcher(input).matches()
153151

154152
def quote(s: String): String = {
155-
var result = ""
156-
var i = 0
157-
while (i < s.length) {
158-
val c = s.charAt(i)
159-
result += ((c: @switch) match {
160-
case '\\' | '.' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
161-
| '?' | '*' | '+' | '^' | '$' => "\\"+c
162-
case _ => c
163-
})
164-
i += 1
153+
var result = "\\Q"
154+
var start = 0
155+
var end = s.indexOf("\\E", start)
156+
while (end >= 0) {
157+
result += s.substring(start, end) + "\\E\\\\E\\Q"
158+
start = end + 2
159+
end = s.indexOf("\\E", start)
165160
}
166-
result
161+
result + s.substring(start) + "\\E"
167162
}
168-
169-
/** This is a hack to support StringLike.split().
170-
* It replaces occurrences of \Q<char>\E by quoted(<char>)
171-
*/
172-
@inline
173-
private def trySplitHack(pat: String, flags: Int) = {
174-
val m = splitHackPat.exec(pat)
175-
if (m != null)
176-
Some((quote(m(1).get), flags))
177-
else
178-
None
179-
}
180-
181-
@inline
182-
private def tryFlagHack(pat: String, flags0: Int) = {
183-
val m = flagHackPat.exec(pat)
184-
if (m != null) {
185-
val newPat = pat.substring(m(0).get.length) // cut off the flag specifiers
186-
var flags = flags0
187-
for (chars <- m(1)) {
188-
for (i <- 0 until chars.length())
189-
flags |= charToFlag(chars.charAt(i))
190-
}
191-
for (chars <- m(2)) {
192-
for (i <- 0 until chars.length())
193-
flags &= ~charToFlag(chars.charAt(i))
194-
}
195-
Some((newPat, flags))
196-
} else
197-
None
198-
}
199-
200-
private def charToFlag(c: Char) = (c: @switch) match {
201-
case 'i' => CASE_INSENSITIVE
202-
case 'd' => UNIX_LINES
203-
case 'm' => MULTILINE
204-
case 's' => DOTALL
205-
case 'u' => UNICODE_CASE
206-
case 'x' => COMMENTS
207-
case 'U' => UNICODE_CHARACTER_CLASS
208-
case _ => throw new IllegalArgumentException("bad in-pattern flag")
209-
}
210-
211-
/** matches \Q<char>\E to support StringLike.split */
212-
private val splitHackPat = new js.RegExp("^\\\\Q(.|\\n|\\r)\\\\E$")
213-
214-
/** regex to match flag specifiers in regex. E.g. (?u), (?-i), (?U-i) */
215-
private val flagHackPat =
216-
new js.RegExp("^\\(\\?([idmsuxU]*)(?:-([idmsuxU]*))?\\)")
217163
}

0 commit comments

Comments
 (0)