Skip to content

Commit f2604f5

Browse files
committed
Refactor compileTopLevelOrInsideGroup for readability.
- Extract the bodies of ^, $ and . into separate functions. - Separate non-repeatable token handling from repeatable token handling.
1 parent dffd951 commit f2604f5

File tree

1 file changed

+81
-83
lines changed

1 file changed

+81
-83
lines changed

javalib/src/main/scala/java/util/regex/PatternCompiler.scala

Lines changed: 81 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -951,114 +951,57 @@ private final class PatternCompiler(private val pattern: String, private var fla
951951
var result = ""
952952

953953
while (pIndex != len) {
954-
// Record the current compiledGroupCount, for possessive quantifiers
955-
val compiledGroupCountBeforeThisToken = compiledGroupCount
956-
957-
// Set to false when parsing a token that cannot be repeated
958-
var repeaterAllowed = true
959-
960954
val dispatchCP = pattern.codePointAt(pIndex)
961-
val compiledToken: String = (dispatchCP: @switch) match {
955+
(dispatchCP: @switch) match {
956+
// Cases that mess with the control flow and/or that cannot be repeated
957+
962958
case ')' =>
963959
if (!insideGroup)
964960
parseError("Unmatched closing ')'")
965961
pIndex += 1
966962
return result
967963

968-
case '\\' =>
969-
compileEscape()
970-
971-
case '[' =>
972-
compileCharacterClass()
973-
974-
case '?' | '*' | '+' | '{' =>
975-
parseError("Dangling meta character '" + codePointToString(dispatchCP) + "'")
976-
977-
case '(' =>
978-
compileGroup()
979-
980964
case '|' =>
981965
if (sticky && !insideGroup)
982966
parseError("\\G is not supported when there is an alternative at the top level")
983967
pIndex += 1
984-
repeaterAllowed = false
985-
"|"
986-
987-
case '^' =>
988-
pIndex += 1
989-
if (multiline) {
990-
/* `multiline` implies ES2018, so we can use look-behind assertions.
991-
* We cannot use the 'm' flag of JavaScript RegExps because its
992-
* semantics differ from the Java ones (either with or without
993-
* `UNIX_LINES`).
994-
*/
995-
if (unixLines)
996-
"(?<=^|\n)"
997-
else
998-
"(?<=^|\r(?!\n)|[\n\u0085\u2028\u2029])"
999-
} else {
1000-
/* Wrap as (?:^) in case it ends up being repeated, for example
1001-
* `^+` becomes `(?:^)+`. This is necessary because `^+` is not
1002-
* syntactically valid in JS, although it is valid once wrapped in
1003-
* a group.
1004-
* (Not that repeating ^ has any useful purpose, but the spec does
1005-
* not prevent it.)
1006-
*/
1007-
"(?:^)"
1008-
}
1009-
1010-
case '$' =>
1011-
pIndex += 1
1012-
if (multiline) {
1013-
/* `multiline` implies ES2018, so we can use look-behind assertions.
1014-
* We cannot use the 'm' flag of JavaScript RegExps (see ^ above).
1015-
*/
1016-
if (unixLines)
1017-
"(?=$|\n)"
1018-
else
1019-
"(?=$|(?<!\r)\n|[\r\u0085\u2028\u2029])"
1020-
} else {
1021-
// Wrap as (?:$) for the same reason as ^ above
1022-
"(?:$)"
1023-
}
1024-
1025-
case '.' =>
1026-
/* Since JavaScript's `.`'s interpretation of new lines is not the
1027-
* same as Java's (with or without UNIX_LINES), we compile `.` to
1028-
* custom character classes.
1029-
*/
1030-
pIndex += 1
1031-
val rejected = {
1032-
if (dotAll) ""
1033-
else if (unixLines) "\n"
1034-
else "\n\r\u0085\u2028\u2029"
1035-
}
1036-
codePointNotAmong(rejected)
968+
result += "|"
1037969

1038970
// experimentally, this is the set of chars considered as whitespace for comments
1039971
case ' ' | '\t' | '\n' | '\u000B' | '\f' | '\r' if comments =>
1040972
pIndex += 1
1041-
repeaterAllowed = false
1042-
""
1043973

1044974
case '#' if comments =>
1045975
// ignore until the end of a line
1046976
@inline def isEOL(c: Char): Boolean =
1047977
c == '\r' || c == '\n' || c == '\u0085' || c == '\u2028' || c == '\u2029'
1048978
while (pIndex != len && !isEOL(pattern.charAt(pIndex)))
1049979
pIndex += 1
1050-
repeaterAllowed = false
1051-
""
980+
981+
case '?' | '*' | '+' | '{' =>
982+
parseError("Dangling meta character '" + codePointToString(dispatchCP) + "'")
983+
984+
// Regular cases, which can be repeated
1052985

1053986
case _ =>
1054-
pIndex += charCount(dispatchCP)
1055-
literal(dispatchCP)
1056-
}
987+
// Record the current compiledGroupCount, for possessive quantifiers
988+
val compiledGroupCountBeforeThisToken = compiledGroupCount
1057989

1058-
if (repeaterAllowed)
1059-
result += compileRepeater(compiledGroupCountBeforeThisToken, compiledToken)
1060-
else
1061-
result += compiledToken
990+
val compiledToken = (dispatchCP: @switch) match {
991+
case '\\' => compileEscape()
992+
case '[' => compileCharacterClass()
993+
case '(' => compileGroup()
994+
case '^' => compileCaret()
995+
case '$' => compileDollar()
996+
case '.' => compileDot()
997+
998+
case _ =>
999+
pIndex += charCount(dispatchCP)
1000+
literal(dispatchCP)
1001+
}
1002+
1003+
result += compileRepeater(compiledGroupCountBeforeThisToken, compiledToken)
1004+
}
10621005
}
10631006

10641007
if (insideGroup)
@@ -1180,6 +1123,61 @@ private final class PatternCompiler(private val pattern: String, private var fla
11801123
s"(?:(?=($amendedToken$baseRepeater))\\$myGroupNumber)"
11811124
}
11821125

1126+
@inline
1127+
private def compileCaret(): String = {
1128+
pIndex += 1
1129+
if (multiline) {
1130+
/* `multiline` implies ES2018, so we can use look-behind assertions.
1131+
* We cannot use the 'm' flag of JavaScript RegExps because its semantics
1132+
* differ from the Java ones (either with or without `UNIX_LINES`).
1133+
*/
1134+
if (unixLines)
1135+
"(?<=^|\n)"
1136+
else
1137+
"(?<=^|\r(?!\n)|[\n\u0085\u2028\u2029])"
1138+
} else {
1139+
/* Wrap as (?:^) in case it ends up being repeated, for example `^+`
1140+
* becomes `(?:^)+`. This is necessary because `^+` is not syntactically
1141+
* valid in JS, although it is valid once wrapped in a group.
1142+
* (Not that repeating ^ has any useful purpose, but the spec does not
1143+
* prevent it.)
1144+
*/
1145+
"(?:^)"
1146+
}
1147+
}
1148+
1149+
@inline
1150+
private def compileDollar(): String = {
1151+
pIndex += 1
1152+
if (multiline) {
1153+
/* `multiline` implies ES2018, so we can use look-behind assertions.
1154+
* We cannot use the 'm' flag of JavaScript RegExps (see ^ above).
1155+
*/
1156+
if (unixLines)
1157+
"(?=$|\n)"
1158+
else
1159+
"(?=$|(?<!\r)\n|[\r\u0085\u2028\u2029])"
1160+
} else {
1161+
// Wrap as (?:$) for the same reason as ^ above
1162+
"(?:$)"
1163+
}
1164+
}
1165+
1166+
@inline
1167+
private def compileDot(): String = {
1168+
/* Since JavaScript's `.`'s interpretation of new lines is not the same as
1169+
* Java's (with or without UNIX_LINES), we compile `.` to custom character
1170+
* classes.
1171+
*/
1172+
pIndex += 1
1173+
val rejected = {
1174+
if (dotAll) ""
1175+
else if (unixLines) "\n"
1176+
else "\n\r\u0085\u2028\u2029"
1177+
}
1178+
codePointNotAmong(rejected)
1179+
}
1180+
11831181
private def compileEscape(): String = {
11841182
val pattern = this.pattern // local copy
11851183
val len = pattern.length()

0 commit comments

Comments
 (0)