@@ -951,114 +951,57 @@ private final class PatternCompiler(private val pattern: String, private var fla
951
951
var result = " "
952
952
953
953
while (pIndex != len) {
954
- // Record the current compiledGroupCount, for possessive quantifiers
955
- val compiledGroupCountBeforeThisToken = compiledGroupCount
956
-
957
- // Set to false when parsing a token that cannot be repeated
958
- var repeaterAllowed = true
959
-
960
954
val dispatchCP = pattern.codePointAt(pIndex)
961
- val compiledToken : String = (dispatchCP : @ switch) match {
955
+ (dispatchCP : @ switch) match {
956
+ // Cases that mess with the control flow and/or that cannot be repeated
957
+
962
958
case ')' =>
963
959
if (! insideGroup)
964
960
parseError(" Unmatched closing ')'" )
965
961
pIndex += 1
966
962
return result
967
963
968
- case '\\ ' =>
969
- compileEscape()
970
-
971
- case '[' =>
972
- compileCharacterClass()
973
-
974
- case '?' | '*' | '+' | '{' =>
975
- parseError(" Dangling meta character '" + codePointToString(dispatchCP) + " '" )
976
-
977
- case '(' =>
978
- compileGroup()
979
-
980
964
case '|' =>
981
965
if (sticky && ! insideGroup)
982
966
parseError(" \\ G is not supported when there is an alternative at the top level" )
983
967
pIndex += 1
984
- repeaterAllowed = false
985
- " |"
986
-
987
- case '^' =>
988
- pIndex += 1
989
- if (multiline) {
990
- /* `multiline` implies ES2018, so we can use look-behind assertions.
991
- * We cannot use the 'm' flag of JavaScript RegExps because its
992
- * semantics differ from the Java ones (either with or without
993
- * `UNIX_LINES`).
994
- */
995
- if (unixLines)
996
- " (?<=^|\n )"
997
- else
998
- " (?<=^|\r (?!\n )|[\n\u0085\u2028\u2029 ])"
999
- } else {
1000
- /* Wrap as (?:^) in case it ends up being repeated, for example
1001
- * `^+` becomes `(?:^)+`. This is necessary because `^+` is not
1002
- * syntactically valid in JS, although it is valid once wrapped in
1003
- * a group.
1004
- * (Not that repeating ^ has any useful purpose, but the spec does
1005
- * not prevent it.)
1006
- */
1007
- " (?:^)"
1008
- }
1009
-
1010
- case '$' =>
1011
- pIndex += 1
1012
- if (multiline) {
1013
- /* `multiline` implies ES2018, so we can use look-behind assertions.
1014
- * We cannot use the 'm' flag of JavaScript RegExps (see ^ above).
1015
- */
1016
- if (unixLines)
1017
- " (?=$|\n )"
1018
- else
1019
- " (?=$|(?<!\r )\n |[\r\u0085\u2028\u2029 ])"
1020
- } else {
1021
- // Wrap as (?:$) for the same reason as ^ above
1022
- " (?:$)"
1023
- }
1024
-
1025
- case '.' =>
1026
- /* Since JavaScript's `.`'s interpretation of new lines is not the
1027
- * same as Java's (with or without UNIX_LINES), we compile `.` to
1028
- * custom character classes.
1029
- */
1030
- pIndex += 1
1031
- val rejected = {
1032
- if (dotAll) " "
1033
- else if (unixLines) " \n "
1034
- else " \n\r\u0085\u2028\u2029 "
1035
- }
1036
- codePointNotAmong(rejected)
968
+ result += " |"
1037
969
1038
970
// experimentally, this is the set of chars considered as whitespace for comments
1039
971
case ' ' | '\t ' | '\n ' | '\u000B ' | '\f ' | '\r ' if comments =>
1040
972
pIndex += 1
1041
- repeaterAllowed = false
1042
- " "
1043
973
1044
974
case '#' if comments =>
1045
975
// ignore until the end of a line
1046
976
@ inline def isEOL (c : Char ): Boolean =
1047
977
c == '\r ' || c == '\n ' || c == '\u0085 ' || c == '\u2028 ' || c == '\u2029 '
1048
978
while (pIndex != len && ! isEOL(pattern.charAt(pIndex)))
1049
979
pIndex += 1
1050
- repeaterAllowed = false
1051
- " "
980
+
981
+ case '?' | '*' | '+' | '{' =>
982
+ parseError(" Dangling meta character '" + codePointToString(dispatchCP) + " '" )
983
+
984
+ // Regular cases, which can be repeated
1052
985
1053
986
case _ =>
1054
- pIndex += charCount(dispatchCP)
1055
- literal(dispatchCP)
1056
- }
987
+ // Record the current compiledGroupCount, for possessive quantifiers
988
+ val compiledGroupCountBeforeThisToken = compiledGroupCount
1057
989
1058
- if (repeaterAllowed)
1059
- result += compileRepeater(compiledGroupCountBeforeThisToken, compiledToken)
1060
- else
1061
- result += compiledToken
990
+ val compiledToken = (dispatchCP : @ switch) match {
991
+ case '\\ ' => compileEscape()
992
+ case '[' => compileCharacterClass()
993
+ case '(' => compileGroup()
994
+ case '^' => compileCaret()
995
+ case '$' => compileDollar()
996
+ case '.' => compileDot()
997
+
998
+ case _ =>
999
+ pIndex += charCount(dispatchCP)
1000
+ literal(dispatchCP)
1001
+ }
1002
+
1003
+ result += compileRepeater(compiledGroupCountBeforeThisToken, compiledToken)
1004
+ }
1062
1005
}
1063
1006
1064
1007
if (insideGroup)
@@ -1180,6 +1123,61 @@ private final class PatternCompiler(private val pattern: String, private var fla
1180
1123
s " (?:(?=( $amendedToken$baseRepeater)) \\ $myGroupNumber) "
1181
1124
}
1182
1125
1126
/** Produces the compiled replacement for a `^` in the pattern.
 *
 *  Invoked from the token dispatcher's `case '^'`; the returned string is
 *  then handed to `compileRepeater` like any other repeatable token.
 *
 *  Side effect: advances `pIndex` by one.
 *
 *  @return a look-behind group when `multiline` is set (its content depends
 *          on `unixLines`), otherwise `^` wrapped in a non-capturing group.
 */
+ @ inline
1127
+ private def compileCaret (): String = {
1128
+ pIndex += 1 // step over the `^` itself
1129
+ if (multiline) {
1130
+ /* `multiline` implies ES2018, so we can use look-behind assertions.
1131
+ * We cannot use the 'm' flag of JavaScript RegExps because its semantics
1132
+ * differ from the Java ones (either with or without `UNIX_LINES`).
1133
+ */
1134
+ if (unixLines)
1135
+ " (?<=^|\n )" // look-behind alternatives: `^` or `\n`
1136
+ else
1137
+ " (?<=^|\r (?!\n )|[\n\u0085\u2028\u2029 ])" // `^`, `\r` not followed by `\n`, or one of \n U+0085 U+2028 U+2029
1138
+ } else {
1139
+ /* Wrap as (?:^) in case it ends up being repeated, for example `^+`
1140
+ * becomes `(?:^)+`. This is necessary because `^+` is not syntactically
1141
+ * valid in JS, although it is valid once wrapped in a group.
1142
+ * (Not that repeating ^ has any useful purpose, but the spec does not
1143
+ * prevent it.)
1144
+ */
1145
+ " (?:^)"
1146
+ }
1147
+ }
1148
+
1149
/** Produces the compiled replacement for a `$` in the pattern.
 *
 *  Invoked from the token dispatcher's `case '$'`; the returned string is
 *  then handed to `compileRepeater` like any other repeatable token.
 *
 *  Side effect: advances `pIndex` by one.
 *
 *  @return a look-ahead group when `multiline` is set (its content depends
 *          on `unixLines`), otherwise `$` wrapped in a non-capturing group.
 */
+ @ inline
1150
+ private def compileDollar (): String = {
1151
+ pIndex += 1 // step over the `$` itself
1152
+ if (multiline) {
1153
+ /* `multiline` implies ES2018, so we can use look-behind assertions.
1154
+ * We cannot use the 'm' flag of JavaScript RegExps (see `compileCaret`).
+ * Only the non-`unixLines` pattern below actually contains a look-behind.
1155
+ */
1156
+ if (unixLines)
1157
+ " (?=$|\n )" // look-ahead alternatives: `$` or `\n`
1158
+ else
1159
+ " (?=$|(?<!\r )\n |[\r\u0085\u2028\u2029 ])" // `$`, `\n` not preceded by `\r`, or one of \r U+0085 U+2028 U+2029
1160
+ } else {
1161
+ // Wrap as (?:$) so a following repeater stays valid, for the same reason `compileCaret` wraps ^
1162
+ " (?:$)"
1163
+ }
1164
+ }
1165
+
1166
/** Produces the compiled replacement for a `.` in the pattern.
 *
 *  Invoked from the token dispatcher's `case '.'`; the returned string is
 *  then handed to `compileRepeater` like any other repeatable token.
 *
 *  Side effect: advances `pIndex` by one.
 *
 *  @return the result of `codePointNotAmong` applied to the character list
 *          selected by the `dotAll` and `unixLines` flags.
 */
+ @ inline
1167
+ private def compileDot (): String = {
1168
+ /* Since JavaScript's `.`'s interpretation of new lines is not the same as
1169
+ * Java's (with or without UNIX_LINES), we compile `.` to custom character
1170
+ * classes.
1171
+ */
1172
+ pIndex += 1 // step over the `.` itself
1173
+ val rejected = { // argument for codePointNotAmong; presumably the chars `.` must not match (confirm there)
1174
+ if (dotAll) " " // `dotAll` is checked first, so it takes precedence over `unixLines`
1175
+ else if (unixLines) " \n "
1176
+ else " \n\r\u0085\u2028\u2029 "
1177
+ }
1178
+ codePointNotAmong(rejected)
1179
+ }
1180
+
1183
1181
private def compileEscape (): String = {
1184
1182
val pattern = this .pattern // local copy
1185
1183
val len = pattern.length()
0 commit comments