Skip to content

Commit 0a436a6

Browse files
committed
Handle comments in character classes.
1 parent 886cce4 commit 0a436a6

File tree

3 files changed

+144
-23
lines changed

3 files changed

+144
-23
lines changed

javalib/src/main/scala/java/util/regex/PatternCompiler.scala

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
package java.util.regex
1414

15-
import scala.annotation.switch
15+
import scala.annotation.{switch, tailrec}
1616

1717
import java.lang.Character.{
1818
charCount,
@@ -977,11 +977,7 @@ private final class PatternCompiler(private val pattern: String, private var fla
977977
pIndex += 1
978978

979979
case '#' if comments =>
980-
// ignore until the end of a line
981-
@inline def isEOL(c: Char): Boolean =
982-
c == '\r' || c == '\n' || c == '\u0085' || c == '\u2028' || c == '\u2029'
983-
while (pIndex != len && !isEOL(pattern.charAt(pIndex)))
984-
pIndex += 1
980+
skipSharpComment()
985981

986982
case '?' | '*' | '+' | '{' =>
987983
parseError("Dangling meta character '" + codePointToString(dispatchCP) + "'")
@@ -1016,6 +1012,49 @@ private final class PatternCompiler(private val pattern: String, private var fla
10161012
// scalastyle:on return
10171013
}
10181014

1015+
/** Skip a '#' comment, up to (but not including) its end-of-line character.
1016+
* Stops at the end of the pattern if no end-of-line character is found.
1017+
* Pre-condition: `comments && pattern.charAt(pIndex) == '#'` is true
1018+
*/
1019+
private def skipSharpComment(): Unit = {
1020+
val pattern = this.pattern // local copy
1021+
val len = pattern.length()
1022+
1023+
@inline def isEOL(c: Char): Boolean =
1024+
c == '\n' || c == '\r' || c == '\u0085' || c == '\u2028' || c == '\u2029'
1025+
1026+
while (pIndex != len && !isEOL(pattern.charAt(pIndex))) // the end-of-line character itself is not consumed
1027+
pIndex += 1
1028+
}
1029+
1030+
/** Skip all whitespace and '#' comments, starting at `pIndex`.
1031+
* Stops at the first other character, or at the end of the pattern.
1032+
* Pre-condition: `comments` is true
1033+
*/
1034+
@noinline
1035+
private def skipComments(): Unit = {
1036+
val pattern = this.pattern // local copy
1037+
val len = pattern.length()
1038+
1039+
@inline @tailrec
1040+
def loop(): Unit = {
1041+
if (pIndex != len) {
1042+
(pattern.charAt(pIndex): @switch) match {
1043+
case ' ' | '\t' | '\n' | '\u000B' | '\f' | '\r' => // whitespace: skip one char and keep going
1044+
pIndex += 1
1045+
loop()
1046+
case '#' => // comment: skip to its end-of-line character and keep going
1047+
skipSharpComment()
1048+
loop()
1049+
case _ => // any other character ends the skipping; pIndex is left on it
1050+
()
1051+
}
1052+
}
1053+
}
1054+
1055+
loop()
1056+
}
1057+
10191058
private def compileRepeater(compiledGroupCountBeforeThisToken: Int, compiledToken: String): String = {
10201059
val pattern = this.pattern // local copy
10211060
val len = pattern.length()
@@ -1638,26 +1677,34 @@ private final class PatternCompiler(private val pattern: String, private var fla
16381677

16391678
while (pIndex != len) {
16401679
def processRangeOrSingleCodePoint(startCodePoint: Int): Unit = {
1641-
@inline def canBeRangeEnd(c: Char): Boolean = c != '[' && c != ']'
1642-
1643-
if (pIndex + 2 <= len && pattern.charAt(pIndex) == '-' &&
1644-
canBeRangeEnd(pattern.charAt(pIndex + 1))) {
1645-
// Range of code points
1680+
if (comments)
1681+
skipComments()
16461682

1683+
if (pIndex != len && pattern.charAt(pIndex) == '-') {
1684+
// Perhaps a range of code points, unless the '-' is followed by '[' or ']'
16471685
pIndex += 1
1648-
if (pIndex + 2 > len)
1686+
if (comments)
1687+
skipComments()
1688+
1689+
if (pIndex == len)
16491690
parseError("Unclosed character class")
16501691

16511692
val cpEnd = pattern.codePointAt(pIndex)
1652-
pIndex += charCount(cpEnd)
1653-
val endCodePoint =
1654-
if (cpEnd == '\\') parseSingleCodePointEscape()
1655-
else cpEnd
1656-
1657-
if (endCodePoint < startCodePoint)
1658-
parseError("Illegal character range")
16591693

1660-
builder.addCodePointRange(startCodePoint, endCodePoint)
1694+
if (cpEnd == '[' || cpEnd == ']') {
1695+
// Oops, it wasn't a range after all
1696+
builder.addSingleCodePoint(startCodePoint)
1697+
builder.addSingleCodePoint('-')
1698+
} else {
1699+
// Range of code points
1700+
pIndex += charCount(cpEnd)
1701+
val endCodePoint =
1702+
if (cpEnd == '\\') parseSingleCodePointEscape()
1703+
else cpEnd
1704+
if (endCodePoint < startCodePoint)
1705+
parseError("Illegal character range")
1706+
builder.addCodePointRange(startCodePoint, endCodePoint)
1707+
}
16611708
} else {
16621709
// Single code point
16631710
builder.addSingleCodePoint(startCodePoint)
@@ -1702,6 +1749,11 @@ private final class PatternCompiler(private val pattern: String, private var fla
17021749
processRangeOrSingleCodePoint(parseSingleCodePointEscape())
17031750
}
17041751

1752+
case ' ' | '\t' | '\n' | '\u000B' | '\f' | '\r' if comments =>
1753+
pIndex += 1
1754+
case '#' if comments =>
1755+
skipSharpComment()
1756+
17051757
case codePoint =>
17061758
pIndex += charCount(codePoint)
17071759
processRangeOrSingleCodePoint(codePoint)

linker/shared/src/test/scala/org/scalajs/linker/LibrarySizeTest.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,9 @@ class LibrarySizeTest {
7070
)
7171

7272
testLinkedSizes(
73-
expectedFastLinkSize = 185765,
74-
expectedFullLinkSizeWithoutClosure = 172906,
75-
expectedFullLinkSizeWithClosure = 31670,
73+
expectedFastLinkSize = 188076,
74+
expectedFullLinkSizeWithoutClosure = 175219,
75+
expectedFullLinkSizeWithClosure = 32036,
7676
classDefs,
7777
moduleInitializers = MainTestModuleInitializers
7878
)

test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,75 @@ class RegexEngineTest {
731731

732732
assertMatches(lotsOfComments, "abc\u0085d\u2028e\u2029fghi")
733733

734+
val commentsInCharClass = compile(
735+
"[\n" +
736+
" A-Z # an uppercase letter\n" +
737+
" _ \t # or an underscore\n" +
738+
" f - # gosh, we can even have ranges\n" +
739+
" j # split by comments!\n" +
740+
"]",
741+
Comments)
742+
743+
assertMatches(commentsInCharClass, "A")
744+
assertMatches(commentsInCharClass, "F")
745+
assertMatches(commentsInCharClass, "R")
746+
assertMatches(commentsInCharClass, "Z")
747+
assertMatches(commentsInCharClass, "f")
748+
assertMatches(commentsInCharClass, "g")
749+
assertMatches(commentsInCharClass, "h")
750+
assertMatches(commentsInCharClass, "i")
751+
assertMatches(commentsInCharClass, "j")
752+
assertMatches(commentsInCharClass, "_")
753+
assertNotFind(commentsInCharClass, " ")
754+
assertNotFind(commentsInCharClass, "\t")
755+
assertNotFind(commentsInCharClass, "\n")
756+
assertNotFind(commentsInCharClass, "#")
757+
assertNotFind(commentsInCharClass, "-")
758+
assertNotFind(commentsInCharClass, "!")
759+
assertNotFind(commentsInCharClass, "a")
760+
assertNotFind(commentsInCharClass, "e")
761+
assertNotFind(commentsInCharClass, "k")
762+
assertNotFind(commentsInCharClass, "l")
763+
assertNotFind(commentsInCharClass, "z")
764+
765+
val fakeRangeWithComments = compile("[A-D G # comment\n -]", Comments)
766+
assertMatches(fakeRangeWithComments, "A")
767+
assertMatches(fakeRangeWithComments, "C")
768+
assertMatches(fakeRangeWithComments, "D")
769+
assertMatches(fakeRangeWithComments, "G")
770+
assertMatches(fakeRangeWithComments, "-")
771+
assertNotMatches(fakeRangeWithComments, "E")
772+
assertNotMatches(fakeRangeWithComments, "I")
773+
assertNotMatches(fakeRangeWithComments, "e")
774+
assertNotMatches(fakeRangeWithComments, "]")
775+
assertNotMatches(fakeRangeWithComments, " ")
776+
assertNotMatches(fakeRangeWithComments, "\n")
777+
assertNotMatches(fakeRangeWithComments, "#")
778+
779+
/* If there is whitespace or a comment between the '-' and the ']', the JVM does not
780+
* detect that it is a fake range, and reports a syntax error. Our
781+
* implementation correctly detects that case, because it was easier than
782+
* not detecting it.
783+
*/
784+
if (executingInJVM) {
785+
assertSyntaxError("[A-D G - ]", Comments, "irrelevant", 0)
786+
assertSyntaxError("[A-D G -# comment\n]", Comments, "irrelevant", 0)
787+
} else {
788+
val fakeRangeWithCommentsOnRHS = compile("[A-D G - # comment\n ]", Comments)
789+
assertMatches(fakeRangeWithCommentsOnRHS, "A")
790+
assertMatches(fakeRangeWithCommentsOnRHS, "C")
791+
assertMatches(fakeRangeWithCommentsOnRHS, "D")
792+
assertMatches(fakeRangeWithCommentsOnRHS, "G")
793+
assertMatches(fakeRangeWithCommentsOnRHS, "-")
794+
assertNotMatches(fakeRangeWithCommentsOnRHS, "E")
795+
assertNotMatches(fakeRangeWithCommentsOnRHS, "I")
796+
assertNotMatches(fakeRangeWithCommentsOnRHS, "e")
797+
assertNotMatches(fakeRangeWithCommentsOnRHS, "]")
798+
assertNotMatches(fakeRangeWithCommentsOnRHS, " ")
799+
assertNotMatches(fakeRangeWithCommentsOnRHS, "\n")
800+
assertNotMatches(fakeRangeWithCommentsOnRHS, "#")
801+
}
802+
734803
// We can still match against whitespace in the input
735804
assertMatches("\ta\\ b\t", Comments, "a b")
736805
assertMatches("\ta.b\t", Comments, "a b")

0 commit comments

Comments
 (0)