@@ -308,7 +308,11 @@ class RegexEngineTest {
308
308
assertMatches(" a\\ +" , " a+" )
309
309
assertMatches(" a\\ {3}" , " a{3}" )
310
310
assertMatches(" \\ (5\\ )" , " (5)" )
311
+
312
+ // Escapes for characters that are syntax characters only when using Comments (the escapes work with or without that flag)
313
+ assertMatches(" \\ \\\t\\\n\\\u000B\\\f\\\r " , " \t\n\u000B\f\r " )
311
314
assertMatches(" \\ \\\t\\\n\\\u000B\\\f\\\r " , Comments , " \t\n\u000B\f\r " )
315
+ assertMatches(" \\ #abc" , " #abc" )
312
316
assertMatches(" \\ #abc" , Comments , " #abc" )
313
317
314
318
// Letter escapes for special chars
@@ -327,7 +331,10 @@ class RegexEngineTest {
327
331
assertMatches(" \\ c_" , " \u001F " )
328
332
assertMatches(" \\ c?" , 0x007f .toChar.toString())
329
333
330
- // More control escapes that are not really meant to be used
334
+ /* More control escapes that are not really meant to be used.
335
+ * In general, '\cx' means `x ^ 0x40`, as explained at
336
+ * https://stackoverflow.com/questions/35208570/java-regular-expression-cx-control-characters
337
+ */
331
338
assertMatches(" \\ cb" , 0x0022 .toChar.toString())
332
339
assertMatches(" \\ c" + GClefHigh , " " + (0xd834 ^ 0x40 ).toChar)
333
340
assertMatches(" \\ c" + GClef , GClefHigh + (0xdd1e ^ 0x40 ).toChar)
@@ -433,8 +440,10 @@ class RegexEngineTest {
433
440
assertFind(repeatedSupplementaryCodePoint, " bca\uD834\uDD1E\uDD1E def" , 2 , 5 )
434
441
435
442
// After a \Q...\E quote, a quantifier applies only to the last code point of the quote (sic!)
436
- assertFind(" a\\ Qbc\\ d\\ E*" , " aaabc\\ dbc\\ dbc" , 2 , 7 )
437
- assertFind(" a\\ Qbc\\ b\\ E*" , " aaabc\\ bbc\\ bbc" , 2 , 8 )
443
+ val repeatedQuote = compile(" a\\ Qbc\\ d\\ E*" )
444
+ assertFind(repeatedQuote, " aaabc\\ dbc\\ dbc" , 2 , 7 )
445
+ assertFind(repeatedQuote, " aaabc\\ ddc" , 2 , 8 )
446
+ assertFind(repeatedQuote, " aaabc\\ bc" , 2 , 6 )
438
447
439
448
val repeatedQuoteEndingWithSupplementaryCodePoint = compile(" a\\ Qbc\\\uD834\uDD1E\\ E*" )
440
449
assertFind(repeatedQuoteEndingWithSupplementaryCodePoint, " aaabc\\\uD834\uDD1E\uD834\uDD1E bc" , 2 , 10 )
@@ -443,29 +452,54 @@ class RegexEngineTest {
443
452
}
444
453
445
454
@ Test def lazyQuantifiers (): Unit = {
446
- val starLazy = compile(" ba*?" )
447
- assertMatches(starLazy, " b" )
448
- assertMatches(starLazy, " ba" )
449
- assertMatches(starLazy, " baaaaa" )
450
- assertFind(starLazy, " cbaaassefaa" , 1 , 2 )
451
- assertFind(starLazy, " cbsssefaaaaa" , 1 , 2 )
452
- assertNotFind(starLazy, " qsessqsssddff" )
453
-
454
- val plusLazy = compile(" ba+?" )
455
- assertMatches(plusLazy, " ba" )
456
- assertMatches(plusLazy, " baaaa" )
457
- assertNotFind(plusLazy, " b" )
458
- assertFind(plusLazy, " cbaaassefaa" , 1 , 3 )
459
- assertFind(plusLazy, " cbsssefbaaa" , 7 , 9 )
460
- assertNotFind(plusLazy, " qsebsqsbsddfb" )
461
-
462
- val questionLazy = compile(" ba??" )
463
- assertMatches(questionLazy, " b" )
464
- assertMatches(questionLazy, " ba" )
465
- assertNotMatches(questionLazy, " baa" )
466
- assertFind(questionLazy, " cbaaassefaa" , 1 , 2 )
467
- assertFind(questionLazy, " cbssefbaaa" , 1 , 2 )
468
- assertNotFind(questionLazy, " qsessqsssddff" )
455
+ val starLazy = compile(" a[bc]*?b" )
456
+ assertMatches(starLazy, " ab" )
457
+ assertMatches(starLazy, " abbbb" )
458
+ assertMatches(starLazy, " abccbb" )
459
+ assertFind(starLazy, " abbb" , 0 , 2 )
460
+ assertFind(starLazy, " accbbbccb" , 0 , 4 )
461
+ assertNotFind(starLazy, " accc" )
462
+
463
+ val starLazyAtEnd = compile(" ba*?" )
464
+ assertMatches(starLazyAtEnd, " b" )
465
+ assertMatches(starLazyAtEnd, " ba" )
466
+ assertMatches(starLazyAtEnd, " baaaaa" )
467
+ assertFind(starLazyAtEnd, " cbaaassefaa" , 1 , 2 )
468
+ assertFind(starLazyAtEnd, " cbsssefaaaaa" , 1 , 2 )
469
+ assertNotFind(starLazyAtEnd, " qsessqsssddff" )
470
+
471
+ val plusLazy = compile(" a[bc]+?b" )
472
+ assertMatches(plusLazy, " abb" )
473
+ assertMatches(plusLazy, " acb" )
474
+ assertMatches(plusLazy, " abbbcccbb" )
475
+ assertFind(plusLazy, " abbb" , 0 , 3 )
476
+ assertFind(plusLazy, " accbbbccb" , 0 , 4 )
477
+ assertNotFind(plusLazy, " accc" )
478
+ assertNotFind(plusLazy, " ab" )
479
+
480
+ val plusLazyAtEnd = compile(" ba+?" )
481
+ assertMatches(plusLazyAtEnd, " ba" )
482
+ assertMatches(plusLazyAtEnd, " baaaa" )
483
+ assertNotFind(plusLazyAtEnd, " b" )
484
+ assertFind(plusLazyAtEnd, " cbaaassefaa" , 1 , 3 )
485
+ assertFind(plusLazyAtEnd, " cbsssefbaaa" , 7 , 9 )
486
+ assertNotFind(plusLazyAtEnd, " qsebsqsbsddfb" )
487
+
488
+ val questionLazy = compile(" a[bc]??b" )
489
+ assertMatches(questionLazy, " ab" )
490
+ assertMatches(questionLazy, " abb" )
491
+ assertMatches(questionLazy, " acb" )
492
+ assertFind(questionLazy, " abbb" , 0 , 2 )
493
+ assertFind(questionLazy, " acbbbccb" , 0 , 3 )
494
+ assertNotFind(questionLazy, " accbb" )
495
+
496
+ val questionLazyAtEnd = compile(" ba??" )
497
+ assertMatches(questionLazyAtEnd, " b" )
498
+ assertMatches(questionLazyAtEnd, " ba" )
499
+ assertNotMatches(questionLazyAtEnd, " baa" )
500
+ assertFind(questionLazyAtEnd, " cbaaassefaa" , 1 , 2 )
501
+ assertFind(questionLazyAtEnd, " cbssefbaaa" , 1 , 2 )
502
+ assertNotFind(questionLazyAtEnd, " qsessqsssddff" )
469
503
}
470
504
471
505
@ Test def possessiveQuantifiers (): Unit = {
@@ -477,7 +511,7 @@ class RegexEngineTest {
477
511
val plusPossessive = compile(" ab++[bd]" )
478
512
assertFind(plusPossessive, " a abbbb abbbdba " , 11 , 16 )
479
513
assertFind(plusPossessive, " a abbbb adba abdd " , 17 , 20 )
480
- assertNotFind(plusPossessive, " a abbbb dab adba " )
514
+ assertNotFind(plusPossessive, " a ad abbbb dab adba " )
481
515
482
516
val questionPossessive = compile(" ab?+[bd]" )
483
517
assertFind(questionPossessive, " a ab abb abb " , 9 , 12 )
@@ -576,7 +610,7 @@ class RegexEngineTest {
576
610
assertMatches(dotAll, " \u2029 " )
577
611
578
612
assertNotMatches(dotAll, " \r\n " )
579
- assertFind(dotUnixLines , " \r\n " , 0 , 1 )
613
+ assertFind(dotAll , " \r\n " , 0 , 1 )
580
614
581
615
val dotAllUnixLines = compile(" ." , DotAll | UnixLines )
582
616
@@ -594,7 +628,7 @@ class RegexEngineTest {
594
628
assertMatches(dotAllUnixLines, " \u2029 " )
595
629
596
630
assertNotMatches(dotAllUnixLines, " \r\n " )
597
- assertFind(dotUnixLines , " \r\n " , 0 , 1 )
631
+ assertFind(dotAllUnixLines , " \r\n " , 0 , 1 )
598
632
599
633
// Test case for #1847, and for the (?s) leading flag
600
634
val codeMatcher = Pattern .compile(" (?s).*<code>(.*?)</code>.*" )
@@ -683,7 +717,7 @@ class RegexEngineTest {
683
717
}
684
718
685
719
@ Test def comments (): Unit = {
686
- val abc = compile(
720
+ val lotsOfComments = compile(
687
721
" \t a # a comment is interrupted by \r " +
688
722
" b # or \n " +
689
723
" c # or \u0085 " +
@@ -695,7 +729,17 @@ class RegexEngineTest {
695
729
" i" ,
696
730
Comments )
697
731
698
- assertMatches(abc, " abc\u0085 d\u2028 e\u2029 fghi" )
732
+ assertMatches(lotsOfComments, " abc\u0085 d\u2028 e\u2029 fghi" )
733
+
734
+ // We can still match against whitespace in the input
735
+ assertMatches(" \t a\\ b\t " , Comments , " a b" )
736
+ assertMatches(" \t a.b\t " , Comments , " a b" )
737
+ assertMatches(" \t a[\\ c]b\t " , Comments , " a b" )
738
+
739
+ // We can still match against '#' in the input
740
+ assertMatches(" \t a\\ #b\t " , Comments , " a#b" )
741
+ assertMatches(" \t a.b\t " , Comments , " a#b" )
742
+ assertMatches(" \t a[\\ #c]b\t " , Comments , " a#b" )
699
743
}
700
744
701
745
@ Test def predefinedCharacterClasses (): Unit = {
@@ -1571,18 +1615,18 @@ class RegexEngineTest {
1571
1615
assertNotMatches(an_and_ks, " A" )
1572
1616
assertNotMatches(an_and_ks, " N" )
1573
1617
1574
- val az_butNot_def = compile(" [a-z&&[^dfh]]" )
1575
- assertMatches(az_butNot_def , " a" )
1576
- assertMatches(az_butNot_def , " c" )
1577
- assertMatches(az_butNot_def , " e" )
1578
- assertMatches(az_butNot_def , " i" )
1579
- assertMatches(az_butNot_def , " r" )
1580
- assertNotMatches(az_butNot_def , " d" )
1581
- assertNotMatches(az_butNot_def , " f" )
1582
- assertNotMatches(az_butNot_def , " h" )
1583
- assertNotMatches(az_butNot_def , " A" )
1584
- assertNotMatches(az_butNot_def , " 0" )
1585
- assertNotMatches(az_butNot_def , " \n " )
1618
+ val az_butNot_dfh = compile(" [a-z&&[^dfh]]" )
1619
+ assertMatches(az_butNot_dfh , " a" )
1620
+ assertMatches(az_butNot_dfh , " c" )
1621
+ assertMatches(az_butNot_dfh , " e" )
1622
+ assertMatches(az_butNot_dfh , " i" )
1623
+ assertMatches(az_butNot_dfh , " r" )
1624
+ assertNotMatches(az_butNot_dfh , " d" )
1625
+ assertNotMatches(az_butNot_dfh , " f" )
1626
+ assertNotMatches(az_butNot_dfh , " h" )
1627
+ assertNotMatches(az_butNot_dfh , " A" )
1628
+ assertNotMatches(az_butNot_dfh , " 0" )
1629
+ assertNotMatches(az_butNot_dfh , " \n " )
1586
1630
1587
1631
val az_butNot_mp = compile(" [a-z&&[^m-p]]" )
1588
1632
assertMatches(az_butNot_mp, " a" )
@@ -1877,40 +1921,20 @@ class RegexEngineTest {
1877
1921
assertFind(lineBreakUnixLines, " ab\n\n cd" , 2 , 3 )
1878
1922
}
1879
1923
1880
- @ Test def matchesWithNonGreedyOperators (): Unit = {
1881
- val opt = compile(" ab??" )
1882
- assertMatches(opt, " a" )
1883
- assertMatches(opt, " ab" )
1884
- assertFind(opt, " ab" , 0 , 1 )
1885
-
1886
- val star = compile(" ab*?" )
1887
- assertMatches(star, " a" )
1888
- assertMatches(star, " ab" )
1889
- assertMatches(star, " abbbbbb" )
1890
- assertFind(star, " abbbb" , 0 , 1 )
1891
-
1892
- val plus = compile(" ab+?" )
1893
- assertNotFind(plus, " a" )
1894
- assertMatches(plus, " ab" )
1895
- assertMatches(plus, " abb" )
1896
- assertMatches(plus, " abbbbbb" )
1897
- assertFind(plus, " abbbb" , 0 , 2 )
1898
- }
1899
-
1900
1924
@ Test def namedCaptureGroups (): Unit = {
1901
- val named = compile(raw " .*((?<pizza>Pizza).*?)+ " )
1925
+ val named = compile(" .*((?<pizza>Pizza).*?)+" )
1902
1926
val m = assertMatchesAndGroupsEquals(named, " PizzaWithPizza" , " Pizza" , " Pizza" )
1903
1927
assertEquals(" Pizza" , m.group(" pizza" ))
1904
1928
1905
- val ref = compile(raw " (?<pizza>Pizza)\k<pizza>*? " )
1929
+ val ref = compile(" (?<pizza>Pizza)\ \ k<pizza>*?" )
1906
1930
assertMatches(ref, " Pizza" )
1907
1931
assertMatches(ref, " PizzaPizza" )
1908
1932
assertMatches(ref, " PizzaPizzaPizza" )
1909
1933
assertNotMatches(ref, " PizzaPizzicatoPizza" )
1910
1934
1911
- assertSyntaxError(""" (?<A>a?)\k<B>?"" " , " named capturing group <B> does not exit" , 12 )
1935
+ assertSyntaxError(" (?<A>a?)\\ k<B>?" , " named capturing group <B> does not exit" , 12 )
1912
1936
1913
- assertSyntaxError(""" (?<A>a?)(?<A>dupe)"" " , " named capturing group <A> is already defined" , 12 )
1937
+ assertSyntaxError(" (?<A>a?)(?<A>dupe)" , " named capturing group <A> is already defined" , 12 )
1914
1938
}
1915
1939
1916
1940
@ Test def recursiveCapturingGroups (): Unit = {
@@ -1922,18 +1946,18 @@ class RegexEngineTest {
1922
1946
assertNotMatches(rec, " aaa" )
1923
1947
1924
1948
// The JVM kind of supports "back references" to later groups, but we don't
1925
- assertSyntaxErrorInJS(""" (a?\2?)(b?\1?)"" " , " numbered capturing group <2> does not exist" , 4 )
1949
+ assertSyntaxErrorInJS(" (a?\\ 2?)(b?\\ 1?)" , " numbered capturing group <2> does not exist" , 4 )
1926
1950
1927
1951
// The JVM tolerates "back references" to non-existing groups, but we don't
1928
- assertSyntaxErrorInJS(""" (a?\3?)(b?\1?)"" " , " numbered capturing group <3> does not exist" , 4 )
1952
+ assertSyntaxErrorInJS(" (a?\\ 3?)(b?\\ 1?)" , " numbered capturing group <3> does not exist" , 4 )
1929
1953
1930
- val namedRec = compile(raw " (?<A>a?\k<A>?)\k<A> " )
1954
+ val namedRec = compile(" (?<A>a?\\ k<A>?)\ \ k<A>" )
1931
1955
assertMatches(namedRec, " aa" )
1932
1956
assertMatches(namedRec, " " )
1933
1957
assertNotMatches(namedRec, " ab" )
1934
1958
assertNotMatches(namedRec, " a" )
1935
1959
1936
- assertSyntaxError(""" (?<A>a?\k<B>?)(?<B>b?\k<A>?)"" " , " named capturing group <B> does not exit" , 11 )
1960
+ assertSyntaxError(" (?<A>a?\\ k<B>?)(?<B>b?\\ k<A>?)" , " named capturing group <B> does not exit" , 11 )
1937
1961
}
1938
1962
1939
1963
@ Test def backReferenceLimit (): Unit = {
@@ -2015,6 +2039,24 @@ class RegexEngineTest {
2015
2039
* When the groups are fetched in the original code, we check the groups
2016
2040
* here. Otherwise, we don't, even if there are capturing groups in the
2017
2041
* regex.
2042
+ *
2043
+ * These tests only really test that the regexes still work, but not that
2044
+ * they work *in the same way* as before. In fact, they don't for some
2045
+ * corner cases. By inspection, all the regexes below use features in 4
2046
+ * categories:
2047
+ *
2048
+ * - Features whose semantics are equivalent in `js.RegExp` and `Pattern`,
2049
+ * notably ASCII characters, repeaters, classes of ASCII characters, the
2050
+ * '\d' character class, the '^' and '$' boundary matchers (without
2051
+ * multiline).
2052
+ * - The '.', which *is* different: it matches '\x85' in `js.RegExp` but not
2053
+ * in `Pattern`; this was judged acceptable as unlikely to cause a real
2054
+ * difference in practice.
2055
+ * - One regex uses the `CASE_INSENSITIVE` flag with a pattern that contains only
2056
+ * ASCII letters: it now really only matches other ASCII letters; this was
2057
+ * judged acceptable as probably the intended meaning anyway.
2058
+ * - One regex uses '\s' and '\S', for which we obtained confirmation from
2059
+ * the maintainer that the change in semantics was not an issue.
2018
2060
*/
2019
2061
@ Test def regexesFoundInLibraries (): Unit = {
2020
2062
// scalastyle:off line.size.limit
0 commit comments