From 9b45e962aa1e07e00d6cefc0a60dc7a7a85b2b59 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 24 Mar 2025 15:41:34 +0100 Subject: [PATCH 1/6] Never return false from an override of applyPropertyAlias --- .../org/unicode/jsp/UnicodeSetUtilities.java | 30 +++++++++++-------- .../org/unicode/props/UnicodeProperty.java | 8 +++-- .../props/UnicodePropertySymbolTable.java | 28 ++++++++++------- .../unicode/text/UCD/UnicodeMapParser.java | 3 +- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 6e3e40f96..28a09c5f0 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -195,34 +195,41 @@ public boolean applyPropertyAlias( } else { int debug = 0; } - status = applyPropertyAlias0(prop, propertyValue, result, invert); + applyPropertyAlias0(prop, propertyValue, result, invert); + status = true; } else { try { - status = applyPropertyAlias0(gcProp, versionlessPropertyName, result, invert); + applyPropertyAlias0(gcProp, versionlessPropertyName, result, invert); + status = true; } catch (Exception e) { } ; if (!status) { try { - status = - applyPropertyAlias0( - scProp, versionlessPropertyName, result, invert); + applyPropertyAlias0(scProp, versionlessPropertyName, result, invert); + status = true; } catch (Exception e) { } if (!status) { if (prop.isType(UnicodeProperty.BINARY_OR_ENUMERATED_OR_CATALOG_MASK)) { try { - status = applyPropertyAlias0(prop, "No", result, !invert); + applyPropertyAlias0(prop, "No", result, !invert); + status = true; } catch (Exception e) { } } if (!status) { - status = applyPropertyAlias0(prop, "", result, invert); + applyPropertyAlias0(prop, "", result, invert); + status = true; } } } } - return status; + if (!status) { + throw new IllegalArgumentException( + "Invalid query-expression " + propertyName + "=" + propertyValue); + } + return true; } private static Map @@ -245,8 +252,7 @@ public boolean applyPropertyAlias( UcdPropertyValues.General_Category_Values.Separator, new String[] {"Zl", "Zp", "Zs"}); - // TODO(eggrobin): I think this function only ever returns true; might as well make it void. - private boolean applyPropertyAlias0( + private void applyPropertyAlias0( UnicodeProperty prop, String propertyValue, UnicodeSet result, boolean invert) { result.clear(); String propertyName = prop.getName(); @@ -337,7 +343,7 @@ private boolean applyPropertyAlias0( for (var value : entry.getValue()) { prop.getSet(value, result); } - return true; + return; } } } @@ -375,7 +381,7 @@ private boolean applyPropertyAlias0( } } result.addAll(set); - return true; + return; } throw new IllegalArgumentException("Illegal property: " + propertyName); } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 4e8c06d5f..40433a357 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -1003,10 +1003,12 @@ public boolean applyPropertyAlias( String propertyName, String propertyValue, UnicodeSet result) { if (false) System.out.println(propertyName + "=" + propertyValue); UnicodeProperty prop = getProperty(propertyName); - if (prop == null) return false; + if (prop == null) { + throw new IllegalArgumentException("No property " + propertyName); + } result.clear(); - UnicodeSet x = prop.getSet(propertyValue, result); - return x.size() != 0; + prop.getSet(propertyValue, result); + return true; } } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java b/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java index 5a495783e..e2f88dbc5 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java @@ -74,24 +74,29 @@ public boolean applyPropertyAlias( } propertyValue = propertyValue.trim(); if (propertyValue.length() != 0) { - status = applyPropertyAlias0(propertyName, propertyValue, result); + applyPropertyAlias0(propertyName, propertyValue, result); + status = true; } else { try { - status = applyPropertyAlias0("gc", propertyName, result); + applyPropertyAlias0("gc", propertyName, result); + status = true; } catch (Exception e) { } if (!status) { try { - status = applyPropertyAlias0("sc", propertyName, result); + applyPropertyAlias0("sc", propertyName, result); + status = true; } catch (Exception e) { } if (!status) { try { - status = applyPropertyAlias0(propertyName, "Yes", result); + applyPropertyAlias0(propertyName, "Yes", result); + status = true; } catch (Exception e) { } if (!status) { - status = applyPropertyAlias0(propertyName, "", result); + applyPropertyAlias0(propertyName, "", result); + status = true; } } } @@ -99,7 +104,11 @@ public boolean applyPropertyAlias( if (status && invert) { result.complement(); } - return status; + if (!status) { + throw new IllegalArgumentException( + "Invalid query-expression " + propertyName + "=" + propertyValue); + } + return true; } static final HashMap GC_REMAP = new HashMap(); @@ -131,8 +140,7 @@ public boolean applyPropertyAlias( GC_REMAP.put("separator", GC_REMAP.get("z")); } - public boolean applyPropertyAlias0( - String propertyName, String propertyValue, UnicodeSet result) { + public void applyPropertyAlias0(String propertyName, String propertyValue, UnicodeSet result) { result.clear(); UnicodeProperty prop = factory.getProperty(propertyName); String canonicalName = prop.getName(); @@ -145,7 +153,7 @@ public boolean applyPropertyAlias0( for (String part : parts) { prop.getSet(part, result); } - return true; + return; } } @@ -227,7 +235,7 @@ public boolean applyPropertyAlias0( set = prop.getSet(patternMatcher); } result.addAll(set); - return true; + return; } throw new IllegalArgumentException("Illegal property: " + propertyName); } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java b/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java index b527fbfdc..219eb1146 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java @@ -105,7 +105,8 @@ public boolean applyPropertyAlias( return true; } } - return false; + throw new IllegalArgumentException( + "Invalid UnicodeMap query-expression " + propertyName + "=" + propertyValue); } } From 219ef05da569c0a83c5ab7ae99771dd2f03cf687 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 24 Mar 2025 15:44:51 +0100 Subject: [PATCH 2/6] Work around deficient symbol table --- .../main/resources/org/unicode/tools/SegmenterDefault.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt index e3f0bbbfe..c108c0144 100644 --- a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt +++ b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt @@ -20,7 +20,7 @@ $ConjunctLinker=\p{Indic_Conjunct_Break=Linker} $LinkingConsonant=\p{Indic_Conjunct_Break=Consonant} ## $E_Base=\p{Grapheme_Cluster_Break=E_Base} ## $E_Modifier=\p{Grapheme_Cluster_Break=E_Modifier} -$ExtPict=\p{Extended_Pictographic} +$ExtPict=\p{Extended_Pictographic=True} $ConjunctExtender=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}] ## $EBG=\p{Grapheme_Cluster_Break=E_Base_GAZ} ## $Glue_After_Zwj=\p{Grapheme_Cluster_Break=Glue_After_Zwj} @@ -124,7 +124,7 @@ $DottedCircle = [◌] $CPmEastAsian=[$CP-$EastAsian] $OPmEastAsian=[$OP-$EastAsian] -$ExtPictUnassigned=[\p{Extended_Pictographic}&\p{gc=Cn}] +$ExtPictUnassigned=[\p{Extended_Pictographic=True}&\p{gc=Cn}] # Some rules refer to the start and end of text. We could just use a literal ^ for sot, but naming # it as in the spec makes it easier to compare. The parser will eat (and choke on) $, so we play a @@ -364,7 +364,7 @@ $Single_Quote=\p{Word_Break=Single_Quote} ## $E_Modifier=\p{Word_Break=E_Modifier} $ZWJ=\p{Word_Break=ZWJ} # Note: The following may overlap with the above -$ExtPict=\p{Extended_Pictographic} +$ExtPict=\p{Extended_Pictographic=True} ## $EBG=\p{Word_Break=E_Base_GAZ} ## $Glue_After_Zwj=\p{Word_Break=Glue_After_Zwj} $WSegSpace=\p{Word_Break=WSegSpace} From b94c664a6ae5188bc252f498d0a20b7c12dd872f Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 24 Mar 2025 15:55:30 +0100 Subject: [PATCH 3/6] Regenerate UCD --- .../data/ucd/dev/auxiliary/GraphemeBreakTest.html | 12 ++++++------ .../data/ucd/dev/auxiliary/GraphemeBreakTest.txt | 6 +++--- .../data/ucd/dev/auxiliary/LineBreakTest.html | 4 ++-- .../data/ucd/dev/auxiliary/WordBreakTest.html | 14 +++++++------- .../data/ucd/dev/auxiliary/WordBreakTest.txt | 6 +++--- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.html index a83d5c475..58e55f556 100644 --- a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.html +++ b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.html @@ -7,7 +7,7 @@

Grapheme_Cluster_Break Chart

Unicode Version: 17.0.0

-

Date: 2025-02-14, 00:14:44 GMT

+

Date: 2025-03-24, 14:45:55 GMT

This page illustrates the application of the Grapheme_Cluster_Break specification. The material here is informative, not normative.

The first chart shows where breaks would appear between different sample characters or strings. The sample characters are chosen mechanically to represent the different properties used by the specification.

Each cell shows the break-status for the position between the character(s) in its row header and the character(s) in its column header. The × symbol indicates no break, while the ÷ symbol indicates a break. The cells with × are also shaded to make it easier to scan the table. For example, in the cell at the intersection of the row headed by “CR” and the column headed by “LF”, there is a × symbol, indicating that there is no break between CR and LF.

After the heavy blue line in the table are additional rows, either with different sample characters or for sequences.

In the row and column headers of the Table, in the Rules, when hovering over characters in the Samples, and in the comments in the associated list of test cases GraphemeBreakTest.txt:

  1. The following sets are used:
      @@ -24,7 +24,7 @@

      Grapheme_Cluster_Break Chart

    • ExtPict = -\p{Extended_Pictographic} +\p{Extended_Pictographic=True}
    • LinkingConsonant @@ -232,15 +232,15 @@

      Sample Strings

      23 -     -   -   +     +   +   24   a      -   +   25 diff --git a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.txt index 6e4f049ab..e1215547c 100644 --- a/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.txt +++ b/unicodetools/data/ucd/dev/auxiliary/GraphemeBreakTest.txt @@ -1,5 +1,5 @@ # GraphemeBreakTest-17.0.0.txt -# Date: 2025-02-14, 00:14:44 GMT +# Date: 2025-03-24, 14:45:55 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -768,8 +768,8 @@ ÷ 1F476 × 1F3FF × 0308 × 200D × 1F476 × 1F3FF ÷ # ÷ [0.2] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend_ConjunctExtendermConjunctLinker) × [9.0] COMBINING DIAERESIS (Extend_ConjunctExtendermConjunctLinker) × [9.0] ZERO WIDTH JOINER (ZWJ) × [11.0] BABY (ExtPict) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend_ConjunctExtendermConjunctLinker) ÷ [0.3] ÷ 1F6D1 × 200D × 1F6D1 ÷ # ÷ [0.2] OCTAGONAL SIGN (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3] ÷ 0061 × 200D ÷ 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (XXmLinkingConsonantmExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3] -÷ 2701 × 200D × 2701 ÷ # ÷ [0.2] UPPER BLADE SCISSORS (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ) × [11.0] UPPER BLADE SCISSORS (ExtPict) ÷ [0.3] -÷ 0061 × 200D ÷ 2701 ÷ # ÷ [0.2] LATIN SMALL LETTER A (XXmLinkingConsonantmExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] UPPER BLADE SCISSORS (ExtPict) ÷ [0.3] +÷ 2701 × 200D ÷ 2701 ÷ # ÷ [0.2] UPPER BLADE SCISSORS (XXmLinkingConsonantmExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] UPPER BLADE SCISSORS (XXmLinkingConsonantmExtPict) ÷ [0.3] +÷ 0061 × 200D ÷ 2701 ÷ # ÷ [0.2] LATIN SMALL LETTER A (XXmLinkingConsonantmExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] UPPER BLADE SCISSORS (XXmLinkingConsonantmExtPict) ÷ [0.3] ÷ 0915 ÷ 0924 ÷ # ÷ [0.2] DEVANAGARI LETTER KA (LinkingConsonant) ÷ [999.0] DEVANAGARI LETTER TA (LinkingConsonant) ÷ [0.3] ÷ 0915 × 094D × 0924 ÷ # ÷ [0.2] DEVANAGARI LETTER KA (LinkingConsonant) × [9.0] DEVANAGARI SIGN VIRAMA (Extend_ConjunctLinker) × [9.3] DEVANAGARI LETTER TA (LinkingConsonant) ÷ [0.3] ÷ 0915 × 094D × 094D × 0924 ÷ # ÷ [0.2] DEVANAGARI LETTER KA (LinkingConsonant) × [9.0] DEVANAGARI SIGN VIRAMA (Extend_ConjunctLinker) × [9.0] DEVANAGARI SIGN VIRAMA (Extend_ConjunctLinker) × [9.3] DEVANAGARI LETTER TA (LinkingConsonant) ÷ [0.3] diff --git a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html index a00ddfe78..7873e3d71 100644 --- a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html +++ b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html @@ -7,7 +7,7 @@

      Line_Break Chart

      Unicode Version: 17.0.0

      -

      Date: 2025-02-14, 17:30:27 GMT

      +

      Date: 2025-03-24, 14:45:57 GMT

      This page illustrates the application of the Line_Break specification. The material here is informative, not normative.

      The first chart shows where breaks would appear between different sample characters or strings. The sample characters are chosen mechanically to represent the different properties used by the specification.

      Each cell shows the break-status for the position between the character(s) in its row header and the character(s) in its column header. The symbol × indicates a prohibited break, even with intervening spaces; the ÷ symbol indicates a (direct) break; the symbol ∻ indicates a break only in the presence of an intervening space (an indirect break).The cells with × or ∻ are also shaded to make it easier to scan the table. For example, in the cell at the intersection of the row headed by “CR” and the column headed by “LF”, there is a × symbol, indicating that there is no break between CR and LF.

      In the row and column headers of the Table, in the Rules, when hovering over characters in the Samples, and in the comments in the associated list of test cases LineBreakTest.txt:

      1. The following sets are used:
          @@ -49,7 +49,7 @@

          Line_Break Chart

        • ExtPictUnassigned = -[\p{Extended_Pictographic}&\p{gc=Cn}] +[\p{Extended_Pictographic=True}&\p{gc=Cn}]
        • NS diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html index 4dbf79b3a..580854c64 100644 --- a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html +++ b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html @@ -7,7 +7,7 @@

          Word_Break Chart

          Unicode Version: 17.0.0

          -

          Date: 2024-11-27, 17:44:59 GMT

          +

          Date: 2025-03-24, 14:46:35 GMT

          This page illustrates the application of the Word_Break specification. The material here is informative, not normative.

          The first chart shows where breaks would appear between different sample characters or strings. The sample characters are chosen mechanically to represent the different properties used by the specification.

          Each cell shows the break-status for the position between the character(s) in its row header and the character(s) in its column header. The × symbol indicates no break, while the ÷ symbol indicates a break. The cells with × are also shaded to make it easier to scan the table. For example, in the cell at the intersection of the row headed by “CR” and the column headed by “LF”, there is a × symbol, indicating that there is no break between CR and LF.

          After the heavy blue line in the table are additional rows, either with different sample characters or for sequences, such as “ALetter MidLetter”.

          In the row and column headers of the Table, in the Rules, when hovering over characters in the Samples, and in the comments in the associated list of test cases WordBreakTest.txt:

          1. The following sets are used:
              @@ -19,7 +19,7 @@

              Word_Break Chart

            • ExtPict = -\p{Extended_Pictographic} +\p{Extended_Pictographic=True}
            • MidNumLetQ @@ -292,15 +292,15 @@

              Sample Strings

              27 -     -   -   +     +   +   28   a   -   -   +   +   29 diff --git a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt index 4a16428ca..042b02e77 100644 --- a/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt +++ b/unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt @@ -1,5 +1,5 @@ # WordBreakTest-17.0.0.txt -# Date: 2025-01-27, 18:09:43 GMT +# Date: 2025-03-24, 14:46:35 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -1850,8 +1850,8 @@ ÷ 1F476 × 1F3FF ÷ 1F476 ÷ # ÷ [0.2] BABY (ExtPictmALetter) × [4.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPictmALetter) ÷ [0.3] ÷ 1F6D1 × 200D × 1F6D1 ÷ # ÷ [0.2] OCTAGONAL SIGN (ExtPictmALetter) × [4.0] ZERO WIDTH JOINER (ZWJ) × [3.3] OCTAGONAL SIGN (ExtPictmALetter) ÷ [0.3] ÷ 0061 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] ZERO WIDTH JOINER (ZWJ) × [3.3] OCTAGONAL SIGN (ExtPictmALetter) ÷ [0.3] -÷ 2701 × 200D × 2701 ÷ # ÷ [0.2] UPPER BLADE SCISSORS (ExtPictmALetter) × [4.0] ZERO WIDTH JOINER (ZWJ) × [3.3] UPPER BLADE SCISSORS (ExtPictmALetter) ÷ [0.3] -÷ 0061 × 200D × 2701 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] ZERO WIDTH JOINER (ZWJ) × [3.3] UPPER BLADE SCISSORS (ExtPictmALetter) ÷ [0.3] +÷ 2701 × 200D ÷ 2701 ÷ # ÷ [0.2] UPPER BLADE SCISSORS (XXmExtPict) × [4.0] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] UPPER BLADE SCISSORS (XXmExtPict) ÷ [0.3] +÷ 0061 × 200D ÷ 2701 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALettermExtPict) × [4.0] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] UPPER BLADE SCISSORS (XXmExtPict) ÷ [0.3] ÷ 1F476 × 1F3FF × 0308 × 200D × 1F476 × 1F3FF ÷ # ÷ [0.2] BABY (ExtPictmALetter) × [4.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) × [4.0] COMBINING DIAERESIS (Extend) × [4.0] ZERO WIDTH JOINER (ZWJ) × [3.3] BABY (ExtPictmALetter) × [4.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [0.3] ÷ 1F6D1 × 1F3FF ÷ # ÷ [0.2] OCTAGONAL SIGN (ExtPictmALetter) × [4.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [0.3] ÷ 200D × 1F6D1 × 1F3FF ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [3.3] OCTAGONAL SIGN (ExtPictmALetter) × [4.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [0.3] From 0360f990a34e17829bb5610980cf4b17c4d6ff51 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 24 Mar 2025 16:23:25 +0100 Subject: [PATCH 4/6] More =true --- .../java/org/unicode/test/TestUnicodeMapParser.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java b/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java index aba78176e..d1f32dfc6 100644 --- a/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java +++ b/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java @@ -33,12 +33,12 @@ public void testBasic() { new UnicodeMap() .put('a', " ") .put('c', "d") - .putAll(new UnicodeSet("[:whitespace:]"), "x"); + .putAll(new UnicodeSet("[:whitespace=true:]"), "x"); - String test = "{\\u{61}=\\u{20},c=,[:whitespace:]=x}"; + String test = "{\\u{61}=\\u{20},c=,[:whitespace=true:]=x}"; check(ump, test, null, 17); - test = " { a = \\u{20} , c = d , [:whitespace:] = x } "; + test = " { a = \\u{20} , c = d , [:whitespace=true:] = x } "; check(ump, test, expected, -1); ValueParser integerParser = new IntegerParser(); @@ -47,12 +47,12 @@ public void testBasic() { new UnicodeMap() .put('a', 1) .put('c', 2) - .putAll(new UnicodeSet("[:whitespace:]"), 33); + .putAll(new UnicodeSet("[:whitespace=true:]"), 33); - String test2 = "{a=1,c=2,[:whitespace:]=33}"; + String test2 = "{a=1,c=2,[:whitespace=true:]=33}"; check(ump2, test2, expected2, -1); - test2 = " { a = 1 , c = 2 , [:whitespace:] = 33 } "; + test2 = " { a = 1 , c = 2 , [:whitespace=true:] = 33 } "; check(ump2, test2, expected2, -1); } From afbc6a24eeb9cc9858d0e322d086fa8100a9fac6 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 24 Mar 2025 18:34:42 +0100 Subject: [PATCH 5/6] Revert "Never return false from an override of applyPropertyAlias" This reverts commit 9b45e962aa1e07e00d6cefc0a60dc7a7a85b2b59. --- .../org/unicode/jsp/UnicodeSetUtilities.java | 30 ++++++++----------- .../org/unicode/props/UnicodeProperty.java | 8 ++--- .../props/UnicodePropertySymbolTable.java | 28 +++++++---------- .../unicode/text/UCD/UnicodeMapParser.java | 3 +- 4 files changed, 26 insertions(+), 43 deletions(-) diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 28a09c5f0..6e3e40f96 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -195,41 +195,34 @@ public boolean applyPropertyAlias( } else { int debug = 0; } - applyPropertyAlias0(prop, propertyValue, result, invert); - status = true; + status = applyPropertyAlias0(prop, propertyValue, result, invert); } else { try { - applyPropertyAlias0(gcProp, versionlessPropertyName, result, invert); - status = true; + status = applyPropertyAlias0(gcProp, versionlessPropertyName, result, invert); } catch (Exception e) { } ; if (!status) { try { - applyPropertyAlias0(scProp, versionlessPropertyName, result, invert); - status = true; + status = + applyPropertyAlias0( + scProp, versionlessPropertyName, result, invert); } catch (Exception e) { } if (!status) { if (prop.isType(UnicodeProperty.BINARY_OR_ENUMERATED_OR_CATALOG_MASK)) { try { - applyPropertyAlias0(prop, "No", result, !invert); - status = true; + status = applyPropertyAlias0(prop, "No", result, !invert); } catch (Exception e) { } } if (!status) { - applyPropertyAlias0(prop, "", result, invert); - status = true; + status = applyPropertyAlias0(prop, "", result, invert); } } } } - if (!status) { - throw new IllegalArgumentException( - "Invalid query-expression " + propertyName + "=" + propertyValue); - } - return true; + return status; } private static Map @@ -252,7 +245,8 @@ public boolean applyPropertyAlias( UcdPropertyValues.General_Category_Values.Separator, new String[] {"Zl", "Zp", "Zs"}); - private void applyPropertyAlias0( + // TODO(eggrobin): I think this function only ever returns true; might as well make it void. + private boolean applyPropertyAlias0( UnicodeProperty prop, String propertyValue, UnicodeSet result, boolean invert) { result.clear(); String propertyName = prop.getName(); @@ -343,7 +337,7 @@ private void applyPropertyAlias0( for (var value : entry.getValue()) { prop.getSet(value, result); } - return; + return true; } } } @@ -381,7 +375,7 @@ private void applyPropertyAlias0( } } result.addAll(set); - return; + return true; } throw new IllegalArgumentException("Illegal property: " + propertyName); } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 40433a357..4e8c06d5f 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -1003,12 +1003,10 @@ public boolean applyPropertyAlias( String propertyName, String propertyValue, UnicodeSet result) { if (false) System.out.println(propertyName + "=" + propertyValue); UnicodeProperty prop = getProperty(propertyName); - if (prop == null) { - throw new IllegalArgumentException("No property " + propertyName); - } + if (prop == null) return false; result.clear(); - prop.getSet(propertyValue, result); - return true; + UnicodeSet x = prop.getSet(propertyValue, result); + return x.size() != 0; } } diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java b/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java index e2f88dbc5..5a495783e 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodePropertySymbolTable.java @@ -74,29 +74,24 @@ public boolean applyPropertyAlias( } propertyValue = propertyValue.trim(); if (propertyValue.length() != 0) { - applyPropertyAlias0(propertyName, propertyValue, result); - status = true; + status = applyPropertyAlias0(propertyName, propertyValue, result); } else { try { - applyPropertyAlias0("gc", propertyName, result); - status = true; + status = applyPropertyAlias0("gc", propertyName, result); } catch (Exception e) { } if (!status) { try { - applyPropertyAlias0("sc", propertyName, result); - status = true; + status = applyPropertyAlias0("sc", propertyName, result); } catch (Exception e) { } if (!status) { try { - applyPropertyAlias0(propertyName, "Yes", result); - status = true; + status = applyPropertyAlias0(propertyName, "Yes", result); } catch (Exception e) { } if (!status) { - applyPropertyAlias0(propertyName, "", result); - status = true; + status = applyPropertyAlias0(propertyName, "", result); } } } @@ -104,11 +99,7 @@ public boolean applyPropertyAlias( if (status && invert) { result.complement(); } - if (!status) { - throw new IllegalArgumentException( - "Invalid query-expression " + propertyName + "=" + propertyValue); - } - return true; + return status; } static final HashMap GC_REMAP = new HashMap(); @@ -140,7 +131,8 @@ public boolean applyPropertyAlias( GC_REMAP.put("separator", GC_REMAP.get("z")); } - public void applyPropertyAlias0(String propertyName, String propertyValue, UnicodeSet result) { + public boolean applyPropertyAlias0( + String propertyName, String propertyValue, UnicodeSet result) { result.clear(); UnicodeProperty prop = factory.getProperty(propertyName); String canonicalName = prop.getName(); @@ -153,7 +145,7 @@ public void applyPropertyAlias0(String propertyName, String propertyValue, Unico for (String part : parts) { prop.getSet(part, result); } - return; + return true; } } @@ -235,7 +227,7 @@ public void applyPropertyAlias0(String propertyName, String propertyValue, Unico set = prop.getSet(patternMatcher); } result.addAll(set); - return; + return true; } throw new IllegalArgumentException("Illegal property: " + propertyName); } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java b/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java index 219eb1146..b527fbfdc 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/UnicodeMapParser.java @@ -105,8 +105,7 @@ public boolean applyPropertyAlias( return true; } } - throw new IllegalArgumentException( - "Invalid UnicodeMap query-expression " + propertyName + "=" + propertyValue); + return false; } } From 05efc4b437603c46ed0e6019b5c22cdffb00377a Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 24 Mar 2025 18:34:58 +0100 Subject: [PATCH 6/6] Revert "More =true" This reverts commit 0360f990a34e17829bb5610980cf4b17c4d6ff51. --- .../java/org/unicode/test/TestUnicodeMapParser.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java b/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java index d1f32dfc6..aba78176e 100644 --- a/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java +++ b/unicodetools/src/test/java/org/unicode/test/TestUnicodeMapParser.java @@ -33,12 +33,12 @@ public void testBasic() { new UnicodeMap() .put('a', " ") .put('c', "d") - .putAll(new UnicodeSet("[:whitespace=true:]"), "x"); + .putAll(new UnicodeSet("[:whitespace:]"), "x"); - String test = "{\\u{61}=\\u{20},c=,[:whitespace=true:]=x}"; + String test = "{\\u{61}=\\u{20},c=,[:whitespace:]=x}"; check(ump, test, null, 17); - test = " { a = \\u{20} , c = d , [:whitespace=true:] = x } "; + test = " { a = \\u{20} , c = d , [:whitespace:] = x } "; check(ump, test, expected, -1); ValueParser integerParser = new IntegerParser(); @@ -47,12 +47,12 @@ public void testBasic() { new UnicodeMap() .put('a', 1) .put('c', 2) - .putAll(new UnicodeSet("[:whitespace=true:]"), 33); + .putAll(new UnicodeSet("[:whitespace:]"), 33); - String test2 = "{a=1,c=2,[:whitespace=true:]=33}"; + String test2 = "{a=1,c=2,[:whitespace:]=33}"; check(ump2, test2, expected2, -1); - test2 = " { a = 1 , c = 2 , [:whitespace=true:] = 33 } "; + test2 = " { a = 1 , c = 2 , [:whitespace:] = 33 } "; check(ump2, test2, expected2, -1); }