unicode-org · eggrobin · May 28, 2023 · May 25, 2023 · May 25, 2023 · May 26, 2023
diff --git a/unicodetools/src/main/java/org/unicode/props/BagFormatter.java b/unicodetools/src/main/java/org/unicode/props/BagFormatter.java
@@ -538,10 +538,13 @@ public void doAt(Object c, PrintWriter out) {
 
             tabber.add(minSpacesBeforeComment + 2, Tabber.LEFT); // comment character
 
-            labelSize =
-                    maxLabelWidthOverride > 0
-                            ? maxLabelWidthOverride
-                            : getLabelSource(true).getMaxWidth(shortLabel);
+            labelSize = getLabelSource(true).getMaxWidth(shortLabel);
+            if (refinedLabelSource != null) {
+                labelSize = Math.max(labelSize, refinedLabelSource.getMaxWidth(shortLabel));
+            }
+            if (maxLabelWidthOverride > 0) {
+                labelSize = maxLabelWidthOverride;
+            }
             if (labelSize > 0) {
                 tabber.add(labelSize + 1, Tabber.LEFT); // value
             }

diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java
@@ -686,7 +686,12 @@ protected UnicodeMap<String> _getUnicodeMap() {
 
         @Override
         protected String _getValue(int codepoint) {
-            return _getUnicodeMap().get(codepoint);
+            final String result = _getUnicodeMap().get(codepoint);
+            if (DefaultValueType.forString(result) == DefaultValueType.CODE_POINT) {
+                return Character.toString(codepoint);
+            } else {
+                return result;
+            }
         }
 
         @Override

diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java
@@ -593,7 +593,14 @@ static void parseSourceFile(
                     // do nothing, already none;
                     break;
                 case CODE_POINT:
-                    // requires special handling later
+                    // NOTE(egg): The naïve thing here would be
+                    //   for (final String cp : nullValues) {
+                    //     data.put(cp, cp);
+                    //   }
+                    // However, UnicodeMap is extremely slow with large numbers of values.
+                    // Instead we fill it with <code point>, and let IndexUnicodeProperty resolve
+                    // that.
+                    data.putAll(nullValues, propInfo.getDefaultValue());
                     break;
                 default:
                     throw new UnicodePropertyException(); // unexpected error

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
@@ -1617,10 +1617,26 @@ private static void writeStringValues(
             System.out.println("Writing String Values: " + prop.getName());
         }
         pw.println();
+        // Show only the code points whose value differs from the skip value, if there is one.
+        final var shownSet = new UnicodeSet();
+        if (ps.skipValue == null) {
+            shownSet.addAll(UnicodeSet.ALL_CODE_POINTS);
+        } else {
+            // A skipValue of "<code point>" stands for each code point mapping to itself.
+            final boolean skipsIdentity = ps.skipValue.equals("<code point>");
+            for (int c = 0; c <= 0x10FFFF; ++c) {
+                final String skipValue = skipsIdentity ? Character.toString(c) : ps.skipValue;
+                // skipValue is non-null in this branch; call equals on it so that a null
+                // property value counts as different instead of throwing.
+                if (!skipValue.equals(prop.getValue(c))) {
+                    shownSet.add(c);
+                }
+            }
+        }
         bf.setValueSource(prop)
                 .setHexValue(true)
                 .setMergeRanges(ps.mergeRanges)
-                .showSetNames(pw, new UnicodeSet(0, 0x10FFFF));
+                .showSetNames(pw, shownSet);
     }
 
     static class RangeStartComparator implements Comparator<String> {

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -20,6 +20,7 @@
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
 import org.unicode.cldr.draft.FileUtilities;
 import org.unicode.cldr.util.Tabber;
@@ -995,6 +996,12 @@ private static void showSet(ParsePosition pp, final String value) {
         if (doHtml) {
             out.println("<table class='s'>");
         }
+        // Show the GC if it happens to be constant over a range, but do not split because of it:
+        // We limit the output based on unsplit ranges.
+        showLister
+                .setLabelSource(null)
+                .setRangeBreakSource(null)
+                .setRefinedLabelSource(LATEST_PROPS.getProperty("General_Category"));
         showLister.showSetNames(out, valueSet);
         if (doHtml) {
             out.println("</table>");
@@ -1224,14 +1231,38 @@ static class VersionedProperty {
         private UnicodeProperty property;
         private final transient PatternMatcher matcher = new UnicodeProperty.RegexMatcher();
 
+        private static final Set<String> TOOL_ONLY_PROPERTIES =
+                Set.of("toNFC", "toNFD", "toNFKC", "toNFKD");
+
+        private static boolean isTrivial(UnicodeMap<String> map) {
+            return map.isEmpty()
+                    || (map.values().size() == 1
+                            && map.getSet(map.values().iterator().next())
+                                    .equals(UnicodeSet.ALL_CODE_POINTS));
+        }
+
         public VersionedProperty set(String xPropertyName) {
             xPropertyName = xPropertyName.trim();
+            boolean allowRetroactive = false;
             if (xPropertyName.contains(":")) {
                 final String[] names = xPropertyName.split(":");
-                if (names.length != 2 || !names[0].startsWith("U")) {
+                if (names.length != 2) {
                     throw new IllegalArgumentException("Too many ':' fields in " + xPropertyName);
                 }
-                if (names[0].equalsIgnoreCase("U-1")) {
+                if (names[0].isEmpty()) {
+                    throw new IllegalArgumentException("Empty version field in " + xPropertyName);
+                }
+                switch (names[0].charAt(0)) {
+                    case 'U':
+                        break;
+                    case 'R':
+                        allowRetroactive = true;
+                        break;
+                    default:
+                        throw new IllegalArgumentException(
+                                "Version field should start with U or R in " + xPropertyName);
+                }
+                if (names[0].substring(1).equals("-1")) {
                     version = LAST_VERSION;
                 } else {
                     version = names[0].substring(1);
@@ -1242,18 +1273,25 @@ public VersionedProperty set(String xPropertyName) {
             }
             ;
             propertyName = xPropertyName;
-            propSource = getProperties(version);
+            propSource = getIndexedProperties(version);
             property = propSource.getProperty(xPropertyName);
-            if (property == null) {
-                propSource = getIndexedProperties(version);
+            // Fall back to getProperties(version) if the indexed source lacks a tool-only
+            // property, or if retroactive values are allowed and the indexed one is trivial.
+            // Check for null before isTrivial so that an unknown name reaches the
+            // IllegalArgumentException below instead of a NullPointerException here.
+            if ((property == null && TOOL_ONLY_PROPERTIES.contains(xPropertyName))
+                    || (property != null
+                            && allowRetroactive
+                            && isTrivial(property.getUnicodeMap()))) {
+                propSource = getProperties(version);
                 property = propSource.getProperty(xPropertyName);
-                if (property == null) {
-                    throw new IllegalArgumentException(
-                            "Can't create property from name: "
-                                    + propertyName
-                                    + " and version: "
-                                    + version);
-                }
+            }
+            if (property == null || isTrivial(property.getUnicodeMap())) {
+                throw new IllegalArgumentException(
+                        "Can't create property from name: "
+                                + propertyName
+                                + " and version: "
+                                + version);
             }
             return this;
         }

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java b/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java
@@ -689,7 +689,7 @@ public String _getValue(int cp) {
                         @Override
                         public String _getValue(final int cp) {
                             if (!ucd.isRepresented(cp)) {
-                                return null;
+                                return Character.toString(cp);
                             }
                             boolean debug = false;
                             if (cp == -1) { // change to a real code point for debugging
@@ -707,7 +707,7 @@ public String _getValue(final int cp) {
                             final String case1 = ucd.getCase(cp, foldingType, UCD_Types.FOLD);
                             final String b = nfkc.normalize(case1);
                             if (equals(cp, b)) {
-                                return null;
+                                return Character.toString(cp);
                             }
                             if (debug) {
                                 System.out.println(

diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt
@@ -2,10 +2,12 @@
 
 # Missing @-missing values
 
+# No @-missing lines for binary properties.
+# TODO(egg): just derive that from the property type.
 # @missing: 0000..10FFFF; ASCII_Hex_Digit; No
 # @missing: 0000..10FFFF; Alphabetic; No
 # @missing: 0000..10FFFF; Bidi_Control; No
-# @missing: 0000..10FFFF; Bidi_Class; Left_To_Right
+# @missing: 0000..10FFFF; Bidi_Mirrored; No
 # @missing: 0000..10FFFF; Case_Ignorable; No
 # @missing: 0000..10FFFF; Cased; No
 # @missing: 0000..10FFFF; Changes_When_Casefolded; No
@@ -25,8 +27,11 @@
 # @missing: 0000..10FFFF; Grapheme_Link; No
 # @missing: 0000..10FFFF; Hex_Digit; No
 # @missing: 0000..10FFFF; Hyphen; No
+# @missing: 0000..10FFFF; IDS_Unary_Operator; No
 # @missing: 0000..10FFFF; IDS_Binary_Operator; No
 # @missing: 0000..10FFFF; IDS_Trinary_Operator; No
+# @missing: 0000..10FFFF; ID_Compat_Math_Continue; No
+# @missing: 0000..10FFFF; ID_Compat_Math_Start; No
 # @missing: 0000..10FFFF; ID_Continue; No
 # @missing: 0000..10FFFF; ID_Start; No
 # @missing: 0000..10FFFF; Ideographic; No
@@ -48,8 +53,6 @@
 # @missing: 0000..10FFFF; White_Space; No
 # @missing: 0000..10FFFF; XID_Continue; No
 # @missing: 0000..10FFFF; XID_Start; No
-
-# @missing: 0000..10FFFF; Bidi_Mirrored; No
 # @missing: 0000..10FFFF; Composition_Exclusion; No
 # @missing: 0000..10FFFF; Expands_On_NFC   ; No
 # @missing: 0000..10FFFF; Expands_On_NFD   ; No
@@ -74,8 +77,33 @@
 # @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence  ;  No
 # @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence  ;  No
 
+# @missing: 0000..10FFFF; Emoji  	; No
+# @missing: 0000..10FFFF; Emoji_Presentation  	; No
+# @missing: 0000..10FFFF; Emoji_Modifier  	; No
+# @missing: 0000..10FFFF; Emoji_Modifier_Base  	; No
+# @missing: 0000..10FFFF; Emoji_Component  	; No
+# @missing: 0000..10FFFF; Extended_Pictographic  	; No
+
+# End of binary properties.
+
+# @missing: 0000..10FFFF; Canonical_Combining_Class; Not_Reordered
+
+# @missing: 0000..10FFFF; Lowercase_Mapping; <slc>
+# @missing: 0000..10FFFF; Uppercase_Mapping; <suc>
+# @missing: 0000..10FFFF; Titlecase_Mapping; <stc>
+
+# @missing: 0000..10FFFF; kSimplifiedVariant ; <none>
+# @missing: 0000..10FFFF; kTraditionalVariant ; <none>
+
+# @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group
+# @missing: 0000..10FFFF; Joining_Type ; Non_Joining
+
 # Overrides for bugs
 
+# TODO(egg): These are specified in their respective files; we should not need them here.
+# @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; <none>
+# @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; <none>
+
 # Extras
 
 # @missing: 0000..10FFFF; Idn_Status   ; disallowed
@@ -119,24 +147,10 @@ idtype	; a		; Aspirational
 idtype	; inc	; Inclusion				
 idtype	; rec	; Recommended	
 
-# @-missing: 0000..10FFFF; Confusable_SL    	; <code point>		
-# @-missing: 0000..10FFFF; Confusable_SA    	; <code point>		
-# @-missing: 0000..10FFFF; Confusable_ML    	; <code point>		
-# @-missing: 0000..10FFFF; Confusable_MA    	; <code point>		
-
-# @missing: 0000..10FFFF; Emoji  	; No
-# @missing: 0000..10FFFF; Emoji_Presentation  	; No
-# @missing: 0000..10FFFF; Emoji_Modifier  	; No
-# @missing: 0000..10FFFF; Emoji_Modifier_Base  	; No
-# @missing: 0000..10FFFF; Emoji_Component  	; No
-# @missing: 0000..10FFFF; Extended_Pictographic  	; No
-
-# @missing: 0000..10FFFF; Basic_Emoji  	; No
-# @missing: 0000..10FFFF; RGI_Emoji_Modifier_Sequence  	; No
-# @missing: 0000..10FFFF; RGI_Emoji_Flag_Sequence  	; No
-# @missing: 0000..10FFFF; RGI_Emoji_Keycap_Sequence  	; No
-# @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence  	; No
-# @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence  	; No
+# @missing: 0000..10FFFF; Confusable_SL    	; <code point>
+# @missing: 0000..10FFFF; Confusable_SA    	; <code point>
+# @missing: 0000..10FFFF; Confusable_ML    	; <code point>
+# @missing: 0000..10FFFF; Confusable_MA    	; <code point>
 
 sc ; Hanb                             ; Han_with_Bopomofo
 sc ; Jpan                             ; Japanese

diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -204,9 +204,9 @@ In \P{U-1:gc=Cn} U-1:NFKC_Casefold = NFKC_Casefold
 # Not yet a stability policy, but see https://www.unicode.org/L2/L2023/23005.htm#174-A11.
 # Simple counterparts of the above.
 In \P{U-1:gc=Cn} U-1:Simple_Case_Folding * U-1:toNFKC = Simple_Case_Folding * toNFKC
-In \p{U-1:XID_Continue} U-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
+In \p{U-1:XID_Continue} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
 # As above, this one would not be guaranteed by the stability policy.
-In \P{U-1:gc=Cn} U-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
+In \P{U-1:gc=Cn} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
 
 # Case Pair Stability: If two characters form a case pair in a version of Unicode, they will remain a case pair in each subsequent version of Unicode. If two characters do not form a case pair in a version of Unicode, they will never become a case pair in any subsequent version of Unicode.
 # TODO
@@ -464,7 +464,7 @@ $decimalValue ⊇ \p{General_Category=Decimal_Number}
 
 # All and only those items with numeric types have numeric values
 
-Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+.[0-9]+/}
+Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/}
 [\p{Numeric_Type=Decimal} \p{Numeric_Type=Digit} \p{Numeric_Type=Numeric}] = $anyNumericValue
 
 ##########################

diff --git a/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java b/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java
@@ -24,7 +24,9 @@
 import org.unicode.cldr.util.Counter;
 import org.unicode.props.GenerateEnums;
 import org.unicode.props.IndexUnicodeProperties;
+import org.unicode.props.IndexUnicodeProperties.DefaultValueType;
 import org.unicode.props.PropertyNames;
+import org.unicode.props.PropertyType;
 import org.unicode.props.PropertyValueSets;
 import org.unicode.props.UcdProperty;
 import org.unicode.props.UcdPropertyValues;
@@ -472,6 +474,58 @@ public void TestValues() {
         //        logln(x + " " + z + " " + w);
     }
 
+    @Test
+    public void TestDefaults() {
+        assertEquals(
+                "Wrong CCC for U+FFFF",
+                "Not_Reordered",
+                iup.getProperty(UcdProperty.Canonical_Combining_Class).getValue('\uFFFF'));
+        assertEquals(
+                "Wrong Simple_Lowercase_Mapping for a",
+                "a",
+                iup.getProperty(UcdProperty.Simple_Lowercase_Mapping).getValue('a'));
+        assertEquals(
+                "Wrong Simple_Uppercase_Mapping for A",
+                "A",
+                iup.getProperty(UcdProperty.Simple_Uppercase_Mapping).getValue('A'));
+        assertEquals(
+                "Wrong Case_Folding for a",
+                "a",
+                iup.getProperty(UcdProperty.Case_Folding).getValue('a'));
+        assertEquals(
+                "Wrong Simple_Case_Folding for a",
+                "a",
+                iup.getProperty(UcdProperty.Simple_Case_Folding).getValue('a'));
+        assertEquals(
+                "Wrong Lowercase_Mapping for a",
+                "a",
+                iup.getProperty(UcdProperty.Lowercase_Mapping).getValue('a'));
+        assertEquals(
+                "Wrong Uppercase_Mapping for a",
+                "A",
+                iup.getProperty(UcdProperty.Uppercase_Mapping).getValue('a'));
+        assertEquals(
+                "Wrong Titlecase_Mapping for a",
+                "A",
+                iup.getProperty(UcdProperty.Titlecase_Mapping).getValue('a'));
+
+        for (var property : UcdProperty.values()) {
+            if (property.getType() != PropertyType.Miscellaneous
+                    && IndexUnicodeProperties.getResolvedDefaultValueType(property)
+                            != DefaultValueType.NONE) {
+                assertNotNull(
+                        "Null "
+                                + property.name()
+                                + " for U+FFFF but property is "
+                                + property.getType()
+                                + " with default value type "
+                                + IndexUnicodeProperties.getResolvedDefaultValueType(property)
+                                + ". Add an @missing line to ExtraPropertyValueAliases.txt if needed.",
+                        iup.getProperty(property).getValue('\uFFFF'));
+            }
+        }
+    }
+
     @Test
     public void TestNumbers() {
         for (final Age_Values age : Age_Values.values()) {