diff --git a/unicodetools/src/main/java/org/unicode/props/BagFormatter.java b/unicodetools/src/main/java/org/unicode/props/BagFormatter.java index d23b458fb..a19032f4e 100644 --- a/unicodetools/src/main/java/org/unicode/props/BagFormatter.java +++ b/unicodetools/src/main/java/org/unicode/props/BagFormatter.java @@ -538,10 +538,13 @@ public void doAt(Object c, PrintWriter out) { tabber.add(minSpacesBeforeComment + 2, Tabber.LEFT); // comment character - labelSize = - maxLabelWidthOverride > 0 - ? maxLabelWidthOverride - : getLabelSource(true).getMaxWidth(shortLabel); + labelSize = getLabelSource(true).getMaxWidth(shortLabel); + if (refinedLabelSource != null) { + labelSize = Math.max(labelSize, refinedLabelSource.getMaxWidth(shortLabel)); + } + if (maxLabelWidthOverride > 0) { + labelSize = maxLabelWidthOverride; + } if (labelSize > 0) { tabber.add(labelSize + 1, Tabber.LEFT); // value } diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index 2e3cc1cc6..89a8aea0b 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -686,7 +686,12 @@ protected UnicodeMap _getUnicodeMap() { @Override protected String _getValue(int codepoint) { - return _getUnicodeMap().get(codepoint); + final String result = _getUnicodeMap().get(codepoint); + if (DefaultValueType.forString(result) == DefaultValueType.CODE_POINT) { + return Character.toString(codepoint); + } else { + return result; + } } @Override diff --git a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java index 244e48e5f..e64326853 100644 --- a/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java +++ b/unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java @@ -593,7 +593,14 @@ static void 
parseSourceFile( // do nothing, already none; break; case CODE_POINT: - // requires special handling later + // NOTE(egg): The naïve thing here would be + // for (final String cp : nullValues) { + // data.put(cp, cp); + // } + // However, UnicodeMap is extremely slow with large numbers of values. + // Instead we fill it with <code point>, and let IndexUnicodeProperty resolve + // that. + data.putAll(nullValues, propInfo.getDefaultValue()); break; default: throw new UnicodePropertyException(); // unexpected error diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index 8a0706526..e1ff508ad 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -1617,10 +1617,23 @@ private static void writeStringValues( System.out.println("Writing String Values: " + prop.getName()); } pw.println(); + final var shownSet = new UnicodeSet(); + if (ps.skipValue == null) { + shownSet.addAll(UnicodeSet.ALL_CODE_POINTS); + } else { + for (int c = 0; c <= 0x10FFFF; ++c) { + final String value = prop.getValue(c); + final String skipValue = + ps.skipValue.equals("") ? 
Character.toString(c) : ps.skipValue; + if (!value.equals(skipValue)) { + shownSet.add(c); + } + } + } bf.setValueSource(prop) .setHexValue(true) .setMergeRanges(ps.mergeRanges) - .showSetNames(pw, new UnicodeSet(0, 0x10FFFF)); + .showSetNames(pw, shownSet); } static class RangeStartComparator implements Comparator { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index a37de2c81..af97cfdac 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -20,6 +20,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Tabber; @@ -995,6 +996,12 @@ private static void showSet(ParsePosition pp, final String value) { if (doHtml) { out.println(""); } + // Show the GC if it happens to be constant over a range, but do not split because of it: + // We limit the output based on unsplit ranges. + showLister + .setLabelSource(null) + .setRangeBreakSource(null) + .setRefinedLabelSource(LATEST_PROPS.getProperty("General_Category")); showLister.showSetNames(out, valueSet); if (doHtml) { out.println("
"); @@ -1224,14 +1231,38 @@ static class VersionedProperty { private UnicodeProperty property; private final transient PatternMatcher matcher = new UnicodeProperty.RegexMatcher(); + private static final Set TOOL_ONLY_PROPERTIES = + Set.of("toNFC", "toNFD", "toNFKC", "toNFKD"); + + private static boolean isTrivial(UnicodeMap map) { + return map.isEmpty() + || (map.values().size() == 1 + && map.getSet(map.values().iterator().next()) + .equals(UnicodeSet.ALL_CODE_POINTS)); + } + public VersionedProperty set(String xPropertyName) { xPropertyName = xPropertyName.trim(); + boolean allowRetroactive = false; if (xPropertyName.contains(":")) { final String[] names = xPropertyName.split(":"); - if (names.length != 2 || !names[0].startsWith("U")) { + if (names.length != 2) { throw new IllegalArgumentException("Too many ':' fields in " + xPropertyName); } - if (names[0].equalsIgnoreCase("U-1")) { + if (names[0].isEmpty()) { + throw new IllegalArgumentException("Empty version field in " + xPropertyName); + } + switch (names[0].charAt(0)) { + case 'U': + break; + case 'R': + allowRetroactive = true; + break; + default: + throw new IllegalArgumentException( + "Version field should start with U or R in " + xPropertyName); + } + if (names[0].substring(1).equals("-1")) { version = LAST_VERSION; } else { version = names[0].substring(1); @@ -1242,18 +1273,19 @@ public VersionedProperty set(String xPropertyName) { } ; propertyName = xPropertyName; - propSource = getProperties(version); + propSource = getIndexedProperties(version); property = propSource.getProperty(xPropertyName); - if (property == null) { - propSource = getIndexedProperties(version); + if ((property == null && TOOL_ONLY_PROPERTIES.contains(xPropertyName)) + || (isTrivial(property.getUnicodeMap()) && allowRetroactive)) { + propSource = getProperties(version); property = propSource.getProperty(xPropertyName); - if (property == null) { - throw new IllegalArgumentException( - "Can't create property from name: " - + 
propertyName - + " and version: " - + version); - } + } + if (property == null || isTrivial(property.getUnicodeMap())) { + throw new IllegalArgumentException( + "Can't create property from name: " + + propertyName + + " and version: " + + version); } return this; } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java b/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java index f55246740..97e2eabdc 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java @@ -689,7 +689,7 @@ public String _getValue(int cp) { @Override public String _getValue(final int cp) { if (!ucd.isRepresented(cp)) { - return null; + return Character.toString(cp); } boolean debug = false; if (cp == -1) { // change to a real code point for debugging @@ -707,7 +707,7 @@ public String _getValue(final int cp) { final String case1 = ucd.getCase(cp, foldingType, UCD_Types.FOLD); final String b = nfkc.normalize(case1); if (equals(cp, b)) { - return null; + return Character.toString(cp); } if (debug) { System.out.println( diff --git a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt index a6596588b..f61c06c4a 100644 --- a/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt +++ b/unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt @@ -2,10 +2,12 @@ # Missing @-missing values +# No @-missing lines for binary properties. +# TODO(egg): just derive that from the property type. 
# @missing: 0000..10FFFF; ASCII_Hex_Digit; No # @missing: 0000..10FFFF; Alphabetic; No # @missing: 0000..10FFFF; Bidi_Control; No -# @missing: 0000..10FFFF; Bidi_Class; Left_To_Right +# @missing: 0000..10FFFF; Bidi_Mirrored; No # @missing: 0000..10FFFF; Case_Ignorable; No # @missing: 0000..10FFFF; Cased; No # @missing: 0000..10FFFF; Changes_When_Casefolded; No @@ -25,8 +27,11 @@ # @missing: 0000..10FFFF; Grapheme_Link; No # @missing: 0000..10FFFF; Hex_Digit; No # @missing: 0000..10FFFF; Hyphen; No +# @missing: 0000..10FFFF; IDS_Unary_Operator; No # @missing: 0000..10FFFF; IDS_Binary_Operator; No # @missing: 0000..10FFFF; IDS_Trinary_Operator; No +# @missing: 0000..10FFFF; ID_Compat_Math_Continue; No +# @missing: 0000..10FFFF; ID_Compat_Math_Start; No # @missing: 0000..10FFFF; ID_Continue; No # @missing: 0000..10FFFF; ID_Start; No # @missing: 0000..10FFFF; Ideographic; No @@ -48,8 +53,6 @@ # @missing: 0000..10FFFF; White_Space; No # @missing: 0000..10FFFF; XID_Continue; No # @missing: 0000..10FFFF; XID_Start; No - -# @missing: 0000..10FFFF; Bidi_Mirrored; No # @missing: 0000..10FFFF; Composition_Exclusion; No # @missing: 0000..10FFFF; Expands_On_NFC ; No # @missing: 0000..10FFFF; Expands_On_NFD ; No @@ -74,8 +77,33 @@ # @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence ; No # @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence ; No +# @missing: 0000..10FFFF; Emoji ; No +# @missing: 0000..10FFFF; Emoji_Presentation ; No +# @missing: 0000..10FFFF; Emoji_Modifier ; No +# @missing: 0000..10FFFF; Emoji_Modifier_Base ; No +# @missing: 0000..10FFFF; Emoji_Component ; No +# @missing: 0000..10FFFF; Extended_Pictographic ; No + +# End of binary properties. 
+ +# @missing: 0000..10FFFF; Canonical_Combining_Class; Not_Reordered + +# @missing: 0000..10FFFF; Lowercase_Mapping; +# @missing: 0000..10FFFF; Uppercase_Mapping; +# @missing: 0000..10FFFF; Titlecase_Mapping; + +# @missing: 0000..10FFFF; kSimplifiedVariant ; +# @missing: 0000..10FFFF; kTraditionalVariant ; + +# @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group +# @missing: 0000..10FFFF; Joining_Type ; Non_Joining + # Overrides for bugs +# TODO(egg): These are specified in their respective files, we should not need them here. +# @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; +# @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; + # Extras # @missing: 0000..10FFFF; Idn_Status ; disallowed @@ -119,24 +147,10 @@ idtype ; a ; Aspirational idtype ; inc ; Inclusion idtype ; rec ; Recommended -# @-missing: 0000..10FFFF; Confusable_SL ; -# @-missing: 0000..10FFFF; Confusable_SA ; -# @-missing: 0000..10FFFF; Confusable_ML ; -# @-missing: 0000..10FFFF; Confusable_MA ; - -# @missing: 0000..10FFFF; Emoji ; No -# @missing: 0000..10FFFF; Emoji_Presentation ; No -# @missing: 0000..10FFFF; Emoji_Modifier ; No -# @missing: 0000..10FFFF; Emoji_Modifier_Base ; No -# @missing: 0000..10FFFF; Emoji_Component ; No -# @missing: 0000..10FFFF; Extended_Pictographic ; No - -# @missing: 0000..10FFFF; Basic_Emoji ; No -# @missing: 0000..10FFFF; RGI_Emoji_Modifier_Sequence ; No -# @missing: 0000..10FFFF; RGI_Emoji_Flag_Sequence ; No -# @missing: 0000..10FFFF; RGI_Emoji_Keycap_Sequence ; No -# @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence ; No -# @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence ; No +# @missing: 0000..10FFFF; Confusable_SL ; +# @missing: 0000..10FFFF; Confusable_SA ; +# @missing: 0000..10FFFF; Confusable_ML ; +# @missing: 0000..10FFFF; Confusable_MA ; sc ; Hanb ; Han_with_Bopomofo sc ; Jpan ; Japanese diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt 
index ceece135a..0c4d1bc18 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -204,9 +204,9 @@ In \P{U-1:gc=Cn} U-1:NFKC_Casefold = NFKC_Casefold # Not yet a stability policy, but see https://www.unicode.org/L2/L2023/23005.htm#174-A11. # Simple counterparts of the above. In \P{U-1:gc=Cn} U-1:Simple_Case_Folding * U-1:toNFKC = Simple_Case_Folding * toNFKC -In \p{U-1:XID_Continue} U-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold +In \p{U-1:XID_Continue} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold # As above, this one would not be guaranteed by the stability policy. -In \P{U-1:gc=Cn} U-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold +In \P{U-1:gc=Cn} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold # Case Pair Stability: If two characters form a case pair in a version of Unicode, they will remain a case pair in each subsequent version of Unicode. If two characters do not form a case pair in a version of Unicode, they will never become a case pair in any subsequent version of Unicode. 
# TODO @@ -464,7 +464,7 @@ $decimalValue ⊇ \p{General_Category=Decimal_Number} # All and only those items with numeric types have numeric values -Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+.[0-9]+/} +Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/} [\p{Numeric_Type=Decimal} \p{Numeric_Type=Digit} \p{Numeric_Type=Numeric}] = $anyNumericValue ########################## diff --git a/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java b/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java index b9386f5fc..8f8a246c7 100644 --- a/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java +++ b/unicodetools/src/test/java/org/unicode/propstest/TestProperties.java @@ -24,7 +24,9 @@ import org.unicode.cldr.util.Counter; import org.unicode.props.GenerateEnums; import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.IndexUnicodeProperties.DefaultValueType; import org.unicode.props.PropertyNames; +import org.unicode.props.PropertyType; import org.unicode.props.PropertyValueSets; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; @@ -472,6 +474,58 @@ public void TestValues() { // logln(x + " " + z + " " + w); } + @Test + public void TestDefaults() { + assertEquals( + "Wrong CCC for U+FFFF", + "Not_Reordered", + iup.getProperty(UcdProperty.Canonical_Combining_Class).getValue('\uFFFF')); + assertEquals( + "Wrong Simple_Lowercase_Mapping for a", + "a", + iup.getProperty(UcdProperty.Simple_Lowercase_Mapping).getValue('a')); + assertEquals( + "Wrong Simple_Uppercase_Mapping for A", + "A", + iup.getProperty(UcdProperty.Simple_Uppercase_Mapping).getValue('A')); + assertEquals( + "Wrong Case_Folding for a", + "a", + iup.getProperty(UcdProperty.Case_Folding).getValue('a')); + assertEquals( + "Wrong Simple_Case_Folding for a", + "a", + iup.getProperty(UcdProperty.Simple_Case_Folding).getValue('a')); + assertEquals( + "Wrong Lowercase_Mapping for a", + "a", + 
iup.getProperty(UcdProperty.Lowercase_Mapping).getValue('a')); + assertEquals( + "Wrong Uppercase_Mapping for a", + "A", + iup.getProperty(UcdProperty.Uppercase_Mapping).getValue('a')); + assertEquals( + "Wrong Titlecase_Mapping for a", + "A", + iup.getProperty(UcdProperty.Titlecase_Mapping).getValue('a')); + + for (var property : UcdProperty.values()) { + if (property.getType() != PropertyType.Miscellaneous + && IndexUnicodeProperties.getResolvedDefaultValueType(property) + != DefaultValueType.NONE) { + assertNotNull( + "Null " + + property.name() + + " for U+FFFF but property is " + + property.getType() + + " with default value type " + + IndexUnicodeProperties.getResolvedDefaultValueType(property) + + ". Add an @missing line to ExtraPropertyValueAliases.txt if needed.", + iup.getProperty(property).getValue('\uFFFF')); + } + } + } + @Test public void TestNumbers() { for (final Age_Values age : Age_Values.values()) {