Skip to content

Prefer IndexUnicodeProperties over ToolUnicodePropertySource #488

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
May 28, 2023
Merged
11 changes: 7 additions & 4 deletions unicodetools/src/main/java/org/unicode/props/BagFormatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -538,10 +538,13 @@ public void doAt(Object c, PrintWriter out) {

tabber.add(minSpacesBeforeComment + 2, Tabber.LEFT); // comment character

labelSize =
maxLabelWidthOverride > 0
? maxLabelWidthOverride
: getLabelSource(true).getMaxWidth(shortLabel);
labelSize = getLabelSource(true).getMaxWidth(shortLabel);
if (refinedLabelSource != null) {
labelSize = Math.max(labelSize, refinedLabelSource.getMaxWidth(shortLabel));
}
if (maxLabelWidthOverride > 0) {
labelSize = maxLabelWidthOverride;
}
if (labelSize > 0) {
tabber.add(labelSize + 1, Tabber.LEFT); // value
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,12 @@ protected UnicodeMap<String> _getUnicodeMap() {

@Override
protected String _getValue(int codepoint) {
return _getUnicodeMap().get(codepoint);
final String result = _getUnicodeMap().get(codepoint);
if (DefaultValueType.forString(result) == DefaultValueType.CODE_POINT) {
return Character.toString(codepoint);
} else {
return result;
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,14 @@ static void parseSourceFile(
// do nothing, already none;
break;
case CODE_POINT:
// requires special handling later
// NOTE(egg): The naïve thing here would be
// for (final String cp : nullValues) {
// data.put(cp, cp);
// }
// However, UnicodeMap is extremely slow with large numbers of values.
// Instead we fill it with <code point>, and let IndexUnicodeProperty resolve
// that.
data.putAll(nullValues, propInfo.getDefaultValue());
break;
default:
throw new UnicodePropertyException(); // unexpected error
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1617,10 +1617,23 @@ private static void writeStringValues(
System.out.println("Writing String Values: " + prop.getName());
}
pw.println();
final var shownSet = new UnicodeSet();
if (ps.skipValue == null) {
shownSet.addAll(UnicodeSet.ALL_CODE_POINTS);
} else {
for (int c = 0; c <= 0x10FFFF; ++c) {
final String value = prop.getValue(c);
final String skipValue =
ps.skipValue.equals("<code point>") ? Character.toString(c) : ps.skipValue;
if (!value.equals(skipValue)) {
shownSet.add(c);
}
}
}
bf.setValueSource(prop)
.setHexValue(true)
.setMergeRanges(ps.mergeRanges)
.showSetNames(pw, new UnicodeSet(0, 0x10FFFF));
.showSetNames(pw, shownSet);
}

static class RangeStartComparator implements Comparator<String> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.Tabber;
Expand Down Expand Up @@ -995,6 +996,12 @@ private static void showSet(ParsePosition pp, final String value) {
if (doHtml) {
out.println("<table class='s'>");
}
// Show the GC if it happens to be constant over a range, but do not split because of it:
// We limit the output based on unsplit ranges.
showLister
.setLabelSource(null)
.setRangeBreakSource(null)
.setRefinedLabelSource(LATEST_PROPS.getProperty("General_Category"));
showLister.showSetNames(out, valueSet);
if (doHtml) {
out.println("</table>");
Expand Down Expand Up @@ -1224,14 +1231,38 @@ static class VersionedProperty {
private UnicodeProperty property;
private final transient PatternMatcher matcher = new UnicodeProperty.RegexMatcher();

private static final Set<String> TOOL_ONLY_PROPERTIES =
Set.of("toNFC", "toNFD", "toNFKC", "toNFKD");

private static boolean isTrivial(UnicodeMap<String> map) {
return map.isEmpty()
|| (map.values().size() == 1
&& map.getSet(map.values().iterator().next())
.equals(UnicodeSet.ALL_CODE_POINTS));
}

public VersionedProperty set(String xPropertyName) {
xPropertyName = xPropertyName.trim();
boolean allowRetroactive = false;
if (xPropertyName.contains(":")) {
final String[] names = xPropertyName.split(":");
if (names.length != 2 || !names[0].startsWith("U")) {
if (names.length != 2) {
throw new IllegalArgumentException("Too many ':' fields in " + xPropertyName);
}
if (names[0].equalsIgnoreCase("U-1")) {
if (names[0].isEmpty()) {
throw new IllegalArgumentException("Empty version field in " + xPropertyName);
}
switch (names[0].charAt(0)) {
case 'U':
break;
case 'R':
allowRetroactive = true;
break;
default:
throw new IllegalArgumentException(
"Version field should start with U or R in " + xPropertyName);
}
if (names[0].substring(1).equals("-1")) {
version = LAST_VERSION;
} else {
version = names[0].substring(1);
Expand All @@ -1242,18 +1273,19 @@ public VersionedProperty set(String xPropertyName) {
}
;
propertyName = xPropertyName;
propSource = getProperties(version);
propSource = getIndexedProperties(version);
property = propSource.getProperty(xPropertyName);
if (property == null) {
propSource = getIndexedProperties(version);
if ((property == null && TOOL_ONLY_PROPERTIES.contains(xPropertyName))
|| (isTrivial(property.getUnicodeMap()) && allowRetroactive)) {
propSource = getProperties(version);
property = propSource.getProperty(xPropertyName);
if (property == null) {
throw new IllegalArgumentException(
"Can't create property from name: "
+ propertyName
+ " and version: "
+ version);
}
}
if (property == null || isTrivial(property.getUnicodeMap())) {
throw new IllegalArgumentException(
"Can't create property from name: "
+ propertyName
+ " and version: "
+ version);
}
return this;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ public String _getValue(int cp) {
@Override
public String _getValue(final int cp) {
if (!ucd.isRepresented(cp)) {
return null;
return Character.toString(cp);
}
boolean debug = false;
if (cp == -1) { // change to a real code point for debugging
Expand All @@ -707,7 +707,7 @@ public String _getValue(final int cp) {
final String case1 = ucd.getCase(cp, foldingType, UCD_Types.FOLD);
final String b = nfkc.normalize(case1);
if (equals(cp, b)) {
return null;
return Character.toString(cp);
}
if (debug) {
System.out.println(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

# Missing @-missing values

# No @-missing lines for binary properties.
# TODO(egg): just derive that from the property type.
# @missing: 0000..10FFFF; ASCII_Hex_Digit; No
# @missing: 0000..10FFFF; Alphabetic; No
# @missing: 0000..10FFFF; Bidi_Control; No
# @missing: 0000..10FFFF; Bidi_Class; Left_To_Right
# @missing: 0000..10FFFF; Bidi_Mirrored; No
# @missing: 0000..10FFFF; Case_Ignorable; No
# @missing: 0000..10FFFF; Cased; No
# @missing: 0000..10FFFF; Changes_When_Casefolded; No
Expand All @@ -25,8 +27,11 @@
# @missing: 0000..10FFFF; Grapheme_Link; No
# @missing: 0000..10FFFF; Hex_Digit; No
# @missing: 0000..10FFFF; Hyphen; No
# @missing: 0000..10FFFF; IDS_Unary_Operator; No
# @missing: 0000..10FFFF; IDS_Binary_Operator; No
# @missing: 0000..10FFFF; IDS_Trinary_Operator; No
# @missing: 0000..10FFFF; ID_Compat_Math_Continue; No
# @missing: 0000..10FFFF; ID_Compat_Math_Start; No
# @missing: 0000..10FFFF; ID_Continue; No
# @missing: 0000..10FFFF; ID_Start; No
# @missing: 0000..10FFFF; Ideographic; No
Expand All @@ -48,8 +53,6 @@
# @missing: 0000..10FFFF; White_Space; No
# @missing: 0000..10FFFF; XID_Continue; No
# @missing: 0000..10FFFF; XID_Start; No

# @missing: 0000..10FFFF; Bidi_Mirrored; No
# @missing: 0000..10FFFF; Composition_Exclusion; No
# @missing: 0000..10FFFF; Expands_On_NFC ; No
# @missing: 0000..10FFFF; Expands_On_NFD ; No
Expand All @@ -74,8 +77,33 @@
# @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence ; No
# @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence ; No

# @missing: 0000..10FFFF; Emoji ; No
# @missing: 0000..10FFFF; Emoji_Presentation ; No
# @missing: 0000..10FFFF; Emoji_Modifier ; No
# @missing: 0000..10FFFF; Emoji_Modifier_Base ; No
# @missing: 0000..10FFFF; Emoji_Component ; No
# @missing: 0000..10FFFF; Extended_Pictographic ; No

# End of binary properties.

# @missing: 0000..10FFFF; Canonical_Combining_Class; Not_Reordered

# @missing: 0000..10FFFF; Lowercase_Mapping; <slc>
# @missing: 0000..10FFFF; Uppercase_Mapping; <suc>
# @missing: 0000..10FFFF; Titlecase_Mapping; <stc>

# @missing: 0000..10FFFF; kSimplifiedVariant ; <none>
# @missing: 0000..10FFFF; kTraditionalVariant ; <none>

# @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group
# @missing: 0000..10FFFF; Joining_Type ; Non_Joining

# Overrides for bugs

# TODO(egg): These are specified in their respective files; we should not need them here.
# @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; <none>
# @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; <none>

# Extras

# @missing: 0000..10FFFF; Idn_Status ; disallowed
Expand Down Expand Up @@ -119,24 +147,10 @@ idtype ; a ; Aspirational
idtype ; inc ; Inclusion
idtype ; rec ; Recommended

# @-missing: 0000..10FFFF; Confusable_SL ; <code point>
# @-missing: 0000..10FFFF; Confusable_SA ; <code point>
# @-missing: 0000..10FFFF; Confusable_ML ; <code point>
# @-missing: 0000..10FFFF; Confusable_MA ; <code point>

# @missing: 0000..10FFFF; Emoji ; No
# @missing: 0000..10FFFF; Emoji_Presentation ; No
# @missing: 0000..10FFFF; Emoji_Modifier ; No
# @missing: 0000..10FFFF; Emoji_Modifier_Base ; No
# @missing: 0000..10FFFF; Emoji_Component ; No
# @missing: 0000..10FFFF; Extended_Pictographic ; No

# @missing: 0000..10FFFF; Basic_Emoji ; No
# @missing: 0000..10FFFF; RGI_Emoji_Modifier_Sequence ; No
# @missing: 0000..10FFFF; RGI_Emoji_Flag_Sequence ; No
# @missing: 0000..10FFFF; RGI_Emoji_Keycap_Sequence ; No
# @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence ; No
# @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence ; No
# @missing: 0000..10FFFF; Confusable_SL ; <code point>
# @missing: 0000..10FFFF; Confusable_SA ; <code point>
# @missing: 0000..10FFFF; Confusable_ML ; <code point>
# @missing: 0000..10FFFF; Confusable_MA ; <code point>

sc ; Hanb ; Han_with_Bopomofo
sc ; Jpan ; Japanese
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,9 @@ In \P{U-1:gc=Cn} U-1:NFKC_Casefold = NFKC_Casefold
# Not yet a stability policy, but see https://www.unicode.org/L2/L2023/23005.htm#174-A11.
# Simple counterparts of the above.
In \P{U-1:gc=Cn} U-1:Simple_Case_Folding * U-1:toNFKC = Simple_Case_Folding * toNFKC
In \p{U-1:XID_Continue} U-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
In \p{U-1:XID_Continue} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
# As above, this one would not be guaranteed by the stability policy.
In \P{U-1:gc=Cn} U-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
In \P{U-1:gc=Cn} R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold

# Case Pair Stability: If two characters form a case pair in a version of Unicode, they will remain a case pair in each subsequent version of Unicode. If two characters do not form a case pair in a version of Unicode, they will never become a case pair in any subsequent version of Unicode.
# TODO
Expand Down Expand Up @@ -464,7 +464,7 @@ $decimalValue ⊇ \p{General_Category=Decimal_Number}

# All and only those items with numeric types have numeric values

Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+.[0-9]+/}
Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/}
[\p{Numeric_Type=Decimal} \p{Numeric_Type=Digit} \p{Numeric_Type=Numeric}] = $anyNumericValue

##########################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
import org.unicode.cldr.util.Counter;
import org.unicode.props.GenerateEnums;
import org.unicode.props.IndexUnicodeProperties;
import org.unicode.props.IndexUnicodeProperties.DefaultValueType;
import org.unicode.props.PropertyNames;
import org.unicode.props.PropertyType;
import org.unicode.props.PropertyValueSets;
import org.unicode.props.UcdProperty;
import org.unicode.props.UcdPropertyValues;
Expand Down Expand Up @@ -472,6 +474,58 @@ public void TestValues() {
// logln(x + " " + z + " " + w);
}

@Test
public void TestDefaults() {
assertEquals(
"Wrong CCC for U+FFFF",
"Not_Reordered",
iup.getProperty(UcdProperty.Canonical_Combining_Class).getValue('\uFFFF'));
assertEquals(
"Wrong Simple_Lowercase_Mapping for a",
"a",
iup.getProperty(UcdProperty.Simple_Lowercase_Mapping).getValue('a'));
assertEquals(
"Wrong Simple_Uppercase_Mapping for A",
"A",
iup.getProperty(UcdProperty.Simple_Uppercase_Mapping).getValue('A'));
assertEquals(
"Wrong Case_Folding for a",
"a",
iup.getProperty(UcdProperty.Case_Folding).getValue('a'));
assertEquals(
"Wrong Simple_Case_Folding for a",
"a",
iup.getProperty(UcdProperty.Simple_Case_Folding).getValue('a'));
assertEquals(
"Wrong Lowercase_Mapping for a",
"a",
iup.getProperty(UcdProperty.Lowercase_Mapping).getValue('a'));
assertEquals(
"Wrong Uppercase_Mapping for a",
"A",
iup.getProperty(UcdProperty.Uppercase_Mapping).getValue('a'));
assertEquals(
"Wrong Titlecase_Mapping for a",
"A",
iup.getProperty(UcdProperty.Titlecase_Mapping).getValue('a'));

for (var property : UcdProperty.values()) {
if (property.getType() != PropertyType.Miscellaneous
&& IndexUnicodeProperties.getResolvedDefaultValueType(property)
!= DefaultValueType.NONE) {
assertNotNull(
"Null "
+ property.name()
+ " for U+FFFF but property is "
+ property.getType()
+ " with default value type "
+ IndexUnicodeProperties.getResolvedDefaultValueType(property)
+ ". Add an @missing line to ExtraPropertyValueAliases.txt if needed.",
iup.getProperty(property).getValue('\uFFFF'));
}
}
}

@Test
public void TestNumbers() {
for (final Age_Values age : Age_Values.values()) {
Expand Down