diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java index 4e8c06d5f..c5c3f2433 100644 --- a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java +++ b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java @@ -32,6 +32,7 @@ import java.util.TreeMap; import java.util.function.Predicate; import java.util.regex.Pattern; +import org.unicode.cldr.util.Rational.RationalParser; import org.unicode.cldr.util.props.UnicodeLabel; public abstract class UnicodeProperty extends UnicodeLabel { @@ -198,6 +199,7 @@ public UnicodeProperty setDelimiter(String value) { EXTENDED_MASK = 1, CORE_MASK = ~EXTENDED_MASK, BINARY_MASK = (1 << BINARY) | (1 << EXTENDED_BINARY), + NUMERIC_MASK = (1 << NUMERIC) | (1 << EXTENDED_NUMERIC), STRING_MASK = (1 << STRING) | (1 << EXTENDED_STRING), STRING_OR_MISC_MASK = (1 << STRING) | (1 << EXTENDED_STRING) | (1 << MISC) | (1 << EXTENDED_MISC), @@ -443,19 +445,25 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { if (isMultivalued && propertyValue != null && propertyValue.contains(delimiter)) { throw new IllegalArgumentException( "Multivalued property values can't contain the delimiter."); + } + if (propertyValue == null) { + return getSet(NULL_MATCHER, result); + } + Comparator<String> comparator; + if (isType(NUMERIC_MASK)) { + // UAX44-LM1. + comparator = RATIONAL_COMPARATOR; + } else if (getName().equals("Name") || getName().equals("Name_Alias")) { + // UAX44-LM2. + comparator = CHARACTER_NAME_COMPARATOR; + } else if (isType(BINARY_OR_ENUMERATED_OR_CATALOG_MASK)) { + // UAX44-LM3 + comparator = PROPERTY_COMPARATOR; } else { - return getSet( - propertyValue == null - ? NULL_MATCHER - : new SimpleMatcher( - propertyValue, - getName().equals("Name") || getName().equals("Name_Alias") - ? CHARACTER_NAME_COMPARATOR - : isType(STRING_OR_MISC_MASK) - ? 
null - : PROPERTY_COMPARATOR), - result); + // String-valued or Miscellaneous property. + comparator = null; } + return getSet(new SimpleMatcher(propertyValue, comparator), result); } private UnicodeMap unicodeMap = null; @@ -725,6 +733,35 @@ public static String toSkeleton(String source) { return skeletonBuffer.toString(); } + /** Comparator view of {@link #compareRationals(String, String)}; getSet uses it for numeric properties. */ + public static final Comparator<String> RATIONAL_COMPARATOR = + new Comparator<String>() { + @Override + public int compare(String x, String y) { + return compareRationals(x, y); + } + }; + + /** + * Compares two numeric property values given as strings. + * + * <p>Null orders before everything else, and NaN (as recognized by equalNames) before every remaining value; two nulls or two NaNs compare equal. + * Any other pair is parsed with RationalParser.BASIC and ordered by the compareTo of the parsed values. + * + * @return a negative integer, zero, or a positive integer as a is less than, equal to, or greater than b + */ + public static int compareRationals(String a, String b) { + if (a == b) return 0; + if (a == null) return -1; + if (b == null) return 1; + final boolean aIsNaN = equalNames(a, "NaN"); + final boolean bIsNaN = equalNames(b, "NaN"); + if (aIsNaN && bIsNaN) return 0; + if (aIsNaN) return -1; + if (bIsNaN) return 1; + return RationalParser.BASIC.parse(a).compareTo(RationalParser.BASIC.parse(b)); + } + public static final Comparator CHARACTER_NAME_COMPARATOR = new Comparator() { @Override diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/VersionedSymbolTable.java b/unicodetools/src/main/java/org/unicode/text/UCD/VersionedSymbolTable.java new file mode 100644 index 000000000..806c56810 --- /dev/null +++ b/unicodetools/src/main/java/org/unicode/text/UCD/VersionedSymbolTable.java @@ -0,0 +1,437 @@ +package org.unicode.text.UCD; + +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; +import java.util.Comparator; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodePropertySymbolTable; +import org.unicode.text.utility.Settings; + +/** + * This class implements the semantics of property-query as defined in the UnicodeSet specification.
+ */
+public class VersionedSymbolTable extends UnicodeSet.XSymbolTable {
+    // Instances are only obtained through forReview() and forDevelopment().
+    private VersionedSymbolTable() {}
+
+    /**
+     * Returns a symbol table for which unqualified queries use Settings.LAST_VERSION_INFO, U-1
+     * refers to Settings.LAST2_VERSION_INFO, and a version-qualifier naming the latest version
+     * must carry a version-suffix.
+     */
+    public static VersionedSymbolTable forReview() {
+        var result = new VersionedSymbolTable();
+        result.requireSuffixForLatest = true;
+        result.implicitVersion = Settings.LAST_VERSION_INFO;
+        result.previousVersion = Settings.LAST2_VERSION_INFO;
+        return result;
+    }
+
+    /**
+     * Returns a symbol table for which unqualified queries use Settings.LATEST_VERSION_INFO, U-1
+     * refers to Settings.LAST_VERSION_INFO, and the latest version may be named without a
+     * version-suffix.
+     */
+    public static VersionedSymbolTable forDevelopment() {
+        var result = new VersionedSymbolTable();
+        result.requireSuffixForLatest = false;
+        result.implicitVersion = Settings.LATEST_VERSION_INFO;
+        result.previousVersion = Settings.LAST_VERSION_INFO;
+        return result;
+    }
+
+    /**
+     * Sets the factory that is consulted, regardless of the queried version, for property names
+     * that the versioned properties do not know. Returns this for chaining.
+     */
+    public VersionedSymbolTable setUnversionedExtensions(UnicodeProperty.Factory factory) {
+        unversionedExtensions = factory;
+        return this;
+    }
+
+    /**
+     * Replaces the contents of result with the set denoted by a property query. The query
+     * arrives split at its first "="; a "≠" is never split off by the caller, so it is looked
+     * for inside beforeEquals. Always returns true; invalid queries are reported by the
+     * IllegalArgumentException thrown from the helpers.
+     */
+    @Override
+    public boolean applyPropertyAlias(String beforeEquals, String afterEquals, UnicodeSet result) {
+        result.clear();
+        String leftHandSide = beforeEquals;
+        String propertyPredicate = afterEquals;
+        boolean interiorlyNegated = false;
+        int posNotEqual = beforeEquals.indexOf('≠');
+        // TODO(egg): We cannot distinguish \p{X=} from \p{X} in this API, both give us an empty
+        // string as afterEquals. This is an @internal API, so we could change it to pass null in
+        // the unary case.
+        if (posNotEqual >= 0) {
+            // Everything after ≠ belongs to the predicate; if the predicate itself contained
+            // an =, the caller split there, so the two halves are glued back together.
+            propertyPredicate =
+                    afterEquals.isEmpty()
+                            ? beforeEquals.substring(posNotEqual + 1)
+                            : beforeEquals.substring(posNotEqual + 1) + "=" + afterEquals;
+            leftHandSide = beforeEquals.substring(0, posNotEqual);
+            interiorlyNegated = true;
+        }
+        if (interiorlyNegated) {
+            // X≠Y is the complement of X=Y, restricted to code points (no strings).
+            final var complement = getNonNegatedPropertyQuerySet(leftHandSide, propertyPredicate);
+            result.addAll(complement.complement().removeAllStrings());
+        } else {
+            result.addAll(getNonNegatedPropertyQuerySet(leftHandSide, propertyPredicate));
+        }
+        return true;
+    }
+
+    /**
+     * Computes the set for a query without interior negation: strips the optional
+     * version-qualifier from leftHandSide, falls back to implicitVersion when there is none, and
+     * dispatches on whether propertyPredicate is empty.
+     */
+    private UnicodeSet getNonNegatedPropertyQuerySet(
+            String leftHandSide, String propertyPredicate) {
+        final var mutableLeftHandSide = new StringBuilder(leftHandSide);
+        // parseVersionQualifier removes the qualifier from mutableLeftHandSide as a side effect.
+        final var queriedVersion = parseVersionQualifier(mutableLeftHandSide);
+        final String unqualifiedLeftHandSide = mutableLeftHandSide.toString();
+        final var deducedQueriedVersion = queriedVersion == null ? implicitVersion : queriedVersion;
+
+        final var queriedProperties = IndexUnicodeProperties.make(deducedQueriedVersion);
+
+        if (propertyPredicate.isEmpty()) {
+            return computeUnaryQuery(queriedProperties, unqualifiedLeftHandSide);
+        } else {
+            return computeBinaryQuery(
+                    queriedProperties, unqualifiedLeftHandSide, propertyPredicate);
+        }
+    }
+
+    /**
+     * Computes the set for a query with an empty predicate. The query is tried, in order, as a
+     * Script value, as a General_Category value, and as the name of a binary property.
+     */
+    private UnicodeSet computeUnaryQuery(
+            IndexUnicodeProperties queriedProperties, String unqualifiedQuery) {
+        // Either unary-property-query, or binary-property-query with an empty property-value.
+        final var script = queriedProperties.getProperty(UcdProperty.Script);
+        final var generalCategory = queriedProperties.getProperty(UcdProperty.General_Category);
+        // A bare Script or General_Category value is tried before a property name.
+        if (script.isValidValue(unqualifiedQuery)) {
+            return script.getSet(unqualifiedQuery);
+        }
+        if (generalCategory.isValidValue(unqualifiedQuery)) {
+            return getGeneralCategorySet(queriedProperties, unqualifiedQuery);
+        }
+        UnicodeProperty queriedProperty = queriedProperties.getProperty(unqualifiedQuery);
+        if (queriedProperty == null && unversionedExtensions != null) {
+            queriedProperty = unversionedExtensions.getProperty(unqualifiedQuery);
+        }
+        if (queriedProperty == null) {
+            throw new IllegalArgumentException(
+                    "Invalid unary-query-expression; could not find property " + unqualifiedQuery);
+        }
+        if (!queriedProperty.isType(UnicodeProperty.BINARY_MASK)) {
+            // TODO(egg): Remove when we can tell this is a unary query.
+            if (queriedProperty.isType(UnicodeProperty.STRING_OR_MISC_MASK)) {
+                return queriedProperty.getSet("");
+            }
+            throw new IllegalArgumentException(
+                    "Invalid unary-query-expression for non-binary property "
+                            + queriedProperty.getName());
+        }
+        return queriedProperty.getSet(UcdPropertyValues.Binary.Yes);
+    }
+
+    /**
+     * Computes the set for a query with a nonempty predicate, which is one of:
+     *
+     * <ul>
+     *   <li>a property-comparison {@code @...@}, including the special forms {@code @code point@}
+     *       (identity query) and {@code @none@} (null query);
+     *   <li>a regular-expression-match {@code /.../};
+     *   <li>a plain property-value, which is validated according to the type of the property.
+     * </ul>
+     *
+     * @param queriedProperties the properties of the queried version
+     * @param unqualifiedLeftHandSide the property name, without version-qualifier
+     * @param propertyPredicate the nonempty text after the = or ≠
+     * @throws IllegalArgumentException if the property is unknown or the predicate is not valid
+     *     for it
+     */
+    private UnicodeSet computeBinaryQuery(
+            IndexUnicodeProperties queriedProperties,
+            String unqualifiedLeftHandSide,
+            String propertyPredicate) {
+        // We have a binary-property-query.
+        UnicodeProperty queriedProperty = queriedProperties.getProperty(unqualifiedLeftHandSide);
+        if (queriedProperty == null && unversionedExtensions != null) {
+            queriedProperty = unversionedExtensions.getProperty(unqualifiedLeftHandSide);
+        }
+        if (queriedProperty == null) {
+            throw new IllegalArgumentException(
+                    "Invalid binary-query-expression; could not find property "
+                            + unqualifiedLeftHandSide);
+        }
+        final boolean isAge = queriedProperty.getName().equals("Age");
+        final boolean isName = queriedProperty.getName().equals("Name");
+        // A lone "@" or "/" both starts and ends with its delimiter but has no interior; two
+        // delimiters are required so that substring(1, length - 1) below cannot throw. Such a
+        // predicate is instead validated as an ordinary property-value.
+        final boolean hasRoomForDelimiters = propertyPredicate.length() >= 2;
+        final boolean isPropertyComparison =
+                hasRoomForDelimiters
+                        && propertyPredicate.startsWith("@")
+                        && propertyPredicate.endsWith("@");
+        final boolean isRegularExpressionMatch =
+                hasRoomForDelimiters
+                        && propertyPredicate.startsWith("/")
+                        && propertyPredicate.endsWith("/");
+        if (isPropertyComparison) {
+            if (isAge) {
+                throw new IllegalArgumentException(
+                        "Invalid binary-query-expression with property-comparison for Age");
+            }
+            final var unqualifiedRightHandSide =
+                    new StringBuilder(
+                            propertyPredicate.substring(1, propertyPredicate.length() - 1));
+            // Strips the version-qualifier, if any, from unqualifiedRightHandSide.
+            final var comparisonVersion = parseVersionQualifier(unqualifiedRightHandSide);
+            if (UnicodeProperty.equalNames(unqualifiedRightHandSide.toString(), "code point")) {
+                if (comparisonVersion != null) {
+                    throw new IllegalArgumentException(
+                            "Invalid binary-query-expression with comparison version on identity query");
+                }
+                if (!queriedProperty.isType(UnicodeProperty.STRING_MASK)) {
+                    throw new IllegalArgumentException(
+                            "Invalid binary-query-expression with identity query for "
+                                    + queriedProperty.getTypeName()
+                                    + " property");
+                }
+                return getIdentitySet(queriedProperty);
+            } else if (UnicodeProperty.equalNames(unqualifiedRightHandSide.toString(), "none")) {
+                if (comparisonVersion != null) {
+                    throw new IllegalArgumentException(
+                            "Invalid binary-query-expression with comparison version on null query");
+                }
+                if (!queriedProperty.isType(UnicodeProperty.STRING_OR_MISC_MASK)) {
+                    throw new IllegalArgumentException(
+                            "Invalid binary-query-expression with null query for "
+                                    + queriedProperty.getTypeName()
+                                    + " property");
+                }
+                return queriedProperty.getSet((String) null);
+            } else {
+                UnicodeProperty comparisonProperty =
+                        IndexUnicodeProperties.make(
+                                        comparisonVersion == null
+                                                ? implicitVersion
+                                                : comparisonVersion)
+                                .getProperty(unqualifiedRightHandSide.toString());
+                if (comparisonProperty == null && unversionedExtensions != null) {
+                    comparisonProperty =
+                            unversionedExtensions.getProperty(unqualifiedRightHandSide.toString());
+                }
+                if (comparisonProperty == null) {
+                    throw new IllegalArgumentException(
+                            "Invalid binary-query-expression; could not find comparison property "
+                                    + unqualifiedRightHandSide);
+                }
+                return compareProperties(queriedProperty, comparisonProperty);
+            }
+        } else if (isRegularExpressionMatch) {
+            if (isAge) {
+                throw new IllegalArgumentException(
+                        "Invalid binary-query-expression with regular-expression-match for Age");
+            }
+            return queriedProperty.getSet(
+                    new UnicodeProperty.RegexMatcher()
+                            .set(propertyPredicate.substring(1, propertyPredicate.length() - 1)));
+        } else {
+            String propertyValue = propertyPredicate;
+            // Validation. For Name, validation entails computing the query, so we return here.
+            if (isName) {
+                var result = queriedProperty.getSet(propertyValue);
+                if (result.isEmpty()) {
+                    // A Name query that matches no name falls back to Name_Alias.
+                    result =
+                            queriedProperties
+                                    .getProperty(UcdProperty.Name_Alias)
+                                    .getSet(propertyValue);
+                }
+                if (result.isEmpty()) {
+                    throw new IllegalArgumentException(
+                            "No character name nor name alias matches " + propertyValue);
+                }
+                return result;
+            } else if (queriedProperty.getName().equals("Name_Alias")) {
+                var result = queriedProperty.getSet(propertyValue);
+                if (result.isEmpty()) {
+                    throw new IllegalArgumentException("No name alias matches " + propertyValue);
+                }
+                return result;
+            } else if (queriedProperty.isType(UnicodeProperty.NUMERIC_MASK)) {
+                if (UnicodeProperty.equalNames(propertyValue, "NaN")
+                        || !RATIONAL_PATTERN.matcher(propertyValue).matches()) {
+                    throw new IllegalArgumentException(
+                            "Invalid value '"
+                                    + propertyValue
+                                    + "' for numeric property "
+                                    + queriedProperty.getName());
+                }
+            } else if (queriedProperty.isType(
+                    UnicodeProperty.BINARY_OR_ENUMERATED_OR_CATALOG_MASK)) {
+                if (!queriedProperty.isValidValue(propertyValue)) {
+                    throw new IllegalArgumentException(
+                            "The value '"
+                                    + propertyValue
+                                    + "' is illegal. Values for "
+                                    + queriedProperty.getName()
+                                    + " must be in "
+                                    + queriedProperty.getAvailableValues()
+                                    + " or in "
+                                    + queriedProperty.getValueAliases());
+                }
+            } else {
+                // TODO(egg): Check for unescaped :, @, =, etc. and unescape.
+            }
+            if (isAge) {
+                // Age is matched with an ordering relation on versions rather than by equality.
+                return queriedProperty.getSet(
+                        new UnicodePropertySymbolTable.ComparisonMatcher<VersionInfo>(
+                                UnicodePropertySymbolTable.parseVersionInfoOrMax(propertyValue),
+                                UnicodePropertySymbolTable.Relation.geq,
+                                Comparator.nullsFirst(Comparator.naturalOrder()),
+                                UnicodePropertySymbolTable::parseVersionInfoOrMax));
+            }
+            if (queriedProperty.getName().equals("General_Category")) {
+                return getGeneralCategorySet(queriedProperties, propertyValue);
+            }
+            return queriedProperty.getSet(propertyValue);
+        }
+    }
+
+    /**
+     * Parses a string prefixed with an optional-version-qualifier.
+     * If there is a version-qualifier, returns the corresponding VersionInfo and removes the
+     * prefix, up to and including the colon, from the given StringBuilder. Otherwise returns
+     * null and leaves the StringBuilder untouched.
+     *
+     * @throws IllegalArgumentException if the qualifier starts with neither U nor R, if its
+     *     version-suffix or version number disagrees with the latest version in Settings, or if
+     *     it names the latest version without a suffix while one is required
+     */
+    private VersionInfo parseVersionQualifier(StringBuilder qualified) {
+        int posColon = qualified.indexOf(":", 0);
+        if (posColon < 0) {
+            return null;
+        }
+        // versionQualifier includes the trailing colon.
+        final String versionQualifier = qualified.substring(0, posColon + 1);
+        qualified.delete(0, posColon + 1);
+        if (versionQualifier.equals("U-1:")) {
+            return previousVersion;
+        } else {
+            switch (versionQualifier.charAt(0)) {
+                case 'R':
+                    // Extension: we allow a version-qualifier starting with R for retroactive
+                    // properties, that is, property derivations applied before the property
+                    // existed.
+                    // TODO(egg): Actually support that.
+                case 'U':
+                    break;
+                default:
+                    throw new IllegalArgumentException(
+                            "Invalid version-qualifier " + versionQualifier);
+            }
+            // The text between the leading letter and the colon.
+            String versionNumber = versionQualifier.substring(1, posColon);
+            if (versionNumber.endsWith("dev")) {
+                versionNumber = versionNumber.substring(0, versionNumber.length() - 3);
+                if (!versionNumber.isEmpty()
+                        && !VersionInfo.getInstance(versionNumber)
+                                .equals(Settings.LATEST_VERSION_INFO)) {
+                    throw new IllegalArgumentException(
+                            "Invalid version-qualifier "
+                                    + versionQualifier
+                                    + " with version-suffix dev: the current dev version is "
+                                    + Settings.latestVersion);
+                }
+                return Settings.LATEST_VERSION_INFO;
+            } else if (versionNumber.endsWith("α") || versionNumber.endsWith("β")) {
+                final String versionSuffix = versionNumber.substring(versionNumber.length() - 1);
+                versionNumber = versionNumber.substring(0, versionNumber.length() - 1);
+                // Compare by value: substring returns a fresh String, so a reference
+                // comparison would reject every suffix.
+                if (!versionSuffix.equals(Settings.latestVersionPhase.toString())) {
+                    throw new IllegalArgumentException(
+                            "Invalid version-qualifier "
+                                    + versionQualifier
+                                    + " with version-suffix "
+                                    + versionSuffix
+                                    + ": the current stage is "
+                                    + Settings.latestVersionPhase);
+                }
+                if (!versionNumber.isEmpty()
+                        && !VersionInfo.getInstance(versionNumber)
+                                .equals(Settings.LATEST_VERSION_INFO)) {
+                    throw new IllegalArgumentException(
+                            "Invalid version-qualifier "
+                                    + versionQualifier
+                                    + " with version-suffix "
+                                    + versionNumber
+                                    + ": the current "
+                                    + versionSuffix
+                                    + " version is "
+                                    + Settings.latestVersion);
+                }
+                return Settings.LATEST_VERSION_INFO;
+            } else {
+                var result = VersionInfo.getInstance(versionNumber);
+                if (result.equals(Settings.LATEST_VERSION_INFO) && requireSuffixForLatest) {
+                    throw new IllegalArgumentException(
+                            "Invalid version-qualifier "
+                                    + versionQualifier
+                                    + " version-suffix "
+                                    + Settings.latestVersionPhase
+                                    + " required for unpublished version");
+                }
+                return result;
+            }
+        }
+    }
+
+    /** Maps each grouping value of General_Category to the short names of the values it covers. */
+    private static final Map<UcdPropertyValues.General_Category_Values, Set<String>>
+            COARSE_GENERAL_CATEGORIES =
+                    Map.of(
+                            UcdPropertyValues.General_Category_Values.Other,
+                            Set.of("Cc", "Cf", "Cn", "Co", "Cs"),
+                            UcdPropertyValues.General_Category_Values.Letter,
+                            Set.of("Ll", "Lm", "Lo", "Lt", "Lu"),
+                            UcdPropertyValues.General_Category_Values.Cased_Letter,
+                            Set.of("Ll", "Lt", "Lu"),
+                            UcdPropertyValues.General_Category_Values.Mark,
+                            Set.of("Mc", "Me", "Mn"),
+                            UcdPropertyValues.General_Category_Values.Number,
+                            Set.of("Nd", "Nl", "No"),
+                            UcdPropertyValues.General_Category_Values.Punctuation,
+                            Set.of("Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"),
+                            UcdPropertyValues.General_Category_Values.Symbol,
+                            Set.of("Sc", "Sk", "Sm", "So"),
+                            UcdPropertyValues.General_Category_Values.Separator,
+                            Set.of("Zl", "Zp", "Zs"));
+
+    /**
+     * Similar to iup.getProperty(UcdProperty.General_Category).getSet(propertyValue), but takes the
+     * groupings into account. Implements both unary-query-expression for a General_Category alias
+     * and binary-query-expression with a property-value where the queried property is
+     * General_Category.
+     */
+    private static UnicodeSet getGeneralCategorySet(
+            IndexUnicodeProperties iup, String propertyValue) {
+        var gc = iup.getProperty(UcdProperty.General_Category);
+        for (var entry : COARSE_GENERAL_CATEGORIES.entrySet()) {
+            final var aliases = entry.getKey().getNames().getAllNames();
+            if (aliases.stream().anyMatch(a -> UnicodeProperty.equalNames(propertyValue, a))) {
+                // propertyValue names a grouping: accumulate the sets of the values it covers.
+                UnicodeSet result = new UnicodeSet();
+                for (var value : entry.getValue()) {
+                    gc.getSet(value, result);
+                }
+                return result;
+            }
+        }
+        return gc.getSet(propertyValue);
+    }
+
+    /** Returns the set of code points whose value for queriedProperty is the code point itself. */
+    private static UnicodeSet getIdentitySet(UnicodeProperty queriedProperty) {
+        final var result = new UnicodeSet();
+        // Note that while UnicodeProperty can return strings from getSet, which is an extension of
+        // the UnicodeSet property-query specification, identity queries exclude any strings of
+        // length other than 1, otherwise we would end up with infinite sets, e.g., the set of all
+        // strings that normalize to themselves.
+        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+            if (UnicodeProperty.equals(cp, queriedProperty.getValue(cp))) {
+                result.add(cp);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Returns the set of code points on which the two properties have equal values.
+     *
+     * @throws IllegalArgumentException unless the properties are both binary, both numeric, both
+     *     string, both enumerated or catalog with equal sets of available values, or have the
+     *     same name
+     */
+    private static UnicodeSet compareProperties(
+            UnicodeProperty queriedProperty, UnicodeProperty comparisonProperty) {
+        if (!((queriedProperty.isType(UnicodeProperty.BINARY_MASK)
+                        && comparisonProperty.isType(UnicodeProperty.BINARY_MASK))
+                || (queriedProperty.isType(UnicodeProperty.NUMERIC_MASK)
+                        && comparisonProperty.isType(UnicodeProperty.NUMERIC_MASK))
+                || (queriedProperty.isType(UnicodeProperty.STRING_MASK)
+                        && comparisonProperty.isType(UnicodeProperty.STRING_MASK))
+                || (queriedProperty.isType(UnicodeProperty.ENUMERATED_OR_CATALOG_MASK)
+                        && comparisonProperty.isType(UnicodeProperty.ENUMERATED_OR_CATALOG_MASK)
+                        && queriedProperty
+                                .getAvailableValues()
+                                .equals(comparisonProperty.getAvailableValues()))
+                || queriedProperty.getName().equals(comparisonProperty.getName()))) {
+            throw new IllegalArgumentException(
+                    "Invalid property comparison between "
+                            + queriedProperty.getTypeName()
+                            + " property "
+                            + queriedProperty.getName()
+                            + " and "
+                            + comparisonProperty.getTypeName()
+                            + " property "
+                            + comparisonProperty.getName());
+        }
+        final var result = new UnicodeSet();
+        // Note that while UnicodeProperty can return strings from getSet, which is an extension of
+        // the UnicodeSet property-query specification, property comparisons exclude any strings of
+        // length other than 1. Extending them to include those leads to messy questions of
+        // defining the value of character properties for strings (null?) and avoiding infinite
+        // sets.
+        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+            if (UnicodeProperty.equals(
+                    queriedProperty.getValue(cp), comparisonProperty.getValue(cp))) {
+                result.add(cp);
+            }
+        }
+        return result;
+    }
+
+    // Version used for a query without version-qualifier.
+    private VersionInfo implicitVersion;
+    // Version denoted by the version-qualifier U-1.
+    private VersionInfo previousVersion;
+    // Whether a qualifier naming Settings.LATEST_VERSION_INFO must carry a version-suffix.
+    private boolean requireSuffixForLatest;
+    // Fallback for property names unknown to the versioned properties; may be null.
+    private UnicodeProperty.Factory unversionedExtensions;
+    // Accepted spellings of a numeric property-value: an integer or a fraction with a nonzero
+    // denominator.
+    private static final Pattern RATIONAL_PATTERN =
+            Pattern.compile("[+-]?[0-9]+(/[0-9]*[1-9][0-9]*)?");
+}
diff --git a/unicodetools/src/test/java/org/unicode/text/UCD/TestVersionedSymbolTable.java b/unicodetools/src/test/java/org/unicode/text/UCD/TestVersionedSymbolTable.java
new file mode 100644
index 000000000..ac05aa9fb
--- /dev/null
+++ b/unicodetools/src/test/java/org/unicode/text/UCD/TestVersionedSymbolTable.java
@@ -0,0 +1,344 @@
+package org.unicode.text.UCD;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import com.ibm.icu.text.UnicodeSet;
+import java.text.ParsePosition;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Notice to the maintainer: These tests check that the UnicodeSet property queries are correctly
+ * parsed.
+ * They are not here to test property assignments. Mostly they check, for every valid
+ * expression, that the set is nonempty, not equal to the entire code space, and that it appears
+ * reasonable. If they are broken by changes to property assignments, feel free to update them.
+ */
+public class TestVersionedSymbolTable {
+    UnicodeSet.XSymbolTable oldDefault;
+
+    // The symbol table under test is installed as the process-wide default for each test and
+    // the previous default is restored afterwards.
+    @BeforeEach
+    void setUp() {
+        oldDefault = UnicodeSet.getDefaultXSymbolTable();
+        UnicodeSet.setDefaultXSymbolTable(VersionedSymbolTable.forDevelopment());
+    }
+
+    @AfterEach
+    void tearDown() {
+        UnicodeSet.setDefaultXSymbolTable(oldDefault);
+    }
+
+    @Test
+    void testIntroductionBasicExamples() {
+        assertThatUnicodeSet("\\p{XID_Continue}")
+                .contains("a")
+                .contains("α")
+                .contains("𒀀")
+                .doesNotContain("'")
+                .doesNotContain(",");
+        assertThatUnicodeSet("[\\p{lb=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]")
+                .contains("(")
+                .doesNotContain("【");
+        assertThatUnicodeSet(
+                        "[\\p{Other_ID_Start}\\p{Other_ID_Continue}"
+                                + "\\p{L}\\p{Nl}\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}"
+                                + "-\\p{Pattern_Syntax}"
+                                + "-\\p{Pattern_White_Space}]")
+                .contains("A")
+                .contains("_")
+                .contains("᧚")
+                .doesNotContain("\u2E2F")
+                .doesNotContain("$");
+        assertThatUnicodeSet("[\\p{L}\\p{Nl}\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}-[\\u2E2F]]")
+                .contains("A")
+                .contains("_")
+                .doesNotContain("᧚")
+                .doesNotContain("\u2E2F")
+                .doesNotContain("$");
+    }
+
+    @Test
+    void testIntroductionQueryLanguageExamples() {
+        assertThatUnicodeSet("\\p{Uppercase_Mapping≠@Simple_Uppercase_Mapping@}")
+                .contains("ß")
+                .doesNotContain("ſ");
+        assertThatUnicodeSet("\\p{U15.1:Simple_Case_Folding≠@U15.0:Simple_Case_Folding@}")
+                .consistsOf("ſt", "ΐ", "ΰ");
+        assertThatUnicodeSet("[\\p{cjkDefinition=/\\bcat\\b/} \\p{kEH_Desc=/\\bcat\\b/}]")
+                .contains("貓")
+                .contains("𓃠")
+                .doesNotContain("犬")
+                .doesNotContain("𓃡");
+        assertThatUnicodeSet("[\\p{Case_Folding≠@code point@}-\\p{Changes_When_Casefolded}]")
+                .contains("ǰ")
+                .doesNotContain("š")
+                .doesNotContain("ß");
+    }
+
+    @Test
+    void testNegations() {
+        // Exterior negation, in both the \P{...} and [:^...:] syntaxes.
+        assertThatUnicodeSet("\\P{Cn}").contains("a").doesNotContain("\uFFFF");
+        assertThatUnicodeSet("[:^Cn:]").contains("a").doesNotContain("\uFFFF");
+        assertThatUnicodeSet("\\P{General_Category=Cn}").contains("a").doesNotContain("\uFFFF");
+        assertThatUnicodeSet("[:^General_Category=Cn:]").contains("a").doesNotContain("\uFFFF");
+        // Interior negation with ≠.
+        assertThatUnicodeSet("\\p{General_Category≠Cn}").contains("a").doesNotContain("\uFFFF");
+        assertThatUnicodeSet("[:General_Category≠Cn:]").contains("a").doesNotContain("\uFFFF");
+        // Both negations at once cancel out, in both syntaxes.
+        assertThatUnicodeSet("\\P{General_Category≠Cn}").doesNotContain("a").contains("\uFFFF");
+        assertThatUnicodeSet("[:^General_Category≠Cn:]").doesNotContain("a").contains("\uFFFF");
+
+        assertThatUnicodeSet("\\P{Decomposition_Type≠compat}")
+                .contains("∯")
+                .doesNotContain("∮")
+                .isEqualToUnicodeSet("\\p{Decomposition_Type=compat}");
+    }
+
+    @Test
+    void testNamedSingleton() {
+        assertThatUnicodeSet("\\N{SPACE}").consistsOf(" ");
+        assertThatUnicodeSet("\\N{THIS IS NOT A CHARACTER}")
+                .isIllFormed("No character name nor name alias matches");
+        assertThatUnicodeSet("\\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}")
+                .isEqualToUnicodeSet(
+                        "\\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}")
+                .consistsOf("︘");
+        assertThatUnicodeSet("\\N{Latin small ligature o-e}").consistsOf("œ");
+        assertThatUnicodeSet("\\N{Hangul jungseong O-E}").consistsOf("ᆀ");
+        assertThatUnicodeSet("\\N{Hangul jungseong OE}").consistsOf("ᅬ");
+    }
+
+    @Test
+    void testAge() {
+        assertThatUnicodeSet("\\p{Age=6.0}")
+                .contains("U")
+                .contains("𒌋")
+                .doesNotContain("𒎙")
+                .isEqualToUnicodeSet("[ \\P{U6:Cn} \\p{U6:Noncharacter_Code_Point} ]");
+        assertThatUnicodeSet("\\p{Age=@U6:Age@}").isIllFormed("property-comparison for Age");
+        assertThatUnicodeSet("\\p{Age=/1/}").isIllFormed("regular-expression-match for Age");
+    }
+
+    @Test
+    void testPropertyComparisons() {
+        // From the first set of examples in the section.
+        assertThatUnicodeSet("\\p{scf=@lc@}").contains("Σ").contains("σ").doesNotContain("ς");
+        assertThatUnicodeSet("\\p{U15.1:scf=@U15.1:lc@}")
+                .contains("Σ")
+                .contains("σ")
+                .doesNotContain("ς");
+        assertThatUnicodeSet("\\p{U15.0:Line_Break≠@U15.1:Line_Break@}")
+                .contains("ᯤ")
+                .doesNotContain("i");
+        assertThatUnicodeSet("\\p{kIRG_GSource=@none@}").contains("𒇽").doesNotContain("人");
+        assertThatUnicodeSet("\\p{case folding=@code point@}")
+                .contains("s")
+                .doesNotContain("S")
+                .doesNotContain("ſ")
+                .doesNotContain("ß");
+        assertThatUnicodeSet("\\p{kIRG_GSource=@U16:none@}")
+                .isIllFormed("comparison version on null query");
+        assertThatUnicodeSet("\\p{case folding=@U16:code point@}")
+                .isIllFormed("comparison version on identity query");
+
+        // From the third set of examples in the section.
+        assertThatUnicodeSet("\\p{Decomposition_Mapping=@Ideographic@}")
+                .isIllFormed(
+                        "comparison between String property Decomposition_Mapping and"
+                                + " Binary property Ideographic");
+        assertThatUnicodeSet("\\p{Uppercase≠@Changes_When_Lowercased@}")
+                .isEqualToUnicodeSet(
+                        "[[\\p{Uppercase}\\p{Changes_When_Lowercased}]"
+                                + "-[\\p{Uppercase}&\\p{Changes_When_Lowercased}]]")
+                .contains("𝔄")
+                .doesNotContain("A");
+        assertThatUnicodeSet("\\p{scf≠@cf@}").contains("ß").doesNotContain("ς");
+        assertThatUnicodeSet("\\p{Numeric_Value=@kPrimaryNumeric@}")
+                .contains("A")
+                .contains("喵")
+                .contains("一")
+                .contains("五")
+                .doesNotContain("1")
+                .doesNotContain("伍");
+        // \p{U15.0:Line_Break≠@U15.1:Line_Break@} covered above.
+        assertThatUnicodeSet("\\p{U16.0:kPrimaryNumeric≠@U17.0:kPrimaryNumeric@}").consistsOf("兆");
+        assertThatUnicodeSet("\\p{Script_Extensions=@Script@}").contains("A").doesNotContain("।");
+    }
+
+    @Test
+    void testIdentityAndNullQueries() {
+        assertThatUnicodeSet("\\p{scf=@code point@}").contains("a").doesNotContain("A");
+        assertThatUnicodeSet("[:^kIRG_GSource=@none@:]").contains("喵").doesNotContain("𓃠");
+        assertThatUnicodeSet("\\p{Bidi_Paired_Bracket=@none@}")
+                .isEqualToUnicodeSet("\\p{Bidi_Paired_Bracket_Type=None}");
+    }
+
+    @Test
+    void testValidValues() {
+        assertThatUnicodeSet("\\p{Name=THIS IS NOT A CHARACTER}")
+                .isIllFormed("No character name nor name alias matches");
+        assertThatUnicodeSet("\\p{Name =CUNEIFORM SIGN A}").consistsOf("𒀀");
+        assertThatUnicodeSet("\\p{Name_Alias=CUNEIFORM SIGN A}")
+                .isIllFormed("No name alias matches");
+        assertThatUnicodeSet("\\p{Line_Break=Meow}").isIllFormed("The value 'Meow' is illegal.");
+        assertThatUnicodeSet("\\p{kDefinition=meow}").isEmpty();
+        assertThatUnicodeSet("\\p{Uppercase_Mapping=meow}").isEmpty();
+        assertThatUnicodeSet("\\p{Numeric_Value=MDCCXXIX}")
+                .isIllFormed("Invalid value 'MDCCXXIX' for numeric property");
+        assertThatUnicodeSet("\\p{Numeric_Value=1729}").isEmpty();
+    }
+
+    @Test
+    void testPropertyValueQueries() {
+        assertThatUnicodeSet("\\p{Uppercase=True}")
+                .isEqualToUnicodeSet("\\p{Uppercase}")
+                .contains("A")
+                .doesNotContain("a");
+        assertThatUnicodeSet("\\p{Uppercase=NO}")
+                .isEqualToUnicodeSet("\\P{Uppercase}")
+                .contains("a")
+                .doesNotContain("A");
+        assertThatUnicodeSet("\\p{Script_Extensions=Latin}")
+                .contains("A")
+                .contains("·")
+                .doesNotContain("𓃠")
+                .doesNotContain("।");
+        assertThatUnicodeSet("\\p{nv=2/12}")
+                .isEqualToUnicodeSet("\\p{Numeric_Value=1/6}")
+                .contains("⅙")
+                .contains("𐧷")
+                .doesNotContain("½")
+                .doesNotContain("X");
+        assertThatUnicodeSet("\\p{Name_Alias=New Line}")
+                .isEqualToUnicodeSet("\\p{Name=New Line}")
+                .consistsOf("\n");
+    }
+
+    @Test
+    void testRegularExpressionQueries() {
+        assertThatUnicodeSet("\\p{Name=/CAPITAL LETTER/}").contains("A").doesNotContain("a");
+        assertThatUnicodeSet("\\p{Block=/^Cyrillic/}")
+                .contains("и")
+                .contains("\u1C8B")
+                .contains("\u1C8F")
+                .contains("ꙮ")
+                .doesNotContain("k");
+        assertThatUnicodeSet("\\p{scx=/Gondi/}")
+                .isEqualToUnicodeSet("[\\p{scx=Gunjala_Gondi}\\p{scx=Masaram_Gondi}]")
+                .contains("𑴀")
+                .contains("𑵠")
+                .contains("।")
+                .doesNotContain("a");
+        assertThatUnicodeSet("\\p{gc=/^P/}")
+                .isEqualToUnicodeSet("[\\p{Punctuation} \\p{Private Use} \\u2029]");
+
+        // Regular expressions are matched against the values as spelled, without loose matching
+        // and without General_Category groupings.
+        assertThatUnicodeSet("\\p{Name=/NO BREAK SPACE/}").isEmpty();
+        assertThatUnicodeSet("\\p{Name=/NO-BREAK SPACE/}")
+                .contains("\u00A0")
+                .contains("\u202F")
+                .contains("\uFEFF");
+        assertThatUnicodeSet("\\p{Script=/ Gondi/}").isEmpty();
+        assertThatUnicodeSet("\\p{Script=/_Gondi/}").contains("𑴀").contains("𑵠");
+        assertThatUnicodeSet("\\p{gc=/Cased_Letter/}").isEmpty();
+        assertThatUnicodeSet("\\p{gc=Cased_Letter}")
+                .contains("a")
+                .contains("A")
+                .doesNotContain("𒀀");
+    }
+
+    /** Helper class for testing multiple properties of the same UnicodeSet.
+     */
+    private static class UnicodeSetTestFluent {
+        /**
+         * Parses the expression. A failure is recorded instead of thrown, so that isIllFormed
+         * can inspect it; every other assertion fails with the recorded exception as its cause.
+         */
+        UnicodeSetTestFluent(String expression) {
+            this.expression = expression;
+            try {
+                set = new UnicodeSet(expression);
+                // NOTE(review): presumably here so that problems which only surface when the
+                // set is operated on are caught as well; confirm before removing.
+                set.complement().complement();
+            } catch (Exception e) {
+                exception = e;
+            }
+        }
+
+        /** Asserts that parsing failed with a message containing messageSubstring. */
+        public void isIllFormed(String messageSubstring) {
+            assertNotNull(exception, expression + " is ill-formed");
+            // An exception without a message must fail the assertion, not throw an NPE.
+            final String message = exception.getMessage() == null ? "" : exception.getMessage();
+            assertTrue(
+                    message.contains(messageSubstring),
+                    "Error message '"
+                            + message
+                            + "' for "
+                            + expression
+                            + " contains '"
+                            + messageSubstring
+                            + "'");
+        }
+
+        /** Asserts that the set has exactly the elements of the set denoted by expectedExpression. */
+        public UnicodeSetTestFluent isEqualToUnicodeSet(String expectedExpression) {
+            requireWellFormed();
+            final var expected = new UnicodeSet(expectedExpression);
+            assertTrue(
+                    set.containsAll(expected),
+                    "Expected "
+                            + expected
+                            + " ⊆ "
+                            + expression
+                            + " = "
+                            + set.toPattern(true)
+                            + " but "
+                            + expression
+                            + " is missing "
+                            + expected.cloneAsThawed().removeAll(set));
+            assertTrue(
+                    expected.containsAll(set),
+                    "Expected "
+                            + expected
+                            + " ⊇ "
+                            + expression
+                            + " = "
+                            + set.toPattern(true)
+                            + " but "
+                            + expression
+                            + " contains unexpected "
+                            + set.cloneAsThawed().removeAll(expected));
+            return this;
+        }
+
+        public UnicodeSetTestFluent doesNotContain(CharSequence element) {
+            requireWellFormed();
+            assertFalse(
+                    set.contains(element),
+                    element + " ∉ " + expression + " = " + set.toPattern(true));
+            return this;
+        }
+
+        public UnicodeSetTestFluent contains(CharSequence element) {
+            requireWellFormed();
+            assertTrue(
+                    set.contains(element),
+                    element + " ∈ " + expression + " = " + set.toPattern(true));
+            return this;
+        }
+
+        /** Asserts that the set contains the given elements and nothing else. */
+        public UnicodeSetTestFluent consistsOf(CharSequence... elements) {
+            for (CharSequence element : elements) {
+                contains(element);
+            }
+            final var expectedElements = new UnicodeSet().addAll(elements);
+            assertTrue(
+                    expectedElements.containsAll(set),
+                    expectedElements + " ⊇ " + expression + " = " + set.toPattern(true));
+            return this;
+        }
+
+        public UnicodeSetTestFluent isEmpty() {
+            requireWellFormed();
+            assertTrue(set.isEmpty(), expression + " = " + set.toPattern(true) + " is empty");
+            return this;
+        }
+
+        // Fails with the parse error as the cause, rather than with an uninformative
+        // NullPointerException on set, when the expression did not parse.
+        private void requireWellFormed() {
+            if (exception != null || set == null) {
+                throw new AssertionError(expression + " is unexpectedly ill-formed", exception);
+            }
+        }
+
+        private UnicodeSet set;
+        private String expression;
+        private Exception exception;
+    }
+
+    private UnicodeSetTestFluent assertThatUnicodeSet(String expression) {
+        return new UnicodeSetTestFluent(expression);
+    }
+}