Skip to content

Commit 6541843

Browse files
committed
Make Cyrillic & Glagolitic use 3-byte primaries again, but with one compressible lead byte, and use 2-byte primaries for the Cyrillic main exemplar characters
git-svn-id: https://unicode.org/repos/unicodetools/branches/markus@478 13e8329f-0b23-4da4-9fe8-d0f6fe080806
1 parent add8e7d commit 6541843

File tree

1 file changed

+56
-14
lines changed

1 file changed

+56
-14
lines changed

uca63/org/unicode/text/UCA/PrimariesToFractional.java

Lines changed: 56 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -418,6 +418,7 @@ public PrimariesToFractional(UCA uca) {
418418
this.uca = uca;
419419

420420
groupIsCompressible[UCD_Types.GREEK_SCRIPT] = true;
421+
groupIsCompressible[UCD_Types.CYRILLIC_SCRIPT] = true;
421422
groupIsCompressible[UCD_Types.GEORGIAN_SCRIPT] = true;
422423
groupIsCompressible[UCD_Types.ARMENIAN_SCRIPT] = true;
423424
groupIsCompressible[UCD_Types.HEBREW_SCRIPT] = true;
@@ -805,14 +806,6 @@ private void findBumps() {
805806
boolean major = majorPrimaryEntry.getValue().get0();
806807
int script = majorPrimaryEntry.getValue().get1();
807808
addMajorPrimaries(lastPrimary, primary-1, lastMajor, lastScript);
808-
if (lastScript == UCD_Types.HANGUL_SCRIPT) {
809-
for (int i = lastPrimary; i < primary; ++i) {
810-
CharSequence ch2 = uca.getRepresentativePrimary(i);
811-
if (!UCD.isModernJamo(Character.codePointAt(ch2, 0))) {
812-
getProps(i).useTwoBytePrimary = false;
813-
}
814-
}
815-
}
816809
lastPrimary = primary;
817810
lastMajor = major;
818811
lastScript = script;
@@ -828,14 +821,56 @@ private void findBumps() {
828821
}
829822
}
830823

831-
char[][] singlePairs = {{'a','z'}, {' '}, {'0', '9'}, {'.'}, {','},}; // , {'\u3041', '\u30F3'}
824+
char[][] singlePairs = {
825+
{'a','z'}, {' ', ' '},
826+
{'0', '9'}, {'.', '.'}, {',', ','}
827+
};
832828
for (int j = 0; j < singlePairs.length; ++j) {
833829
char start = singlePairs[j][0];
834-
char end = singlePairs[j][singlePairs[j].length == 1 ? 0 : 1];
830+
char end = singlePairs[j][1];
835831
for (char k = start; k <= end; ++k) {
836832
setSingleBytePrimaryFor(k);
837833
}
838834
}
835+
836+
char[][] twoBytePairs = {
837+
// Cyrillic а-я
838+
{'\u0430', '\u044F'},
839+
// Further Cyrillic main exemplar characters from CLDR 22,
840+
// for common locales plus Mongolian.
841+
// TODO: We could make this dynamic, using CLDR's tools to fetch this data.
842+
// TODO: Consider adding Cyrillic auxiliary exemplar characters.
843+
{'\u0451', '\u045C'},
844+
{'\u045E', '\u045F'},
845+
{'\u0491', '\u0491'},
846+
{'\u0493', '\u0493'},
847+
{'\u0495', '\u0495'},
848+
{'\u049B', '\u049B'},
849+
{'\u049D', '\u049D'},
850+
{'\u04A3', '\u04A3'},
851+
{'\u04A5', '\u04A5'},
852+
{'\u04AF', '\u04AF'},
853+
{'\u04B1', '\u04B1'},
854+
{'\u04B3', '\u04B3'},
855+
{'\u04B7', '\u04B7'},
856+
{'\u04B9', '\u04B9'},
857+
{'\u04BB', '\u04BB'},
858+
{'\u04CA', '\u04CA'},
859+
{'\u04D5', '\u04D5'},
860+
{'\u04D9', '\u04D9'},
861+
{'\u04E3', '\u04E3'},
862+
{'\u04E9', '\u04E9'},
863+
{'\u04EF', '\u04EF'},
864+
// Jamo L, V, T
865+
{'\u1100','\u1112'}, {'\u1161','\u1175'}, {'\u11A8','\u11C2'}
866+
};
867+
for (int j = 0; j < twoBytePairs.length; ++j) {
868+
char start = twoBytePairs[j][0];
869+
char end = twoBytePairs[j][1];
870+
for (char k = start; k <= end; ++k) {
871+
setTwoBytePrimaryFor(k);
872+
}
873+
}
839874
}
840875

841876
private static boolean isThreeByteMajorScript(int script) {
@@ -846,6 +881,10 @@ private static boolean isThreeByteMajorScript(int script) {
846881
// their CEs can be stored compactly as long-primary CEs,
847882
// and the then-possible primary sort key compression makes sort keys hardly longer.
848883
return
884+
// We cherry-pick the main Cyrillic letters for two-byte primaries.
885+
script == UCD_Types.CYRILLIC_SCRIPT ||
886+
// We cherry-pick the conjoining Jamo L/V/T for two-byte primaries.
887+
script == UCD_Types.HANGUL_SCRIPT ||
849888
script == UCD_Types.ETHIOPIC_SCRIPT ||
850889
script == UCD_Types.MYANMAR_SCRIPT;
851890
}
@@ -857,8 +896,6 @@ private static boolean isTwoByteMinorScript(int script) {
857896
// and the CEs for the uppercase characters cannot be stored as "long primary" CEs.
858897
// (They would have to use less efficient storage.)
859898
//
860-
// Similar for Glagolitic: Cased, fits into the second Cyrillic lead byte.
861-
//
862899
// Note: We could also do this for Deseret:
863900
// It is also cased and has relatively few primaries,
864901
// but making them two-byte primaries would take up too much space in its reordering group
@@ -867,8 +904,7 @@ private static boolean isTwoByteMinorScript(int script) {
867904
// At least *lowercase* Deseret sorts in code point order
868905
// and can therefore be stored as a compact range.
869906
return
870-
script == UCD_Types.COPTIC ||
871-
script == UCD_Types.GLAGOLITIC;
907+
script == UCD_Types.COPTIC;
872908
}
873909

874910
private void addMajorPrimaries(int startPrimary, int endPrimary, boolean isMajor, int script) {
@@ -886,4 +922,10 @@ private void setSingleBytePrimaryFor(char ch) {
886922
int firstPrimary = CEList.getPrimary(ces.at(0));
887923
getOrCreateProps(firstPrimary).useSingleBytePrimary = true;
888924
}
925+
926+
/**
 * Sets the {@code useTwoBytePrimary} flag on the properties entry of the
 * first primary weight that {@code ch} maps to.
 *
 * <p>The character is looked up as a one-character string; only the primary
 * of the first collation element in the returned list is used.
 *
 * @param ch the character (a single UTF-16 code unit) whose first primary is flagged
 */
private void setTwoBytePrimaryFor(char ch) {
927+
// NOTE(review): the meaning of the boolean argument is not visible in this diff -- confirm against UCA.getCEList.
CEList ces = uca.getCEList(String.valueOf(ch), true);
928+
// Only element 0 is consulted; any further collation elements for ch are ignored here.
int firstPrimary = CEList.getPrimary(ces.at(0));
929+
// Presumably getOrCreateProps creates the entry when none exists yet (per its name) -- verify.
getOrCreateProps(firstPrimary).useTwoBytePrimary = true;
930+
}
889931
}

0 commit comments

Comments
 (0)