@@ -418,6 +418,7 @@ public PrimariesToFractional(UCA uca) {
418
418
this .uca = uca ;
419
419
420
420
groupIsCompressible [UCD_Types .GREEK_SCRIPT ] = true ;
421
+ groupIsCompressible [UCD_Types .CYRILLIC_SCRIPT ] = true ;
421
422
groupIsCompressible [UCD_Types .GEORGIAN_SCRIPT ] = true ;
422
423
groupIsCompressible [UCD_Types .ARMENIAN_SCRIPT ] = true ;
423
424
groupIsCompressible [UCD_Types .HEBREW_SCRIPT ] = true ;
@@ -805,14 +806,6 @@ private void findBumps() {
805
806
boolean major = majorPrimaryEntry .getValue ().get0 ();
806
807
int script = majorPrimaryEntry .getValue ().get1 ();
807
808
addMajorPrimaries (lastPrimary , primary -1 , lastMajor , lastScript );
808
- if (lastScript == UCD_Types .HANGUL_SCRIPT ) {
809
- for (int i = lastPrimary ; i < primary ; ++i ) {
810
- CharSequence ch2 = uca .getRepresentativePrimary (i );
811
- if (!UCD .isModernJamo (Character .codePointAt (ch2 , 0 ))) {
812
- getProps (i ).useTwoBytePrimary = false ;
813
- }
814
- }
815
- }
816
809
lastPrimary = primary ;
817
810
lastMajor = major ;
818
811
lastScript = script ;
@@ -828,14 +821,56 @@ private void findBumps() {
828
821
}
829
822
}
830
823
831
- char [][] singlePairs = {{'a' ,'z' }, {' ' }, {'0' , '9' }, {'.' }, {',' },}; // , {'\u3041', '\u30F3'}
824
+ char [][] singlePairs = {
825
+ {'a' ,'z' }, {' ' , ' ' },
826
+ {'0' , '9' }, {'.' , '.' }, {',' , ',' }
827
+ };
832
828
for (int j = 0 ; j < singlePairs .length ; ++j ) {
833
829
char start = singlePairs [j ][0 ];
834
- char end = singlePairs [j ][singlePairs [ j ]. length == 1 ? 0 : 1 ];
830
+ char end = singlePairs [j ][1 ];
835
831
for (char k = start ; k <= end ; ++k ) {
836
832
setSingleBytePrimaryFor (k );
837
833
}
838
834
}
835
+
836
+ char [][] twoBytePairs = {
837
+ // Cyrillic а-я
838
+ {'\u0430' , '\u044F' },
839
+ // Further Cyrillic main exemplar characters from CLDR 22,
840
+ // for common locales plus Mongolian.
841
+ // TODO: We could make this dynamic, using CLDR's tools to fetch this data.
842
+ // TODO: Consider adding Cyrillic auxiliary exemplar characters.
843
+ {'\u0451' , '\u045C' },
844
+ {'\u045E' , '\u045F' },
845
+ {'\u0491' , '\u0491' },
846
+ {'\u0493' , '\u0493' },
847
+ {'\u0495' , '\u0495' },
848
+ {'\u049B' , '\u049B' },
849
+ {'\u049D' , '\u049D' },
850
+ {'\u04A3' , '\u04A3' },
851
+ {'\u04A5' , '\u04A5' },
852
+ {'\u04AF' , '\u04AF' },
853
+ {'\u04B1' , '\u04B1' },
854
+ {'\u04B3' , '\u04B3' },
855
+ {'\u04B7' , '\u04B7' },
856
+ {'\u04B9' , '\u04B9' },
857
+ {'\u04BB' , '\u04BB' },
858
+ {'\u04CA' , '\u04CA' },
859
+ {'\u04D5' , '\u04D5' },
860
+ {'\u04D9' , '\u04D9' },
861
+ {'\u04E3' , '\u04E3' },
862
+ {'\u04E9' , '\u04E9' },
863
+ {'\u04EF' , '\u04EF' },
864
+ // Jamo L, V, T
865
+ {'\u1100' ,'\u1112' }, {'\u1161' ,'\u1175' }, {'\u11A8' ,'\u11C2' }
866
+ };
867
+ for (int j = 0 ; j < twoBytePairs .length ; ++j ) {
868
+ char start = twoBytePairs [j ][0 ];
869
+ char end = twoBytePairs [j ][1 ];
870
+ for (char k = start ; k <= end ; ++k ) {
871
+ setTwoBytePrimaryFor (k );
872
+ }
873
+ }
839
874
}
840
875
841
876
private static boolean isThreeByteMajorScript (int script ) {
@@ -846,6 +881,10 @@ private static boolean isThreeByteMajorScript(int script) {
846
881
// their CEs can be stored compactly as long-primary CEs,
847
882
// and the then-possible primary sort key compression makes sort keys hardly longer.
848
883
return
884
+ // We cherry-pick the main Cyrillic letters for two-byte primaries.
885
+ script == UCD_Types .CYRILLIC_SCRIPT ||
886
+ // We cherry-pick the conjoining Jamo L/V/T for two-byte primaries.
887
+ script == UCD_Types .HANGUL_SCRIPT ||
849
888
script == UCD_Types .ETHIOPIC_SCRIPT ||
850
889
script == UCD_Types .MYANMAR_SCRIPT ;
851
890
}
@@ -857,8 +896,6 @@ private static boolean isTwoByteMinorScript(int script) {
857
896
// and the CEs for the uppercase characters cannot be stored as "long primary" CEs.
858
897
// (They would have to use less efficient storage.)
859
898
//
860
- // Similar for Glagolitic: Cased, fits into the second Cyrillic lead byte.
861
- //
862
899
// Note: We could also do this for Deseret:
863
900
// It is also cased and has relatively few primaries,
864
901
// but making them two-byte primaries would take up too much space in its reordering group
@@ -867,8 +904,7 @@ private static boolean isTwoByteMinorScript(int script) {
867
904
// At least *lowercase* Deseret sorts in code point order
868
905
// and can therefore be stored as a compact range.
869
906
return
870
- script == UCD_Types .COPTIC ||
871
- script == UCD_Types .GLAGOLITIC ;
907
+ script == UCD_Types .COPTIC ;
872
908
}
873
909
874
910
private void addMajorPrimaries (int startPrimary , int endPrimary , boolean isMajor , int script ) {
@@ -886,4 +922,10 @@ private void setSingleBytePrimaryFor(char ch) {
886
922
int firstPrimary = CEList .getPrimary (ces .at (0 ));
887
923
getOrCreateProps (firstPrimary ).useSingleBytePrimary = true ;
888
924
}
925
+
926
+ private void setTwoBytePrimaryFor (char ch ) {
927
+ CEList ces = uca .getCEList (String .valueOf (ch ), true );
928
+ int firstPrimary = CEList .getPrimary (ces .at (0 ));
929
+ getOrCreateProps (firstPrimary ).useTwoBytePrimary = true ;
930
+ }
889
931
}
0 commit comments