@@ -755,9 +755,12 @@ void computeDecompositions(const char* basename,
755
755
std::vector<uint32_t > nonRecursive32;
756
756
LocalUMutableCPTriePointer nonRecursiveBuilder (umutablecptrie_open (0 , 0 , status));
757
757
758
+ UBool uts46 = false ;
759
+
758
760
if (uprv_strcmp (basename, " nfkd" ) == 0 ) {
759
761
mainNormalizer = Normalizer2::getNFKDInstance (status);
760
762
} else if (uprv_strcmp (basename, " uts46d" ) == 0 ) {
763
+ uts46 = true ;
761
764
mainNormalizer = Normalizer2::getInstance (nullptr , " uts46" , UNORM2_COMPOSE, status);
762
765
} else {
763
766
mainNormalizer = nfdNormalizer;
@@ -817,23 +820,38 @@ void computeDecompositions(const char* basename,
817
820
nfcNormalizer->normalize (dst, nfc, status);
818
821
nonNfdOrRoundTrips = (src == nfc);
819
822
}
823
+ if (uts46) {
824
+ // Work around https://unicode-org.atlassian.net/browse/ICU-22658
825
+ // TODO: Remove the workaround after data corresponding to
826
+ // https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands
827
+ // for Unicode 16.
828
+ switch (c) {
829
+ case 0x2F868 :
830
+ dst.truncate (0 );
831
+ dst.append (UChar32 (0x36FC ));
832
+ break ;
833
+ case 0x2F874 :
834
+ dst.truncate (0 );
835
+ dst.append (UChar32 (0x5F53 ));
836
+ break ;
837
+ case 0x2F91F :
838
+ dst.truncate (0 );
839
+ dst.append (UChar32 (0x243AB ));
840
+ break ;
841
+ case 0x2F95F :
842
+ dst.truncate (0 );
843
+ dst.append (UChar32 (0x7AEE ));
844
+ break ;
845
+ case 0x2F9BF :
846
+ dst.truncate (0 );
847
+ dst.append (UChar32 (0x45D7 ));
848
+ break ;
849
+ }
850
+ }
820
851
int32_t len = dst.toUTF32 (utf32, DECOMPOSITION_BUFFER_SIZE, status);
852
+
821
853
if (!len || (len == 1 && utf32[0 ] == 0xFFFD && c != 0xFFFD )) {
822
- // Characters that normalize to nothing or to U+FFFD (without the
823
- // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
824
- // as in NFD in ICU4X's UTS 46 normalization in the interest
825
- // of data size and ICU4X's normalizer being unable to handle
826
- // normalizing to nothing.
827
- // When UTS 46 is implemented on top of ICU4X, a preprocessing
828
- // step is supposed to remove these characters before the
829
- // normalization step.
830
- if (uprv_strcmp (basename, " uts46d" ) != 0 ) {
831
- status.set (U_INTERNAL_PROGRAM_ERROR);
832
- handleError (status, basename);
833
- }
834
- nfdNormalizer->normalize (src, dst, status);
835
- len = dst.toUTF32 (utf32, DECOMPOSITION_BUFFER_SIZE, status);
836
- if (!len || (len == 1 && utf32[0 ] == 0xFFFD && c != 0xFFFD )) {
854
+ if (!uts46) {
837
855
status.set (U_INTERNAL_PROGRAM_ERROR);
838
856
handleError (status, basename);
839
857
}
@@ -951,7 +969,13 @@ void computeDecompositions(const char* basename,
951
969
if (!nonNfdOrRoundTrips) {
952
970
compositionPassthroughBound = c;
953
971
}
954
- if (len == 1 && utf32[0 ] <= 0xFFFF ) {
972
+ if (!len) {
973
+ if (!uts46) {
974
+ status.set (U_INTERNAL_PROGRAM_ERROR);
975
+ handleError (status, basename);
976
+ }
977
+ pendingTrieInsertions.push_back ({c, 0xFFFFFFFF , false });
978
+ } else if (len == 1 && utf32[0 ] <= 0xFFFF ) {
955
979
if (startsWithBackwardCombiningStarter) {
956
980
if (mainNormalizer == nfdNormalizer) {
957
981
// Not supposed to happen in NFD
0 commit comments