Skip to content

Commit b8389b1

Browse files
hsivonen authored and sffc committed
ICU-22718 Export disallowed/ignored UTS 46 data for ICU4X
1 parent 43b0901 commit b8389b1

File tree

1 file changed

+40
-16
lines changed

1 file changed

+40
-16
lines changed

icu4c/source/tools/icuexportdata/icuexportdata.cpp

Lines changed: 40 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -755,9 +755,12 @@ void computeDecompositions(const char* basename,
755755
std::vector<uint32_t> nonRecursive32;
756756
LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
757757

758+
UBool uts46 = false;
759+
758760
if (uprv_strcmp(basename, "nfkd") == 0) {
759761
mainNormalizer = Normalizer2::getNFKDInstance(status);
760762
} else if (uprv_strcmp(basename, "uts46d") == 0) {
763+
uts46 = true;
761764
mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
762765
} else {
763766
mainNormalizer = nfdNormalizer;
@@ -817,23 +820,38 @@ void computeDecompositions(const char* basename,
817820
nfcNormalizer->normalize(dst, nfc, status);
818821
nonNfdOrRoundTrips = (src == nfc);
819822
}
823+
if (uts46) {
824+
// Work around https://unicode-org.atlassian.net/browse/ICU-22658
825+
// TODO: Remove the workaround after data corresponding to
826+
// https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands
827+
// for Unicode 16.
828+
switch (c) {
829+
case 0x2F868:
830+
dst.truncate(0);
831+
dst.append(UChar32(0x36FC));
832+
break;
833+
case 0x2F874:
834+
dst.truncate(0);
835+
dst.append(UChar32(0x5F53));
836+
break;
837+
case 0x2F91F:
838+
dst.truncate(0);
839+
dst.append(UChar32(0x243AB));
840+
break;
841+
case 0x2F95F:
842+
dst.truncate(0);
843+
dst.append(UChar32(0x7AEE));
844+
break;
845+
case 0x2F9BF:
846+
dst.truncate(0);
847+
dst.append(UChar32(0x45D7));
848+
break;
849+
}
850+
}
820851
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
852+
821853
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
822-
// Characters that normalize to nothing or to U+FFFD (without the
823-
// input being U+FFFD) in ICU4C's UTS 46 normalization normalize
824-
// as in NFD in ICU4X's UTF 46 normalization in the interest
825-
// of data size and ICU4X's normalizer being unable to handle
826-
// normalizing to nothing.
827-
// When UTS 46 is implemented on top of ICU4X, a preprocessing
828-
// step is supposed to remove these characters before the
829-
// normalization step.
830-
if (uprv_strcmp(basename, "uts46d") != 0) {
831-
status.set(U_INTERNAL_PROGRAM_ERROR);
832-
handleError(status, basename);
833-
}
834-
nfdNormalizer->normalize(src, dst, status);
835-
len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
836-
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
854+
if (!uts46) {
837855
status.set(U_INTERNAL_PROGRAM_ERROR);
838856
handleError(status, basename);
839857
}
@@ -951,7 +969,13 @@ void computeDecompositions(const char* basename,
951969
if (!nonNfdOrRoundTrips) {
952970
compositionPassthroughBound = c;
953971
}
954-
if (len == 1 && utf32[0] <= 0xFFFF) {
972+
if (!len) {
973+
if (!uts46) {
974+
status.set(U_INTERNAL_PROGRAM_ERROR);
975+
handleError(status, basename);
976+
}
977+
pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false});
978+
} else if (len == 1 && utf32[0] <= 0xFFFF) {
955979
if (startsWithBackwardCombiningStarter) {
956980
if (mainNormalizer == nfdNormalizer) {
957981
// Not supposed to happen in NFD

0 commit comments

Comments
 (0)