Skip to content

Commit 9c4477c

Browse files
authored
Merge pull request #37 from Jules-Bertholet/canonical-equivalence
Ensure that canonically equivalent strings have the same width
2 parents 7c489c3 + fdf5eb7 commit 9c4477c

File tree

4 files changed

+382
-323
lines changed

4 files changed

+382
-323
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2222
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2323
compiler_builtins = { version = "0.1", optional = true }
2424

25+
[dev-dependencies]
26+
unicode-normalization = "0.1.23"
27+
2528
[features]
2629
default = []
2730
bench = []

scripts/unicode.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,14 +150,15 @@ def load_zero_widths() -> "list[bool]":
150150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151151
character. `c` is considered a zero-width character if
152152
153-
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
153+
- it is in general category `Cc`,
154+
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
154155
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
155156
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
156157
"""
157158

158159
zw_map = []
159160

160-
# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
161+
# Characters with general category `Cc` have 0 width
161162
with fetch_open("UnicodeData.txt") as categories:
162163
current = 0
163164
for line in categories.readlines():
@@ -168,7 +169,7 @@ def load_zero_widths() -> "list[bool]":
168169
raw_data[1],
169170
raw_data[2],
170171
]
171-
zero_width = cat_code in ["Cc", "Mn", "Me"]
172+
zero_width = cat_code == "Cc"
172173

173174
assert current <= codepoint
174175
while current <= codepoint:
@@ -188,10 +189,16 @@ def load_zero_widths() -> "list[bool]":
188189
# `Default_Ignorable_Code_Point`s also have 0 width:
189190
# https://www.unicode.org/faq/unsup_char.html#3
190191
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
192+
#
193+
# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
194+
# as well as a few `Mc` characters that need to be included so that
195+
# canonically equivalent sequences have the same width.
191196
with fetch_open("DerivedCoreProperties.txt") as properties:
192-
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
197+
single = re.compile(
198+
r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
199+
)
193200
multiple = re.compile(
194-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
201+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195202
)
196203

197204
for line in properties.readlines():
@@ -240,6 +247,19 @@ def load_zero_widths() -> "list[bool]":
240247
# (which are considered 0-width on their own) to form a composed Hangul syllable with
241248
# width 2. Therefore, we treat it as having width 2.
242249
zw_map[0x115F] = False
250+
251+
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
252+
# as they canonically decompose to two characters with this property,
253+
# but the Unicode data does not actually assign them that property.
254+
zw_map[0x0CC0] = True
255+
zw_map[0x0CC7] = True
256+
zw_map[0x0CC8] = True
257+
zw_map[0x0CCA] = True
258+
zw_map[0x0CCB] = True
259+
zw_map[0x1B3B] = True
260+
zw_map[0x1B3D] = True
261+
zw_map[0x1B43] = True
262+
243263
return zw_map
244264

245265

0 commit comments

Comments
 (0)