Skip to content

Commit aed33e9

Browse files
Treat Default_Ignorable_Code_Points as zero-width
1 parent 8942487 commit aed33e9

File tree

4 files changed

+246
-200
lines changed

4 files changed

+246
-200
lines changed

scripts/unicode.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,14 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
147147
def load_zero_widths() -> "list[bool]":
148148
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
149149
character. `c` is considered a zero-width character if `c` is in general categories
150-
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
150+
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
151+
or if it has the `Default_Ignorable_Code_Point` property (determined by fetching
152+
and processing `DerivedCoreProperties.txt`)."""
153+
154+
zw_cat_codes = ["Cc", "Cf", "Mn", "Me"]
155+
zw_map = []
156+
151157
with fetch_open("UnicodeData.txt") as categories:
152-
zw_map = []
153158
current = 0
154159
for line in categories.readlines():
155160
if len(raw_data := line.split(";")) != 15:
@@ -159,7 +164,7 @@ def load_zero_widths() -> "list[bool]":
159164
raw_data[1],
160165
raw_data[2],
161166
]
162-
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
167+
zero_width = cat_code in zw_cat_codes
163168

164169
assert current <= codepoint
165170
while current <= codepoint:
@@ -176,7 +181,26 @@ def load_zero_widths() -> "list[bool]":
176181
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
177182
zw_map.append(False)
178183

179-
return zw_map
184+
with fetch_open("DerivedCoreProperties.txt") as properties:
185+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
186+
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
187+
188+
for line in properties.readlines():
189+
raw_data = None # (low, high, category)
190+
if match := single.match(line):
191+
raw_data = (match.group(1), match.group(1), match.group(2))
192+
elif match := multiple.match(line):
193+
raw_data = (match.group(1), match.group(2), match.group(3))
194+
else:
195+
continue
196+
low = int(raw_data[0], 16)
197+
high = int(raw_data[1], 16)
198+
cat = raw_data[2]
199+
if cat not in zw_cat_codes:
200+
for cp in range(low, high + 1):
201+
zw_map[cp] = True
202+
203+
return zw_map
180204

181205

182206
class Bucket:

src/lib.rs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@
4343
//! ```
4444
4545
#![deny(missing_docs, unsafe_code)]
46-
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
47-
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
48-
46+
#![doc(
47+
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
48+
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
49+
)]
4950
#![cfg_attr(feature = "bench", feature(test))]
5051
#![no_std]
5152

@@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {
8788

8889
impl UnicodeWidthChar for char {
8990
#[inline]
90-
fn width(self) -> Option<usize> { cw::width(self, false) }
91+
fn width(self) -> Option<usize> {
92+
cw::width(self, false)
93+
}
9194

9295
#[inline]
93-
fn width_cjk(self) -> Option<usize> { cw::width(self, true) }
96+
fn width_cjk(self) -> Option<usize> {
97+
cw::width(self, true)
98+
}
9499
}
95100

96101
/// Methods for determining displayed width of Unicode strings.
@@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {
103108
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
104109
/// as 1 column wide. This is consistent with the recommendations for
105110
/// non-CJK contexts, or when the context cannot be reliably determined.
106-
fn width<'a>(&'a self) -> usize;
111+
fn width(&self) -> usize;
107112

108113
/// Returns the string's displayed width in columns.
109114
///
@@ -113,7 +118,7 @@ pub trait UnicodeWidthStr {
113118
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
114119
/// as 2 columns wide. This is consistent with the recommendations for
115120
/// CJK contexts.
116-
fn width_cjk<'a>(&'a self) -> usize;
121+
fn width_cjk(&self) -> usize;
117122
}
118123

119124
impl UnicodeWidthStr for str {

0 commit comments

Comments
 (0)