43
43
from itertools import batched
44
44
from typing import Callable , Iterable
45
45
46
- UNICODE_VERSION = "15.1 .0"
46
+ UNICODE_VERSION = "16.0 .0"
47
47
"""The version of the Unicode data files to download."""
48
48
49
49
NUM_CODEPOINTS = 0x110000
@@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
175
175
- 4th bit: whether to set top bit on emoji presentation.
176
176
If this is set but 3rd is not, the width mode is related to zwj sequences
177
177
- 5th from top: whether this is unaffected by ligature-transparent
178
+ (if set, should also set 3rd and 4th)
178
179
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
179
- where no ZWJ has been encountered yet; encountering one flips this on"""
180
+ where no ZWJ has been encountered yet; encountering one flips this on
181
+ - 7th bit: whether this is VS1 (if CJK) or VS2 (not CJK)
182
+ """
180
183
181
184
# BASIC WIDTHS
182
185
@@ -264,8 +267,17 @@ class WidthState(enum.IntEnum):
264
267
TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
265
268
"(\\ uE0061..=\\ uE007A){6} \\ uE007F \\ u200D `Emoji_Presentation`"
266
269
270
+ # Kirat Rai
271
+ KIRAT_RAI_VOWEL_SIGN_E = 0b0000_0000_0010_0000
272
+ "\\ u16D67 (\\ u16D67 \\ u16D67)+ and canonical equivalents"
273
+ KIRAT_RAI_VOWEL_SIGN_AI = 0b0000_0000_0010_0001
274
+ "(\\ u16D68)+ and canonical equivalents"
275
+
267
276
# VARIATION SELECTORS
268
277
278
+ VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
279
+ "\\ uFE00 if CJK, or \\ uFE01 otherwise"
280
+
269
281
# Text presentation sequences (not CJK)
270
282
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
271
283
"\\ uFE0E (text presentation sequences)"
@@ -361,6 +373,7 @@ def width_alone(self) -> int:
361
373
| WidthState .COMBINING_LONG_SOLIDUS_OVERLAY
362
374
| WidthState .VARIATION_SELECTOR_15
363
375
| WidthState .VARIATION_SELECTOR_16
376
+ | WidthState .VARIATION_SELECTOR_1_OR_2
364
377
):
365
378
return 0
366
379
case (
@@ -493,12 +506,6 @@ def load_zero_widths() -> list[bool]:
493
506
lambda cp : operator .setitem (zw_map , cp , True ),
494
507
)
495
508
496
- # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
497
- # as they canonically decompose to two characters with this property,
498
- # but they aren't.
499
- for c in [0x0CC0 , 0x0CC7 , 0x0CC8 , 0x0CCA , 0x0CCB , 0x1B3B , 0x1B3D , 0x1B43 ]:
500
- zw_map [c ] = True
501
-
502
509
# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
503
510
# as zero-width. This matches the behavior of glibc `wcwidth`.
504
511
#
@@ -639,6 +646,8 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
639
646
([0xA4FD ], WidthState .LISU_TONE_LETTER_MYA_NA_JEU ),
640
647
([0xFE0F ], WidthState .VARIATION_SELECTOR_16 ),
641
648
([0x10C03 ], WidthState .OLD_TURKIC_LETTER_ORKHON_I ),
649
+ ([0x16D67 ], WidthState .KIRAT_RAI_VOWEL_SIGN_E ),
650
+ ([0x16D68 ], WidthState .KIRAT_RAI_VOWEL_SIGN_AI ),
642
651
(emoji_presentation , WidthState .EMOJI_PRESENTATION ),
643
652
(emoji_modifiers , WidthState .EMOJI_MODIFIER ),
644
653
(regional_indicators , WidthState .REGIONAL_INDICATOR ),
@@ -648,9 +657,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
648
657
ea [cp ] = width
649
658
650
659
# East-Asian only
660
+ ea [0xFE00 ] = WidthState .VARIATION_SELECTOR_1_OR_2
651
661
ea [0x0338 ] = WidthState .COMBINING_LONG_SOLIDUS_OVERLAY
652
662
653
663
# Not East Asian only
664
+ not_ea [0xFE01 ] = WidthState .VARIATION_SELECTOR_1_OR_2
654
665
not_ea [0xFE0E ] = WidthState .VARIATION_SELECTOR_15
655
666
656
667
return (not_ea , ea )
@@ -716,7 +727,7 @@ def load_solidus_transparent(
716
727
cjk_width_map : list [WidthState ],
717
728
) -> list [tuple [Codepoint , Codepoint ]]:
718
729
"""Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
719
- Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
730
+ Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
720
731
"""
721
732
722
733
ccc_above_1 = set ()
@@ -748,7 +759,7 @@ def load_solidus_transparent(
748
759
num_chars = len (ccc_above_1 )
749
760
750
761
for cp in ccc_above_1 :
751
- if cp != 0xFE0F :
762
+ if cp not in [ 0xFE00 , 0xFE0F ] :
752
763
assert (
753
764
cjk_width_map [cp ].table_width () != CharWidthInTable .SPECIAL
754
765
), f"U+{ cp :X} "
@@ -1304,8 +1315,17 @@ def lookup_fns(
1304
1315
return (0, next_info.set_emoji_presentation());
1305
1316
}"""
1306
1317
1307
- if not is_cjk :
1318
+ if is_cjk :
1308
1319
s += """
1320
+ if c == '\\ u{FE00}' {
1321
+ return (0, next_info.set_vs1_2());
1322
+ }
1323
+ """
1324
+ else :
1325
+ s += """
1326
+ if c == '\\ u{FE01}' {
1327
+ return (0, next_info.set_vs1_2());
1328
+ }
1309
1329
if c == '\\ u{FE0E}' {
1310
1330
return (0, next_info.set_text_presentation());
1311
1331
}
@@ -1315,9 +1335,19 @@ def lookup_fns(
1315
1335
} else {
1316
1336
next_info = next_info.unset_text_presentation();
1317
1337
}
1318
- }"""
1338
+ } else """
1319
1339
1320
- s += """
1340
+ s += """if next_info.is_vs1_2() {
1341
+ if matches!(c, '\\ u{2018}' | '\\ u{2019}' | '\\ u{201C}' | '\\ u{201D}') {
1342
+ return ("""
1343
+
1344
+ s += str (2 - is_cjk )
1345
+
1346
+ s += """, WidthInfo::DEFAULT);
1347
+ } else {
1348
+ next_info = next_info.unset_vs1_2();
1349
+ }
1350
+ }
1321
1351
if next_info.is_ligature_transparent() {
1322
1352
if c == '\\ u{200D}' {
1323
1353
return (0, next_info.set_zwj_bit());
@@ -1496,6 +1526,22 @@ def lookup_fns(
1496
1526
return (0, WidthInfo::EMOJI_PRESENTATION)
1497
1527
}}
1498
1528
1529
+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D63}}') => {{
1530
+ return (0, WidthInfo::DEFAULT);
1531
+ }}
1532
+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D67}}') => {{
1533
+ return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
1534
+ }}
1535
+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D68}}') => {{
1536
+ return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
1537
+ }}
1538
+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\ u{{16D69}}') => {{
1539
+ return (0, WidthInfo::DEFAULT);
1540
+ }}
1541
+ (WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\ u{{16D63}}') => {{
1542
+ return (0, WidthInfo::DEFAULT);
1543
+ }}
1544
+
1499
1545
// Fallback
1500
1546
_ => {{}}
1501
1547
}}
@@ -1562,6 +1608,8 @@ def emit_module(
1562
1608
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1563
1609
struct WidthInfo(u16);
1564
1610
1611
+ const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;
1612
+
1565
1613
impl WidthInfo {
1566
1614
/// No special handling necessary
1567
1615
const DEFAULT: Self = Self(0);
@@ -1591,51 +1639,84 @@ def emit_module(
1591
1639
1592
1640
/// Has top bit set
1593
1641
fn is_emoji_presentation(self) -> bool {{
1594
- (self.0 & 0b1000_0000_0000_0000 ) == 0b1000_0000_0000_0000
1642
+ (self.0 & WidthInfo::VARIATION_SELECTOR_16.0 ) == WidthInfo::VARIATION_SELECTOR_16.0
1595
1643
}}
1596
1644
1597
- /// Has top bit set
1598
1645
fn is_zwj_emoji_presentation(self) -> bool {{
1599
1646
(self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
1600
1647
}}
1601
1648
1602
1649
/// Set top bit
1603
1650
fn set_emoji_presentation(self) -> Self {{
1604
- if (self.0 & 0b0010_0000_0000_0000 ) == 0b0010_0000_0000_0000
1651
+ if (self.0 & LIGATURE_TRANSPARENT_MASK ) == LIGATURE_TRANSPARENT_MASK
1605
1652
|| (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
1606
1653
{{
1607
- Self(self.0 | 0b1000_0000_0000_0000)
1654
+ Self(
1655
+ self.0
1656
+ | WidthInfo::VARIATION_SELECTOR_16.0
1657
+ & !WidthInfo::VARIATION_SELECTOR_15.0
1658
+ & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1659
+ )
1608
1660
}} else {{
1609
1661
Self::VARIATION_SELECTOR_16
1610
1662
}}
1611
1663
}}
1612
1664
1613
1665
/// Clear top bit
1614
1666
fn unset_emoji_presentation(self) -> Self {{
1615
- if (self.0 & 0b0010_0000_0000_0000 ) == 0b0010_0000_0000_0000 {{
1616
- Self(self.0 & 0b0111_1111_1111_1111 )
1667
+ if (self.0 & LIGATURE_TRANSPARENT_MASK ) == LIGATURE_TRANSPARENT_MASK {{
1668
+ Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0 )
1617
1669
}} else {{
1618
1670
Self::DEFAULT
1619
1671
}}
1620
1672
}}
1621
1673
1622
1674
/// Has 2nd bit set
1623
1675
fn is_text_presentation(self) -> bool {{
1624
- (self.0 & 0b0100_0000_0000_0000 ) == 0b0100_0000_0000_0000
1676
+ (self.0 & WidthInfo::VARIATION_SELECTOR_15.0 ) == WidthInfo::VARIATION_SELECTOR_15.0
1625
1677
}}
1626
1678
1627
1679
/// Set 2nd bit
1628
1680
fn set_text_presentation(self) -> Self {{
1629
- if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1630
- Self(self.0 | 0b0100_0000_0000_0000)
1681
+ if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1682
+ Self(
1683
+ self.0
1684
+ | WidthInfo::VARIATION_SELECTOR_15.0
1685
+ & !WidthInfo::VARIATION_SELECTOR_16.0
1686
+ & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1687
+ )
1631
1688
}} else {{
1632
- Self(0b0100_0000_0000_0000 )
1689
+ Self(WidthInfo::VARIATION_SELECTOR_15.0 )
1633
1690
}}
1634
1691
}}
1635
1692
1636
1693
/// Clear 2nd bit
1637
1694
fn unset_text_presentation(self) -> Self {{
1638
- Self(self.0 & 0b1011_1111_1111_1111)
1695
+ Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
1696
+ }}
1697
+
1698
+ /// Has 7th bit set
1699
+ fn is_vs1_2(self) -> bool {{
1700
+ (self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1701
+ }}
1702
+
1703
+ /// Set 7th bit
1704
+ fn set_vs1_2(self) -> Self {{
1705
+ if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1706
+ Self(
1707
+ self.0
1708
+ | WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1709
+ & !WidthInfo::VARIATION_SELECTOR_15.0
1710
+ & !WidthInfo::VARIATION_SELECTOR_16.0,
1711
+ )
1712
+ }} else {{
1713
+ Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1714
+ }}
1715
+ }}
1716
+
1717
+ /// Clear 7th bit
1718
+ fn unset_vs1_2(self) -> Self {{
1719
+ Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1639
1720
}}
1640
1721
}}
1641
1722
0 commit comments