17
17
# - HangulSyllableType.txt
18
18
# - PropList.txt
19
19
# - ReadMe.txt
20
+ # - emoji/emoji-variation-sequences.txt
20
21
#
21
22
# Since this should not require frequent updates, we just store this
22
23
# out-of-line and check the generated module into git.
26
27
import os
27
28
import re
28
29
import sys
30
+ from collections import defaultdict
31
+ from itertools import batched
29
32
30
33
NUM_CODEPOINTS = 0x110000
31
34
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
@@ -69,12 +72,13 @@ def fetch_open(filename: str):
69
72
"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
70
73
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
71
74
"""
72
- if not os .path .exists (os .path .basename (filename )):
75
+ basename = os .path .basename (filename )
76
+ if not os .path .exists (basename ):
73
77
os .system (f"curl -O http://www.unicode.org/Public/UNIDATA/{ filename } " )
74
78
try :
75
- return open (filename , encoding = "utf-8" )
79
+ return open (basename , encoding = "utf-8" )
76
80
except OSError :
77
- sys .stderr .write (f"cannot load { filename } " )
81
+ sys .stderr .write (f"cannot load { basename } " )
78
82
sys .exit (1 )
79
83
80
84
@@ -384,8 +388,71 @@ def make_tables(
384
388
return tables
385
389
386
390
391
def load_variation_sequences() -> "list[int]":
    """Outputs a list of codepoints, one for each character that can start
    an emoji presentation sequence.

    A character qualifies when `emoji/emoji-variation-sequences.txt` has a line
    consisting of that single codepoint followed by U+FE0F and labeled "emoji style".
    Codepoints are returned as integers, in the order their lines appear in the file.
    """

    # Match all emoji presentation sequences
    # (one codepoint followed by U+FE0F, and labeled "emoji style")
    sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")

    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Iterate over the file object directly: this streams line by line
        # rather than loading every line into memory with `readlines()`.
        return [
            int(match.group(1), 16)
            for line in sequences
            if (match := sequence.match(line))
        ]
405
+
406
+
407
def make_variation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
) -> "tuple[list[int], list[list[int]]]":
    """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
    (Characters that are always wide may be excluded.)
    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.

    `seqs` is the list of codepoints that start an emoji presentation sequence;
    `width_map` is indexed by codepoint and gives each codepoint's effective width.

    Returns `(indexes, leaves)`, two lists of equal length. `indexes[i]` is the value of
    `cp >> 10` handled by `leaves[i]`. Each leaf is a list of 128 byte values; the bit for
    a codepoint `cp` is bit `cp & 7` of byte `(cp & 0x3FF) >> 3`.
    """

    # Group the low 10 bits of every codepoint under its high-bits prefix.
    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # We don't strictly need to keep track of characters that are always wide,
    # because being in an emoji variation seq won't affect their width.
    # So store their info only when it wouldn't inflate the size of the tables.
    # (Iterate over a snapshot of the keys, since entries are deleted in the loop.)
    for k in list(prefixes_dict):
        if all(
            width_map[(k << 10) | cp] == EffectiveWidth.WIDE
            for cp in prefixes_dict[k]
        ):
            del prefixes_dict[k]

    indexes = list(prefixes_dict)

    # Similarly, we can spuriously return `true` for always-wide characters
    # even if not part of a presentation seq; this saves an additional lookup,
    # so we should do it where there is no size cost.
    # Membership is tested against the dict (same keys as `indexes`) so that each
    # check is a hash lookup rather than a list scan; `in` on a defaultdict
    # does not create missing keys.
    for cp, width in enumerate(width_map):
        if width == EffectiveWidth.WIDE and (cp >> 10) in prefixes_dict:
            prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # Pack each prefix's set of 10-bit offsets into a 128-byte (1024-bit) bitmap.
    # Leaves are produced in the same dict order as `indexes`.
    leaves = []
    for cps in prefixes_dict.values():
        leaf = [0] * 128
        for cp in cps:
            idx_in_leaf, bit_shift = divmod(cp, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift
        leaves.append(leaf)
    return (indexes, leaves)
449
+
450
+
387
451
def emit_module (
388
- out_name : str , unicode_version : "tuple[int, int, int]" , tables : "list[Table]"
452
+ out_name : str ,
453
+ unicode_version : "tuple[int, int, int]" ,
454
+ tables : "list[Table]" ,
455
+ variation_table : "tuple[list[int], list[list[int]]]" ,
389
456
):
390
457
"""Outputs a Rust module to `out_name` using table data from `tables`.
391
458
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -462,6 +529,40 @@ def emit_module(
462
529
"""
463
530
)
464
531
532
+ variation_idx , variation_leaves = variation_table
533
+
534
+ module .write (
535
+ """
536
+ /// Whether this character forms an [emoji presentation sequence]
537
+ /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
538
+ /// when followed by `'\\ u{FEOF}'`.
539
+ /// Emoji presentation sequences are considered to have width 2.
540
+ /// This may spuriously return `true` or `false` for characters that are always wide.
541
+ #[inline]
542
+ pub fn starts_emoji_presentation_seq(c: char) -> bool {
543
+ let cp: u32 = c.into();
544
+ // First level of lookup uses all but 10 LSB
545
+ let top_bits = cp >> 10;
546
+ let idx_of_leaf: usize = match top_bits {
547
+ """
548
+ )
549
+
550
+ for i , msbs in enumerate (variation_idx ):
551
+ module .write (f" { msbs } => { i } ,\n " )
552
+
553
+ module .write (
554
+ """ _ => return false,
555
+ };
556
+ // Extract the 3-9th (0-indexed) least significant bits of `cp`,
557
+ // and use them to index into `leaf_row`.
558
+ let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
559
+ let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
560
+ // Use the 3 LSB of `cp` to index into `leaf_byte`.
561
+ ((leaf_byte >> (cp & 7)) & 1) == 1
562
+ }
563
+ """
564
+ )
565
+
465
566
module .write (
466
567
"""
467
568
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -510,6 +611,29 @@ def emit_module(
510
611
module .write (f" 0x{ byte :02X} ," )
511
612
module .write ("\n ];\n " )
512
613
subtable_count = new_subtable_count
614
+
615
+ # emoji table
616
+
617
+ module .write (
618
+ f"""
619
+ #[repr(align(128))]
620
+ struct Align128<T>(T);
621
+ /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
622
+ /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
623
+ static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; { len (variation_leaves )} ]> = Align128([
624
+ """
625
+ )
626
+ for leaf in variation_leaves :
627
+ module .write (" [\n " )
628
+ for row in batched (leaf , 14 ):
629
+ module .write (" " )
630
+ for entry in row :
631
+ module .write (f" 0x{ entry :02X} ," )
632
+ module .write ("\n " )
633
+ module .write (" ],\n " )
634
+
635
+ module .write (" ]);\n " )
636
+
513
637
module .write ("}\n " )
514
638
515
639
@@ -520,6 +644,7 @@ def main(module_filename: str):
520
644
521
645
We obey the following rules, in decreasing order of importance:
522
646
647
+ - Emoji presentation sequences are double-width.
523
648
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
524
649
- Hangul jamo medial vowels & final consonants are zero-width.
525
650
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
@@ -549,16 +674,25 @@ def main(module_filename: str):
549
674
550
675
tables = make_tables (TABLE_CFGS , enumerate (width_map ))
551
676
677
+ emoji_variations = load_variation_sequences ()
678
+ variation_table = make_variation_sequence_table (emoji_variations , width_map )
679
+
552
680
print ("------------------------" )
553
681
total_size = 0
554
682
for i , table in enumerate (tables ):
555
683
size_bytes = len (table .to_bytes ())
556
- print (f"Table { i } Size : { size_bytes } bytes" )
684
+ print (f"Table { i } size : { size_bytes } bytes" )
557
685
total_size += size_bytes
686
+ emoji_index_size = len (variation_table [0 ]) * 4
687
+ print (f"Emoji presentation index size: { emoji_index_size } bytes" )
688
+ total_size += emoji_index_size
689
+ emoji_leaves_size = len (variation_table [1 ]) * len (variation_table [1 ][0 ])
690
+ print (f"Emoji presentation leaves size: { emoji_leaves_size } bytes" )
691
+ total_size += emoji_leaves_size
558
692
print ("------------------------" )
559
- print (f" Total Size : { total_size } bytes" )
693
+ print (f" Total size : { total_size } bytes" )
560
694
561
- emit_module (module_filename , version , tables )
695
+ emit_module (module_filename , version , tables , variation_table )
562
696
print (f'Wrote to "{ module_filename } "' )
563
697
564
698
0 commit comments