27
27
import os
28
28
import re
29
29
import sys
30
+ import urllib .request
30
31
from collections import defaultdict
31
32
from itertools import batched
32
33
34
+ UNICODE_VERSION = "15.1.0"
35
+ """The version of the Unicode data files to download."""
36
+
33
37
NUM_CODEPOINTS = 0x110000
34
38
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
35
39
@@ -61,24 +65,28 @@ class OffsetType(enum.IntEnum):
61
65
62
66
If this is edited, you must ensure that `emit_module` reflects your changes."""
63
67
64
- MODULE_FILENAME = "tables.rs"
65
- """The filename of the emitted Rust module (will be created in the working directory)"""
68
+ MODULE_PATH = "../src/ tables.rs"
69
+ """The path of the emitted Rust module (relative to the working directory)"""
66
70
67
71
Codepoint = int
68
72
BitPos = int
69
73
70
74
71
def fetch_open(filename: str, local_prefix: str = ""):
    """Open `filename` and return its corresponding file object.

    The file is looked up on disk as `local_prefix` joined with the basename of
    `filename`. If it is not there, it is first downloaded from
    `https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{filename}` to that
    location.

    Args:
        filename: Path of the data file relative to the `ucd/` directory of the
            Unicode release. Only its basename is used for the local copy.
        local_prefix: Directory in which the local copy is looked up and stored.
            Defaults to the working directory.

    Returns:
        The file opened for reading as UTF-8 text. The caller is responsible
        for closing it.

    Exits with code 1 (after writing a message to stderr) if the file can be
    neither downloaded nor opened.
    """
    basename = os.path.basename(filename)
    localname = os.path.join(local_prefix, basename)
    if not os.path.exists(localname):
        url = f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{filename}"
        try:
            urllib.request.urlretrieve(url, localname)
        except OSError as err:
            # URLError, HTTPError and ContentTooShortError all derive from OSError,
            # as does the FileNotFoundError raised when `local_prefix` is missing.
            # Remove any partially written file so that a later run does not
            # mistake a truncated download for a valid cached copy.
            try:
                os.remove(localname)
            except OSError:
                pass  # Best effort: nothing was written, or it cannot be removed.
            sys.stderr.write(f"cannot fetch {url}: {err}\n")
            sys.exit(1)
    try:
        return open(localname, encoding="utf-8")
    except OSError:
        sys.stderr.write(f"cannot load {localname}\n")
        sys.exit(1)
83
91
84
92
@@ -637,7 +645,7 @@ def emit_module(
637
645
module .write ("}\n " )
638
646
639
647
640
- def main (module_filename : str ):
648
+ def main (module_path : str ):
641
649
"""Obtain character data from the latest version of Unicode, transform it into a multi-level
642
650
lookup table for character width, and write a Rust module utilizing that table to
643
651
`module_path`.
@@ -677,6 +685,9 @@ def main(module_filename: str):
677
685
emoji_variations = load_variation_sequences ()
678
686
variation_table = make_variation_sequence_table (emoji_variations , width_map )
679
687
688
+ # Download normalization test file for use by tests
689
+ fetch_open ("NormalizationTest.txt" , "../tests/" )
690
+
680
691
print ("------------------------" )
681
692
total_size = 0
682
693
for i , table in enumerate (tables ):
@@ -692,9 +703,9 @@ def main(module_filename: str):
692
703
print ("------------------------" )
693
704
print (f" Total size: { total_size } bytes" )
694
705
695
- emit_module (module_filename , version , tables , variation_table )
696
- print (f'Wrote to "{ module_filename } "' )
706
+ emit_module (module_path , version , tables , variation_table )
707
+ print (f'Wrote to "{ module_path } "' )
697
708
698
709
699
710
if __name__ == "__main__" :
700
- main (MODULE_FILENAME )
711
+ main (MODULE_PATH )
0 commit comments