@@ -4,94 +4,94 @@ class UnicodeTrie
4
4
5
5
# Right-shift applied to a code point to obtain its index-2 table offset.
SHIFT_2 = 5

# Distance between the two shift sizes; turns an index-2 offset into an
# index-1 offset. 6 = 11 - 5
SHIFT_1_2 = SHIFT_1 - SHIFT_2

# How many index-1 entries the BMP would take up. 32 = 0x20
# The serialized form leaves this part of the index-1 table out.
OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1

# Entry count of a single index-2 block. 64 = 0x40
INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2

# Mask that keeps the low bits giving the offset inside an index-2 block.
INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1

# Left-shift applied to values read from the index array.
# It allows a larger data size with 16-bit index values, at the cost of
# compactability, and requires data blocks to be aligned by DATA_GRANULARITY.
INDEX_SHIFT = 2

# Entry count of a single data block. 32 = 0x20
DATA_BLOCK_LENGTH = 1 << SHIFT_2

# Mask that keeps the low bits giving the offset inside a data block.
DATA_MASK = DATA_BLOCK_LENGTH - 1

# The index-2 entries for U+D800..U+DBFF hold values for lead surrogate
# code _units_, not code _points_. Values for lead surrogate code _points_
# are looked up through the separate portion of the table described here.
# Length = 32 = 0x20 = 0x400 >> SHIFT_2. (There are 1024 = 0x400 lead surrogates.)
LSCP_INDEX_2_OFFSET = 0x10000 >> SHIFT_2
LSCP_INDEX_2_LENGTH = 0x400 >> SHIFT_2

# Combined length of both BMP pieces. 2080 = 0x820
INDEX_2_BMP_LENGTH = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH

# The 2-byte UTF-8 version of the index-2 table comes next, at offset 2080 = 0x820.
# Its length is 32 = 0x20 for lead bytes C0..DF, regardless of SHIFT_2.
UTF8_2B_INDEX_2_OFFSET = INDEX_2_BMP_LENGTH
UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6 # U+0800 is the first code point after 2-byte UTF-8

# The index-1 table, used only for supplementary code points, sits at
# offset 2112 = 0x840. Its length varies: it covers code points up to
# highStart, where the last single-value range starts.
# Maximum length 512 = 0x200 = 0x100000 >> SHIFT_1.
# (For the 0x100000 supplementary code points U+10000..U+10ffff.)
#
# The part of the index-2 table for supplementary code points begins
# right after this index-1 table.
#
# When there is only BMP data, both the index-1 table and that following
# part of the index-2 table are left out entirely.
INDEX_1_OFFSET = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH

# Alignment size of a data block; also the granularity used for compaction.
DATA_GRANULARITY = 1 << INDEX_SHIFT
63
-
63
+
64
64
# Build a trie from a plain description object.
#
# json - optional object (defaults to {}) that may carry:
#   data       - array holding the index and data entries; falls back to []
#                when missing or falsy.
#   highStart  - code point compared against in `get`; falls back to 0
#                when null or undefined.
#   errorValue - value `get` returns for out-of-range input; falls back
#                to -1 when null or undefined.
constructor: (json = {}) ->
  {data, highStart, errorValue} = json
  @data = data or []
  @highStart = highStart ? 0
  @errorValue = errorValue ? -1
68
-
68
+
69
69
# Look up the value stored for a code point.
#
# codePoint - numeric code point to look up.
#
# Returns @errorValue when codePoint is below 0 or above 0x10ffff,
# otherwise the entry of @data selected by the trie lookup.
get: (codePoint) ->
  return @errorValue if codePoint < 0 or codePoint > 0x10ffff

  if codePoint <= 0xffff
    if codePoint < 0xd800 or codePoint > 0xdbff
      # Ordinary BMP code point (not a lead surrogate): single-level lookup.
      # The BMP index starts at offset 0 of the index, and the data lives
      # in the same array as the index.
      slot = codePoint >> SHIFT_2
    else
      # Lead surrogate code point. The main index holds the values for lead
      # surrogate code _units_; the code _point_ values needed here are kept
      # in a separate index section starting at LSCP_INDEX_2_OFFSET.
      slot = LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> SHIFT_2)
    dataBlock = @data[slot]
  else if codePoint < @highStart
    # Supplementary code point below highStart: two-level lookup, first
    # through the index-1 table, then through the index-2 block it names.
    index2Start = @data[(INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> SHIFT_1)]
    dataBlock = @data[index2Start + ((codePoint >> SHIFT_2) & INDEX_2_MASK)]
  else
    # Everything else shares the single entry located DATA_GRANULARITY
    # positions before the end of the array.
    return @data[@data.length - DATA_GRANULARITY]

  # Scale the index entry up to a data offset, then add the position of
  # the code point inside its data block.
  @data[(dataBlock << INDEX_SHIFT) + (codePoint & DATA_MASK)]
96
-
96
+
97
97
module .exports = UnicodeTrie
0 commit comments