23
23
* @since 2013-jan-02 (mostly pulled out of {@link FractionalUCA})
24
24
*/
25
25
public final class PrimariesToFractional {
26
+ /**
27
+ * Scripts that start reordering groups, and normally get two-byte primary weights.
28
+ */
26
29
private static final BitSet MAJOR_SCRIPTS = new BitSet ();
27
30
static {
28
31
for (byte i : new Byte []{
29
32
UCD_Types .ARABIC_SCRIPT ,
30
33
UCD_Types .ARMENIAN_SCRIPT ,
31
34
UCD_Types .BENGALI_SCRIPT ,
32
35
UCD_Types .BOPOMOFO_SCRIPT ,
36
+ UCD_Types .CHEROKEE_SCRIPT ,
33
37
UCD_Types .CYRILLIC_SCRIPT ,
34
38
UCD_Types .DEVANAGARI_SCRIPT ,
35
39
UCD_Types .ETHIOPIC_SCRIPT ,
@@ -178,7 +182,7 @@ public int getFractionalPrimary() {
178
182
/**
179
183
* Computes valid FractionalUCA primary weights of desired byte lengths.
180
184
* Always starts with the first primary weight after 02.
181
- * {@link PrimaryWeight#next(int, boolean )} increments
185
+ * {@link PrimaryWeight#next(int)} increments
182
186
* one 1/2/3-byte weight to another 1/2/3-byte weight.
183
187
*/
184
188
private static class PrimaryWeight {
@@ -199,13 +203,18 @@ private static class PrimaryWeight {
199
203
private static final int MAX2_COMPRESSED = 0xfe ;
200
204
201
205
/**
202
- * Increment byte2 a little more at script boundaries
203
- * and around single-byte primaries,
206
+ * Increment byte2 a little more around single-byte primaries,
207
+ * for tailoring of at least 4 two-byte primaries or more than 1000 three-byte primaries.
208
+ */
209
+ private static final int GAP2_FOR_SINGLE = 4 ;
210
+ /**
211
+ * Increment byte2 a little more at major-script boundaries,
204
212
* for tailoring of at least 4 two-byte primaries or more than 1000 three-byte primaries.
205
213
*/
206
- private static final int GAP2_PLUS = 4 ;
214
+ private static final int GAP2_FOR_MAJOR_SCRIPT = 4 ;
207
215
208
216
private static final int GAP3 = 7 ;
217
+ private static final int GAP3_FOR_MINOR_SCRIPT = 40 ;
209
218
210
219
private int minByte2 = MIN2_UNCOMPRESSED ;
211
220
private int maxByte2 = MAX2_UNCOMPRESSED ;
@@ -218,21 +227,31 @@ private static class PrimaryWeight {
218
227
private int byte3 ;
219
228
private int lastByteLength = 1 ;
220
229
private boolean compressibleLeadByte ;
230
+ /**
231
+ * The first script in each group is a "major" script and gets a somewhat larger gap
232
+ * before its first primary and after its last primary.
233
+ */
234
+ private boolean firstScriptInGroup ;
235
+ /**
236
+ * Leave a somewhat larger gap between the special script-first primary
237
+ * and the first real letter primary.
238
+ */
239
+ private boolean firstPrimaryInScript ;
221
240
222
241
public int getIntValue () {
223
242
return (byte1 << 16 ) + (byte2 << 8 ) + byte3 ;
224
243
}
225
244
226
- public int startNewGroup (int newByteLength , boolean compress ) {
245
+ public int startNewGroup (boolean compress ) {
227
246
int oByte1 = byte1 ;
228
247
int oByte2 = byte2 ;
229
248
230
249
int inc1 ;
231
250
if (lastByteLength == 1 ) {
232
251
// Single-byte gap of 1 from a single-byte weight to the new reordering group.
233
252
inc1 = 2 ;
234
- } else if ((byte2 + GAP2_PLUS ) <= maxByte2 ) {
235
- // End-of-script two-byte-weight gap.
253
+ } else if ((byte2 + GAP2_FOR_SINGLE ) <= maxByte2 ) {
254
+ // End-of-group two-byte-weight gap.
236
255
inc1 = 1 ;
237
256
} else {
238
257
// The two-byte-weight gap would be too small.
@@ -241,33 +260,55 @@ public int startNewGroup(int newByteLength, boolean compress) {
241
260
addTo1 (inc1 );
242
261
243
262
int newMinByte2 = compress ? MIN2_COMPRESSED : MIN2_UNCOMPRESSED ;
244
- switch (newByteLength ) {
245
- case 1 :
246
- byte2 = byte3 = 0 ;
247
- break ;
248
- case 2 :
249
- byte2 = newMinByte2 ;
250
- byte3 = 0 ;
251
- break ;
252
- case 3 :
253
- byte2 = newMinByte2 ;
254
- byte3 = MIN_BYTE ;
255
- break ;
256
- }
263
+ byte2 = newMinByte2 ;
264
+ byte3 = MIN_BYTE ;
257
265
258
- check (oByte1 , oByte2 , newByteLength , true );
266
+ check (oByte1 , oByte2 , 3 , true );
259
267
260
268
compressibleLeadByte = compress ;
261
269
minByte2 = newMinByte2 ;
262
270
maxByte2 = compressibleLeadByte ? MAX2_COMPRESSED : MAX2_UNCOMPRESSED ;;
263
- lastByteLength = newByteLength ;
271
+ lastByteLength = 3 ;
272
+ firstScriptInGroup = firstPrimaryInScript = true ;
273
+ return getIntValue ();
274
+ }
275
+
276
+ public int startNewScript () {
277
+ int oByte1 = byte1 ;
278
+ int oByte2 = byte2 ;
279
+
280
+ if (lastByteLength == 1 ) {
281
+ // Larger two-byte gap after a single.
282
+ addTo1 (1 );
283
+ byte2 = minByte2 + GAP2_FOR_SINGLE ;
284
+ byte3 = MIN_BYTE ;
285
+ } else if (firstScriptInGroup ) {
286
+ // End-of-major-script two-byte-weight gap.
287
+ addTo2 (GAP2_FOR_MAJOR_SCRIPT + 1 );
288
+ byte3 = MIN_BYTE ;
289
+ } else if (lastByteLength == 2 ) {
290
+ // At least a two-byte gap after a double.
291
+ addTo2 (2 );
292
+ byte3 = MIN_BYTE ;
293
+ } else /* lastByteLength == 3 */ {
294
+ addTo3 (GAP3_FOR_MINOR_SCRIPT + 1 );
295
+ }
296
+
297
+ check (oByte1 , oByte2 , 3 , false );
298
+
299
+ lastByteLength = 3 ;
300
+ firstScriptInGroup = false ;
301
+ firstPrimaryInScript = true ;
264
302
return getIntValue ();
265
303
}
266
304
267
- public int next (int newByteLength , boolean scriptChange ) {
305
+ public int next (int newByteLength ) {
268
306
int oByte1 = byte1 ;
269
307
int oByte2 = byte2 ;
270
308
309
+ // Script-first primaries are three-byters.
310
+ assert !firstPrimaryInScript || lastByteLength == 3 ;
311
+
271
312
switch (lastByteLength ) {
272
313
case 1 :
273
314
switch (newByteLength ) {
@@ -278,12 +319,12 @@ public int next(int newByteLength, boolean scriptChange) {
278
319
case 2 :
279
320
// Larger two-byte gap after a single.
280
321
addTo1 (1 );
281
- byte2 = minByte2 + GAP2_PLUS ;
322
+ byte2 = minByte2 + GAP2_FOR_SINGLE ;
282
323
break ;
283
324
case 3 :
284
325
// Larger two-byte gap after a single.
285
326
addTo1 (1 );
286
- byte2 = minByte2 + GAP2_PLUS ;
327
+ byte2 = minByte2 + GAP2_FOR_SINGLE ;
287
328
byte3 = MIN_BYTE ;
288
329
break ;
289
330
}
@@ -292,16 +333,16 @@ public int next(int newByteLength, boolean scriptChange) {
292
333
switch (newByteLength ) {
293
334
case 1 :
294
335
// At least a larger two-byte gap before a single.
295
- addTo1 ((byte2 + GAP2_PLUS ) <= maxByte2 ? 1 : 2 );
336
+ addTo1 ((byte2 + GAP2_FOR_SINGLE ) <= maxByte2 ? 1 : 2 );
296
337
byte2 = 0 ;
297
338
break ;
298
339
case 2 :
299
340
// Normal two-byte gap.
300
- addTo2 (scriptChange ? GAP2_PLUS + 1 : 2 );
341
+ addTo2 (2 );
301
342
break ;
302
343
case 3 :
303
344
// At least a two-byte gap after a double.
304
- addTo2 (scriptChange ? GAP2_PLUS + 1 : 2 );
345
+ addTo2 (2 );
305
346
byte3 = MIN_BYTE ;
306
347
break ;
307
348
}
@@ -310,25 +351,28 @@ public int next(int newByteLength, boolean scriptChange) {
310
351
switch (newByteLength ) {
311
352
case 1 :
312
353
// At least a larger two-byte gap before a single.
313
- addTo1 ((byte2 + GAP2_PLUS ) <= maxByte2 ? 1 : 2 );
354
+ addTo1 ((byte2 + GAP2_FOR_SINGLE ) <= maxByte2 ? 1 : 2 );
314
355
byte2 = byte3 = 0 ;
315
356
break ;
316
357
case 2 :
317
- // At least a two-byte gap before a double.
318
- addTo2 (scriptChange ? GAP2_PLUS + 1 : 2 );
358
+ if (firstPrimaryInScript && firstScriptInGroup ) {
359
+ // Larger two-byte gap before the first letter of a major script.
360
+ addTo2 (GAP2_FOR_MAJOR_SCRIPT + 1 );
361
+ } else {
362
+ // At least a two-byte gap before a double.
363
+ addTo2 (2 );
364
+ }
319
365
byte3 = 0 ;
320
366
break ;
321
367
case 3 :
322
- if (scriptChange ) {
323
- // TODO: At least a small two-byte gap between minor scripts.
324
- // Issue: At least one of the miscellaneous-scripts reordering groups
325
- // overflows with this.
326
- // addTo2(2);
327
- // byte3 = MIN_BYTE;
328
-
329
- // TODO: The following is a smaller gap in the meantime.
330
- // At least a large three-byte gap between minor scripts.
331
- addTo3 (40 + 1 );
368
+ if (firstPrimaryInScript ) {
369
+ if (firstScriptInGroup ) {
370
+ // Larger two-byte gap before the first letter of a major script.
371
+ addTo2 (GAP2_FOR_MAJOR_SCRIPT );
372
+ } else {
373
+ // Larger three-byte gap before the first letter of a minor script.
374
+ addTo3 (GAP3_FOR_MINOR_SCRIPT + 1 );
375
+ }
332
376
} else {
333
377
// Normal three-byte gap.
334
378
addTo3 (GAP3 + 1 );
@@ -341,6 +385,7 @@ public int next(int newByteLength, boolean scriptChange) {
341
385
check (oByte1 , oByte2 , newByteLength , false );
342
386
343
387
lastByteLength = newByteLength ;
388
+ firstPrimaryInScript = false ;
344
389
return getIntValue ();
345
390
}
346
391
@@ -449,6 +494,7 @@ public PrimariesToFractional(UCA uca) {
449
494
groupIsCompressible [UCD_Types .TIBETAN_SCRIPT ] = true ;
450
495
groupIsCompressible [UCD_Types .MYANMAR_SCRIPT ] = true ;
451
496
groupIsCompressible [UCD_Types .KHMER_SCRIPT ] = true ;
497
+ groupIsCompressible [UCD_Types .CHEROKEE_SCRIPT ] = true ;
452
498
groupIsCompressible [UCD_Types .HANGUL_SCRIPT ] = true ;
453
499
groupIsCompressible [UCD_Types .HIRAGANA_SCRIPT ] = true ;
454
500
groupIsCompressible [UCD_Types .BOPOMOFO_SCRIPT ] = true ;
@@ -494,14 +540,13 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
494
540
495
541
PrimaryToFractional props = getOrCreateProps (primary );
496
542
int currentByteLength = props .getFractionalLength ();
497
- boolean scriptChange = false ;
498
543
499
544
int reorderCode = props .reorderCodeIfFirst ;
500
545
if (reorderCode >= 0 ) {
501
546
int firstFractional ;
502
547
if (props .startsGroup ) {
503
548
boolean compress = groupIsCompressible [reorderCode ];
504
- firstFractional = fractionalPrimary .startNewGroup (3 , compress );
549
+ firstFractional = fractionalPrimary .startNewGroup (compress );
505
550
int leadByte = Fractional .getLeadByte (firstFractional );
506
551
507
552
// Finish the previous reordering group.
@@ -529,12 +574,12 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
529
574
groupComment ));
530
575
531
576
if (reorderCode == ReorderCodes .DIGIT ) {
532
- numericFractionalPrimary = fractionalPrimary .next (1 , true );
577
+ numericFractionalPrimary = fractionalPrimary .next (1 );
533
578
++numPrimaries ;
534
579
}
535
580
} else {
536
581
// New script in current reordering group.
537
- firstFractional = fractionalPrimary .next ( 3 , true );
582
+ firstFractional = fractionalPrimary .startNewScript ( );
538
583
if (groupInfo .length () != 0 ) {
539
584
groupInfo .append (' ' );
540
585
}
@@ -550,10 +595,11 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
550
595
props .scriptFirstPrimary = firstFractional ;
551
596
firstFractionalPrimary [reorderCode ] = firstFractional ;
552
597
++numPrimaries ;
553
- scriptChange = true ;
554
598
}
555
599
556
- if (currentByteLength == 3 && (scriptChange || fractionalPrimary .lastByteLength <= 2 )) {
600
+ if (currentByteLength == 3 &&
601
+ (fractionalPrimary .firstPrimaryInScript ||
602
+ fractionalPrimary .lastByteLength <= 2 )) {
557
603
// We slightly optimize the assignment of primary weights:
558
604
// If a 3-byte primary is surrounded by one-or-two-byte primaries,
559
605
// or script boundaries,
@@ -582,7 +628,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
582
628
currentByteLength = 2 ;
583
629
}
584
630
}
585
- props .fractionalPrimary = fractionalPrimary .next (currentByteLength , scriptChange );
631
+ props .fractionalPrimary = fractionalPrimary .next (currentByteLength );
586
632
++numPrimaries ;
587
633
588
634
String newWeight = fractionalPrimary .toString ();
@@ -598,7 +644,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
598
644
}
599
645
600
646
// Create an entry for the first primary in the Hani script.
601
- int firstFractional = fractionalPrimary .startNewGroup (3 , false );
647
+ int firstFractional = fractionalPrimary .startNewGroup (false );
602
648
int leadByte = Fractional .getLeadByte (firstFractional );
603
649
604
650
// Finish the previous reordering group.
@@ -650,7 +696,7 @@ private static void appendTopByteInfo(StringBuilder topByteInfo, boolean compres
650
696
"reordering group {" + groupInfo +
651
697
"} marked for compression but uses more than one lead byte " +
652
698
Utility .hex (b , 2 ) + ".." +
653
- Utility .hex (limit , 2 ));
699
+ Utility .hex (limit - 1 , 2 ));
654
700
}
655
701
} else if (canCompress ) {
656
702
System .out .println (
@@ -916,7 +962,8 @@ private static boolean isThreeByteMajorScript(int script) {
916
962
// We cherry-pick the conjoining Jamo L/V/T for two-byte primaries.
917
963
script == UCD_Types .HANGUL_SCRIPT ||
918
964
script == UCD_Types .ETHIOPIC_SCRIPT ||
919
- script == UCD_Types .MYANMAR_SCRIPT ;
965
+ script == UCD_Types .MYANMAR_SCRIPT ||
966
+ script == UCD_Types .CHEROKEE_SCRIPT ;
920
967
}
921
968
922
969
private static boolean isTwoByteMinorScript (int script ) {
0 commit comments