Skip to content

Commit 18da8d2

Browse files
committed
split the Khmer group at Cherokee; ensure larger gap before the first & after the last primary of a major script
git-svn-id: https://unicode.org/repos/unicodetools/branches/markus@484 13e8329f-0b23-4da4-9fe8-d0f6fe080806
1 parent 06cc8f4 · commit 18da8d2

File tree

1 file changed

+98
-51
lines changed

1 file changed

+98
-51
lines changed

uca63/org/unicode/text/UCA/PrimariesToFractional.java

Lines changed: 98 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,17 @@
2323
* @since 2013-jan-02 (mostly pulled out of {@link FractionalUCA})
2424
*/
2525
public final class PrimariesToFractional {
26+
/**
27+
* Scripts that start reordering groups, and normally get two-byte primary weights.
28+
*/
2629
private static final BitSet MAJOR_SCRIPTS = new BitSet();
2730
static {
2831
for (byte i : new Byte[]{
2932
UCD_Types.ARABIC_SCRIPT,
3033
UCD_Types.ARMENIAN_SCRIPT,
3134
UCD_Types.BENGALI_SCRIPT,
3235
UCD_Types.BOPOMOFO_SCRIPT,
36+
UCD_Types.CHEROKEE_SCRIPT,
3337
UCD_Types.CYRILLIC_SCRIPT,
3438
UCD_Types.DEVANAGARI_SCRIPT,
3539
UCD_Types.ETHIOPIC_SCRIPT,
@@ -178,7 +182,7 @@ public int getFractionalPrimary() {
178182
/**
179183
* Computes valid FractionalUCA primary weights of desired byte lengths.
180184
* Always starts with the first primary weight after 02.
181-
* {@link PrimaryWeight#next(int, boolean)} increments
185+
* {@link PrimaryWeight#next(int)} increments
182186
* one 1/2/3-byte weight to another 1/2/3-byte weight.
183187
*/
184188
private static class PrimaryWeight {
@@ -199,13 +203,18 @@ private static class PrimaryWeight {
199203
private static final int MAX2_COMPRESSED = 0xfe;
200204

201205
/**
202-
* Increment byte2 a little more at script boundaries
203-
* and around single-byte primaries,
206+
* Increment byte2 a little more around single-byte primaries,
207+
* for tailoring of at least 4 two-byte primaries or more than 1000 three-byte primaries.
208+
*/
209+
private static final int GAP2_FOR_SINGLE = 4;
210+
/**
211+
* Increment byte2 a little more at major-script boundaries,
204212
* for tailoring of at least 4 two-byte primaries or more than 1000 three-byte primaries.
205213
*/
206-
private static final int GAP2_PLUS = 4;
214+
private static final int GAP2_FOR_MAJOR_SCRIPT = 4;
207215

208216
private static final int GAP3 = 7;
217+
private static final int GAP3_FOR_MINOR_SCRIPT = 40;
209218

210219
private int minByte2 = MIN2_UNCOMPRESSED;
211220
private int maxByte2 = MAX2_UNCOMPRESSED;
@@ -218,21 +227,31 @@ private static class PrimaryWeight {
218227
private int byte3;
219228
private int lastByteLength = 1;
220229
private boolean compressibleLeadByte;
230+
/**
231+
* The first script in each group is a "major" script and gets a somewhat larger gap
232+
* before its first primary and after its last primary.
233+
*/
234+
private boolean firstScriptInGroup;
235+
/**
236+
* Leave a somewhat larger gap between the special script-first primary
237+
* and the first real letter primary.
238+
*/
239+
private boolean firstPrimaryInScript;
221240

222241
public int getIntValue() {
223242
return (byte1 << 16) + (byte2 << 8) + byte3;
224243
}
225244

226-
public int startNewGroup(int newByteLength, boolean compress) {
245+
public int startNewGroup(boolean compress) {
227246
int oByte1 = byte1;
228247
int oByte2 = byte2;
229248

230249
int inc1;
231250
if (lastByteLength == 1) {
232251
// Single-byte gap of 1 from a single-byte weight to the new reordering group.
233252
inc1 = 2;
234-
} else if ((byte2 + GAP2_PLUS) <= maxByte2) {
235-
// End-of-script two-byte-weight gap.
253+
} else if ((byte2 + GAP2_FOR_SINGLE) <= maxByte2) {
254+
// End-of-group two-byte-weight gap.
236255
inc1 = 1;
237256
} else {
238257
// The two-byte-weight gap would be too small.
@@ -241,33 +260,55 @@ public int startNewGroup(int newByteLength, boolean compress) {
241260
addTo1(inc1);
242261

243262
int newMinByte2 = compress ? MIN2_COMPRESSED : MIN2_UNCOMPRESSED;
244-
switch (newByteLength) {
245-
case 1:
246-
byte2 = byte3 = 0;
247-
break;
248-
case 2:
249-
byte2 = newMinByte2;
250-
byte3 = 0;
251-
break;
252-
case 3:
253-
byte2 = newMinByte2;
254-
byte3 = MIN_BYTE;
255-
break;
256-
}
263+
byte2 = newMinByte2;
264+
byte3 = MIN_BYTE;
257265

258-
check(oByte1, oByte2, newByteLength, true);
266+
check(oByte1, oByte2, 3, true);
259267

260268
compressibleLeadByte = compress;
261269
minByte2 = newMinByte2;
262270
maxByte2 = compressibleLeadByte ? MAX2_COMPRESSED : MAX2_UNCOMPRESSED;;
263-
lastByteLength = newByteLength;
271+
lastByteLength = 3;
272+
firstScriptInGroup = firstPrimaryInScript = true;
273+
return getIntValue();
274+
}
275+
276+
public int startNewScript() {
277+
int oByte1 = byte1;
278+
int oByte2 = byte2;
279+
280+
if (lastByteLength == 1) {
281+
// Larger two-byte gap after a single.
282+
addTo1(1);
283+
byte2 = minByte2 + GAP2_FOR_SINGLE;
284+
byte3 = MIN_BYTE;
285+
} else if (firstScriptInGroup) {
286+
// End-of-major-script two-byte-weight gap.
287+
addTo2(GAP2_FOR_MAJOR_SCRIPT + 1);
288+
byte3 = MIN_BYTE;
289+
} else if (lastByteLength == 2) {
290+
// At least a two-byte gap after a double.
291+
addTo2(2);
292+
byte3 = MIN_BYTE;
293+
} else /* lastByteLength == 3 */ {
294+
addTo3(GAP3_FOR_MINOR_SCRIPT + 1);
295+
}
296+
297+
check(oByte1, oByte2, 3, false);
298+
299+
lastByteLength = 3;
300+
firstScriptInGroup = false;
301+
firstPrimaryInScript = true;
264302
return getIntValue();
265303
}
266304

267-
public int next(int newByteLength, boolean scriptChange) {
305+
public int next(int newByteLength) {
268306
int oByte1 = byte1;
269307
int oByte2 = byte2;
270308

309+
// Script-first primaries are three-byters.
310+
assert !firstPrimaryInScript || lastByteLength == 3;
311+
271312
switch (lastByteLength) {
272313
case 1:
273314
switch (newByteLength) {
@@ -278,12 +319,12 @@ public int next(int newByteLength, boolean scriptChange) {
278319
case 2:
279320
// Larger two-byte gap after a single.
280321
addTo1(1);
281-
byte2 = minByte2 + GAP2_PLUS;
322+
byte2 = minByte2 + GAP2_FOR_SINGLE;
282323
break;
283324
case 3:
284325
// Larger two-byte gap after a single.
285326
addTo1(1);
286-
byte2 = minByte2 + GAP2_PLUS;
327+
byte2 = minByte2 + GAP2_FOR_SINGLE;
287328
byte3 = MIN_BYTE;
288329
break;
289330
}
@@ -292,16 +333,16 @@ public int next(int newByteLength, boolean scriptChange) {
292333
switch (newByteLength) {
293334
case 1:
294335
// At least a larger two-byte gap before a single.
295-
addTo1((byte2 + GAP2_PLUS) <= maxByte2 ? 1 : 2);
336+
addTo1((byte2 + GAP2_FOR_SINGLE) <= maxByte2 ? 1 : 2);
296337
byte2 = 0;
297338
break;
298339
case 2:
299340
// Normal two-byte gap.
300-
addTo2(scriptChange ? GAP2_PLUS + 1: 2);
341+
addTo2(2);
301342
break;
302343
case 3:
303344
// At least a two-byte gap after a double.
304-
addTo2(scriptChange ? GAP2_PLUS + 1: 2);
345+
addTo2(2);
305346
byte3 = MIN_BYTE;
306347
break;
307348
}
@@ -310,25 +351,28 @@ public int next(int newByteLength, boolean scriptChange) {
310351
switch (newByteLength) {
311352
case 1:
312353
// At least a larger two-byte gap before a single.
313-
addTo1((byte2 + GAP2_PLUS) <= maxByte2 ? 1 : 2);
354+
addTo1((byte2 + GAP2_FOR_SINGLE) <= maxByte2 ? 1 : 2);
314355
byte2 = byte3 = 0;
315356
break;
316357
case 2:
317-
// At least a two-byte gap before a double.
318-
addTo2(scriptChange ? GAP2_PLUS + 1: 2);
358+
if (firstPrimaryInScript && firstScriptInGroup) {
359+
// Larger two-byte gap before the first letter of a major script.
360+
addTo2(GAP2_FOR_MAJOR_SCRIPT + 1);
361+
} else {
362+
// At least a two-byte gap before a double.
363+
addTo2(2);
364+
}
319365
byte3 = 0;
320366
break;
321367
case 3:
322-
if (scriptChange) {
323-
// TODO: At least a small two-byte gap between minor scripts.
324-
// Issue: At least one of the miscellaneous-scripts reordering groups
325-
// overflows with this.
326-
// addTo2(2);
327-
// byte3 = MIN_BYTE;
328-
329-
// TODO: The following is a smaller gap in the meantime.
330-
// At least a large three-byte gap between minor scripts.
331-
addTo3(40 + 1);
368+
if (firstPrimaryInScript) {
369+
if (firstScriptInGroup) {
370+
// Larger two-byte gap before the first letter of a major script.
371+
addTo2(GAP2_FOR_MAJOR_SCRIPT);
372+
} else {
373+
// Larger three-byte gap before the first letter of a minor script.
374+
addTo3(GAP3_FOR_MINOR_SCRIPT + 1);
375+
}
332376
} else {
333377
// Normal three-byte gap.
334378
addTo3(GAP3 + 1);
@@ -341,6 +385,7 @@ public int next(int newByteLength, boolean scriptChange) {
341385
check(oByte1, oByte2, newByteLength, false);
342386

343387
lastByteLength = newByteLength;
388+
firstPrimaryInScript = false;
344389
return getIntValue();
345390
}
346391

@@ -449,6 +494,7 @@ public PrimariesToFractional(UCA uca) {
449494
groupIsCompressible[UCD_Types.TIBETAN_SCRIPT] = true;
450495
groupIsCompressible[UCD_Types.MYANMAR_SCRIPT] = true;
451496
groupIsCompressible[UCD_Types.KHMER_SCRIPT] = true;
497+
groupIsCompressible[UCD_Types.CHEROKEE_SCRIPT] = true;
452498
groupIsCompressible[UCD_Types.HANGUL_SCRIPT] = true;
453499
groupIsCompressible[UCD_Types.HIRAGANA_SCRIPT] = true;
454500
groupIsCompressible[UCD_Types.BOPOMOFO_SCRIPT] = true;
@@ -494,14 +540,13 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
494540

495541
PrimaryToFractional props = getOrCreateProps(primary);
496542
int currentByteLength = props.getFractionalLength();
497-
boolean scriptChange = false;
498543

499544
int reorderCode = props.reorderCodeIfFirst;
500545
if (reorderCode >= 0) {
501546
int firstFractional;
502547
if (props.startsGroup) {
503548
boolean compress = groupIsCompressible[reorderCode];
504-
firstFractional = fractionalPrimary.startNewGroup(3, compress);
549+
firstFractional = fractionalPrimary.startNewGroup(compress);
505550
int leadByte = Fractional.getLeadByte(firstFractional);
506551

507552
// Finish the previous reordering group.
@@ -529,12 +574,12 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
529574
groupComment));
530575

531576
if (reorderCode == ReorderCodes.DIGIT) {
532-
numericFractionalPrimary = fractionalPrimary.next(1, true);
577+
numericFractionalPrimary = fractionalPrimary.next(1);
533578
++numPrimaries;
534579
}
535580
} else {
536581
// New script in current reordering group.
537-
firstFractional = fractionalPrimary.next(3, true);
582+
firstFractional = fractionalPrimary.startNewScript();
538583
if (groupInfo.length() != 0) {
539584
groupInfo.append(' ');
540585
}
@@ -550,10 +595,11 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
550595
props.scriptFirstPrimary = firstFractional;
551596
firstFractionalPrimary[reorderCode] = firstFractional;
552597
++numPrimaries;
553-
scriptChange = true;
554598
}
555599

556-
if (currentByteLength == 3 && (scriptChange || fractionalPrimary.lastByteLength <= 2)) {
600+
if (currentByteLength == 3 &&
601+
(fractionalPrimary.firstPrimaryInScript ||
602+
fractionalPrimary.lastByteLength <= 2)) {
557603
// We slightly optimize the assignment of primary weights:
558604
// If a 3-byte primary is surrounded by one-or-two-byte primaries,
559605
// or script boundaries,
@@ -582,7 +628,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
582628
currentByteLength = 2;
583629
}
584630
}
585-
props.fractionalPrimary = fractionalPrimary.next(currentByteLength, scriptChange);
631+
props.fractionalPrimary = fractionalPrimary.next(currentByteLength);
586632
++numPrimaries;
587633

588634
String newWeight = fractionalPrimary.toString();
@@ -598,7 +644,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo
598644
}
599645

600646
// Create an entry for the first primary in the Hani script.
601-
int firstFractional = fractionalPrimary.startNewGroup(3, false);
647+
int firstFractional = fractionalPrimary.startNewGroup(false);
602648
int leadByte = Fractional.getLeadByte(firstFractional);
603649

604650
// Finish the previous reordering group.
@@ -650,7 +696,7 @@ private static void appendTopByteInfo(StringBuilder topByteInfo, boolean compres
650696
"reordering group {" + groupInfo +
651697
"} marked for compression but uses more than one lead byte " +
652698
Utility.hex(b, 2) + ".." +
653-
Utility.hex(limit, 2));
699+
Utility.hex(limit - 1, 2));
654700
}
655701
} else if (canCompress) {
656702
System.out.println(
@@ -916,7 +962,8 @@ private static boolean isThreeByteMajorScript(int script) {
916962
// We cherry-pick the conjoining Jamo L/V/T for two-byte primaries.
917963
script == UCD_Types.HANGUL_SCRIPT ||
918964
script == UCD_Types.ETHIOPIC_SCRIPT ||
919-
script == UCD_Types.MYANMAR_SCRIPT;
965+
script == UCD_Types.MYANMAR_SCRIPT ||
966+
script == UCD_Types.CHEROKEE_SCRIPT;
920967
}
921968

922969
private static boolean isTwoByteMinorScript(int script) {

0 commit comments

Comments
 (0)