@@ -1334,6 +1334,244 @@ pg_utf8_islegal(const unsigned char *source, int length)
1334
1334
return true;
1335
1335
}
1336
1336
1337
+ #ifndef FRONTEND
1338
+
1339
+ /*
1340
+ * Generic character increment function.
1341
+ *
1342
+ * Not knowing anything about the properties of the encoding in use, we just
1343
+ * keep incrementing the last byte until pg_verifymbstr() likes the result,
1344
+ * or we run out of values to try.
1345
+ *
1346
+ * Like all character-increment functions, we must restore the original input
1347
+ * string on failure.
1348
+ */
1349
static bool
pg_generic_charinc(unsigned char *charptr, int len)
{
	/* Only the final byte of the character is ever touched. */
	unsigned char *tail = charptr + len - 1;
	const unsigned char original = *tail;

	/*
	 * Step the final byte upward one value at a time.  The first candidate
	 * that pg_verifymbstr() accepts is left in place and reported as success.
	 */
	while (*tail != 0xFF)
	{
		++(*tail);
		if (pg_verifymbstr((const char *) charptr, len, true))
			return true;
	}

	/* Ran out of byte values: put the caller's byte back and report failure. */
	*tail = original;
	return false;
}
1367
+
1368
+ /*
1369
+ * UTF-8 character increment function.
1370
+ *
1371
+ * For a one-byte character less than 0x7F, we just increment the byte.
1372
+ *
1373
+ * For a multibyte character, every byte but the first must fall between 0x80
1374
+ * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
1375
+ * the last byte that's not already at its maximum value, and set any following
1376
+ * bytes back to 0x80. If we can't find a byte that's less than the maximum
1377
+ * allowable value, we simply fail. We also have some special-case logic to
1378
+ * skip regions used for surrogate pair handling, as those should not occur in
1379
+ * valid UTF-8.
1380
+ *
1381
+ * Like all character-increment functions, we must restore the original input
1382
+ * string on failure.
1383
+ */
1384
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char saved[4];		/* original bytes, for restore on failure */
	int			pos;

	/* reject lengths 5 and 6 for now (and anything that is not 1..4) */
	if (length < 1 || length > 4)
		return false;

	/*
	 * Walk the trailing bytes from last to second.  The first one that is
	 * still below its ceiling is bumped and we are done; a byte already at
	 * its ceiling is reset to 0x80 and the carry moves one byte to the left.
	 */
	for (pos = length - 1; pos > 0; pos--)
	{
		unsigned char ceiling = 0xBF;

		/*
		 * The second byte has a tighter ceiling after lead bytes 0xED and
		 * 0xF4.
		 */
		if (pos == 1)
		{
			if (charptr[0] == 0xED)
				ceiling = 0x9F;
			else if (charptr[0] == 0xF4)
				ceiling = 0x8F;
		}

		saved[pos] = charptr[pos];
		if (charptr[pos] < ceiling)
		{
			charptr[pos]++;
			return true;
		}
		charptr[pos] = 0x80;
	}

	/*
	 * The carry reached the lead byte.  These four lead values cannot be
	 * incremented, so undo every change made above and fail.
	 */
	saved[0] = charptr[0];
	switch (charptr[0])
	{
		case 0x7F:
		case 0xDF:
		case 0xEF:
		case 0xF4:
			/* Restore original string. */
			memcpy(charptr, saved, length);
			return false;
		default:
			break;
	}

	charptr[0]++;
	return true;
}
1453
+
1454
+ /*
1455
+ * EUC-JP character increment function.
1456
+ *
1457
+ * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1458
+ * representing JIS X 0201 characters with the second byte ranging between
1459
+ * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1460
+ * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1461
+ *
1462
+ * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1463
+ * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1464
+ * is incremented, carrying overflow to the second-to-last byte.
1465
+ *
1466
+ * If the sequence starts with any other value whose MSB is set, it must be
1467
+ * a two-byte sequence representing a JIS X 0208 character, with both bytes
1468
+ * ranging between 0xa1 and 0xfe. The last byte is incremented,
1469
+ * carrying overflow to the second-to-last byte.
1470
+ *
1471
+ * Otherwise the sequence is a single byte representing an ASCII
1472
+ * character. It is incremented up to 0x7f.
1473
+ *
1474
+ * Only the three EUC-JP byte sequences shown below - which have no character
1475
+ * allocated - cause this function to fail despite being valid: 0x7f,
1476
+ * 0xfe 0xfe, 0x8f 0xfe 0xfe.
1477
+ */
1478
+ static bool
1479
+ pg_eucjp_increment (unsigned char * charptr , int length )
1480
+ {
1481
+ unsigned char bak [3 ];
1482
+ unsigned char c1 , c2 ;
1483
+ signed int i ;
1484
+
1485
+ c1 = * charptr ;
1486
+
1487
+ switch (c1 )
1488
+ {
1489
+ case SS2 : /* JIS X 0201 */
1490
+ if (length != 2 )
1491
+ return false;
1492
+
1493
+ c2 = charptr [1 ];
1494
+
1495
+ if (c2 > 0xde )
1496
+ charptr [0 ] = charptr [1 ] = 0xa1 ;
1497
+ else if (c2 < 0xa1 )
1498
+ charptr [1 ] = 0xa1 ;
1499
+ else
1500
+ charptr [1 ]++ ;
1501
+
1502
+ break ;
1503
+
1504
+ case SS3 : /* JIS X 0212 */
1505
+ if (length != 3 )
1506
+ return false;
1507
+
1508
+ for (i = 2 ; i > 0 ; i -- )
1509
+ {
1510
+ bak [i ] = charptr [i ];
1511
+ c2 = charptr [i ];
1512
+ if (c2 < 0xa1 )
1513
+ {
1514
+ charptr [i ] = 0xa1 ;
1515
+ return true;
1516
+ }
1517
+ else if (c2 < 0xfe )
1518
+ {
1519
+ charptr [i ]++ ;
1520
+ break ;
1521
+ }
1522
+ charptr [i ] = 0xa1 ;
1523
+ }
1524
+
1525
+ if (i == 0 ) /* Out of 3-byte code region */
1526
+ {
1527
+ charptr [1 ] = bak [1 ];
1528
+ charptr [2 ] = bak [2 ];
1529
+ return false;
1530
+ }
1531
+ break ;
1532
+
1533
+ default :
1534
+ if (IS_HIGHBIT_SET (c1 )) /* JIS X 0208? */
1535
+ {
1536
+ if (length != 2 )
1537
+ return false;
1538
+
1539
+ for (i = 1 ; i >= 0 ; i -- ) /* i must be signed */
1540
+ {
1541
+ bak [i ] = charptr [i ];
1542
+ c2 = charptr [i ];
1543
+ if (c2 < 0xa1 )
1544
+ {
1545
+ charptr [i ] = 0xa1 ;
1546
+ return true;
1547
+ }
1548
+ else if (c2 < 0xfe )
1549
+ {
1550
+ charptr [i ]++ ;
1551
+ break ;
1552
+ }
1553
+ charptr [i ] = 0xa1 ;
1554
+ }
1555
+
1556
+ if (i < 0 ) /* Out of 2 byte code region */
1557
+ {
1558
+ charptr [0 ] = bak [0 ];
1559
+ charptr [1 ] = bak [1 ];
1560
+ return false;
1561
+ }
1562
+ }
1563
+ else
1564
+ { /* ASCII, single byte */
1565
+ if (c1 > 0x7e )
1566
+ return false;
1567
+ (* charptr )++ ;
1568
+ }
1569
+ }
1570
+
1571
+ return true;
1572
+ }
1573
+ #endif
1574
+
1337
1575
/*
1338
1576
*-------------------------------------------------------------------
1339
1577
* encoding info table
@@ -1458,6 +1696,25 @@ pg_database_encoding_max_length(void)
1458
1696
return pg_wchar_table [GetDatabaseEncoding ()].maxmblen ;
1459
1697
}
1460
1698
1699
+ /*
1700
+ * give the character incrementer for the encoding for the current database
1701
+ */
1702
+ mbcharacter_incrementer
1703
+ pg_database_encoding_character_incrementer (void )
1704
+ {
1705
+ switch (GetDatabaseEncoding ())
1706
+ {
1707
+ case PG_UTF8 :
1708
+ return pg_utf8_increment ;
1709
+
1710
+ case PG_EUC_JP :
1711
+ return pg_eucjp_increment ;
1712
+
1713
+ default :
1714
+ return pg_generic_charinc ;
1715
+ }
1716
+ }
1717
+
1461
1718
/*
1462
1719
* Verify mbstr to make sure that it is validly encoded in the current
1463
1720
* database encoding. Otherwise same as pg_verify_mbstr().
0 commit comments