@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
6380
6380
6381
6381
PG_RETURN_BOOL (result );
6382
6382
}
6383
+
6384
/*
 * isxdigits_n
 *		Report whether each of the first n bytes starting at instr is a
 *		hexadecimal digit as judged by isxdigit().
 *
 * Returns true when n is zero, because no byte is examined in that case.
 * The caller must guarantee that at least n bytes are readable at instr.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *cur = instr;
	const char *stop = instr + n;

	while (cur < stop)
	{
		/* go through unsigned char so high-bit bytes are valid for <ctype.h> */
		if (isxdigit((unsigned char) *cur) == 0)
			return false;
		cur++;
	}

	return true;
}
6396
+
6397
+ static unsigned int
6398
+ hexval (unsigned char c )
6399
+ {
6400
+ if (c >= '0' && c <= '9' )
6401
+ return c - '0' ;
6402
+ if (c >= 'a' && c <= 'f' )
6403
+ return c - 'a' + 0xA ;
6404
+ if (c >= 'A' && c <= 'F' )
6405
+ return c - 'A' + 0xA ;
6406
+ elog (ERROR , "invalid hexadecimal digit" );
6407
+ return 0 ; /* not reached */
6408
+ }
6409
+
6410
/*
 * hexval_n
 *		Convert the first n hexadecimal digits at instr into a number,
 *		treating the first digit as the most significant one.
 *
 * Each digit is decoded by hexval(), so a non-hex character raises the
 * error that hexval() reports.  With n == 0 the result is 0.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *digit = instr;
	size_t		remaining = n;

	while (remaining > 0)
	{
		/* after the decrement, "remaining" counts the digits still to the right */
		remaining--;
		value += hexval(*digit++) << (4 * remaining);
	}

	return value;
}
6423
+
6424
+ /*
6425
+ * Replaces Unicode escape sequences by Unicode characters
6426
+ */
6427
+ Datum
6428
+ unistr (PG_FUNCTION_ARGS )
6429
+ {
6430
+ text * input_text = PG_GETARG_TEXT_PP (0 );
6431
+ char * instr ;
6432
+ int len ;
6433
+ StringInfoData str ;
6434
+ text * result ;
6435
+ pg_wchar pair_first = 0 ;
6436
+ char cbuf [MAX_UNICODE_EQUIVALENT_STRING + 1 ];
6437
+
6438
+ instr = VARDATA_ANY (input_text );
6439
+ len = VARSIZE_ANY_EXHDR (input_text );
6440
+
6441
+ initStringInfo (& str );
6442
+
6443
+ while (len > 0 )
6444
+ {
6445
+ if (instr [0 ] == '\\' )
6446
+ {
6447
+ if (len >= 2 &&
6448
+ instr [1 ] == '\\' )
6449
+ {
6450
+ if (pair_first )
6451
+ goto invalid_pair ;
6452
+ appendStringInfoChar (& str , '\\' );
6453
+ instr += 2 ;
6454
+ len -= 2 ;
6455
+ }
6456
+ else if ((len >= 5 && isxdigits_n (instr + 1 , 4 )) ||
6457
+ (len >= 6 && instr [1 ] == 'u' && isxdigits_n (instr + 2 , 4 )))
6458
+ {
6459
+ pg_wchar unicode ;
6460
+ int offset = instr [1 ] == 'u' ? 2 : 1 ;
6461
+
6462
+ unicode = hexval_n (instr + offset , 4 );
6463
+
6464
+ if (!is_valid_unicode_codepoint (unicode ))
6465
+ ereport (ERROR ,
6466
+ errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
6467
+ errmsg ("invalid Unicode code point: %04X" , unicode ));
6468
+
6469
+ if (pair_first )
6470
+ {
6471
+ if (is_utf16_surrogate_second (unicode ))
6472
+ {
6473
+ unicode = surrogate_pair_to_codepoint (pair_first , unicode );
6474
+ pair_first = 0 ;
6475
+ }
6476
+ else
6477
+ goto invalid_pair ;
6478
+ }
6479
+ else if (is_utf16_surrogate_second (unicode ))
6480
+ goto invalid_pair ;
6481
+
6482
+ if (is_utf16_surrogate_first (unicode ))
6483
+ pair_first = unicode ;
6484
+ else
6485
+ {
6486
+ pg_unicode_to_server (unicode , (unsigned char * ) cbuf );
6487
+ appendStringInfoString (& str , cbuf );
6488
+ }
6489
+
6490
+ instr += 4 + offset ;
6491
+ len -= 4 + offset ;
6492
+ }
6493
+ else if (len >= 8 && instr [1 ] == '+' && isxdigits_n (instr + 2 , 6 ))
6494
+ {
6495
+ pg_wchar unicode ;
6496
+
6497
+ unicode = hexval_n (instr + 2 , 6 );
6498
+
6499
+ if (!is_valid_unicode_codepoint (unicode ))
6500
+ ereport (ERROR ,
6501
+ errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
6502
+ errmsg ("invalid Unicode code point: %04X" , unicode ));
6503
+
6504
+ if (pair_first )
6505
+ {
6506
+ if (is_utf16_surrogate_second (unicode ))
6507
+ {
6508
+ unicode = surrogate_pair_to_codepoint (pair_first , unicode );
6509
+ pair_first = 0 ;
6510
+ }
6511
+ else
6512
+ goto invalid_pair ;
6513
+ }
6514
+ else if (is_utf16_surrogate_second (unicode ))
6515
+ goto invalid_pair ;
6516
+
6517
+ if (is_utf16_surrogate_first (unicode ))
6518
+ pair_first = unicode ;
6519
+ else
6520
+ {
6521
+ pg_unicode_to_server (unicode , (unsigned char * ) cbuf );
6522
+ appendStringInfoString (& str , cbuf );
6523
+ }
6524
+
6525
+ instr += 8 ;
6526
+ len -= 8 ;
6527
+ }
6528
+ else if (len >= 10 && instr [1 ] == 'U' && isxdigits_n (instr + 2 , 8 ))
6529
+ {
6530
+ pg_wchar unicode ;
6531
+
6532
+ unicode = hexval_n (instr + 2 , 8 );
6533
+
6534
+ if (!is_valid_unicode_codepoint (unicode ))
6535
+ ereport (ERROR ,
6536
+ errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
6537
+ errmsg ("invalid Unicode code point: %04X" , unicode ));
6538
+
6539
+ if (pair_first )
6540
+ {
6541
+ if (is_utf16_surrogate_second (unicode ))
6542
+ {
6543
+ unicode = surrogate_pair_to_codepoint (pair_first , unicode );
6544
+ pair_first = 0 ;
6545
+ }
6546
+ else
6547
+ goto invalid_pair ;
6548
+ }
6549
+ else if (is_utf16_surrogate_second (unicode ))
6550
+ goto invalid_pair ;
6551
+
6552
+ if (is_utf16_surrogate_first (unicode ))
6553
+ pair_first = unicode ;
6554
+ else
6555
+ {
6556
+ pg_unicode_to_server (unicode , (unsigned char * ) cbuf );
6557
+ appendStringInfoString (& str , cbuf );
6558
+ }
6559
+
6560
+ instr += 10 ;
6561
+ len -= 10 ;
6562
+ }
6563
+ else
6564
+ ereport (ERROR ,
6565
+ (errcode (ERRCODE_SYNTAX_ERROR ),
6566
+ errmsg ("invalid Unicode escape" ),
6567
+ errhint ("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX." )));
6568
+ }
6569
+ else
6570
+ {
6571
+ if (pair_first )
6572
+ goto invalid_pair ;
6573
+
6574
+ appendStringInfoChar (& str , * instr ++ );
6575
+ len -- ;
6576
+ }
6577
+ }
6578
+
6579
+ /* unfinished surrogate pair? */
6580
+ if (pair_first )
6581
+ goto invalid_pair ;
6582
+
6583
+ result = cstring_to_text_with_len (str .data , str .len );
6584
+ pfree (str .data );
6585
+
6586
+ PG_RETURN_TEXT_P (result );
6587
+
6588
+ invalid_pair :
6589
+ ereport (ERROR ,
6590
+ (errcode (ERRCODE_SYNTAX_ERROR ),
6591
+ errmsg ("invalid Unicode surrogate pair" )));
6592
+ }
0 commit comments