Skip to content

Commit f37fec8

Browse files
committed
Add unistr function
This allows decoding a string with Unicode escape sequences. It is similar to Unicode escape strings, but offers some more flexibility. Author: Pavel Stehule <pavel.stehule@gmail.com> Reviewed-by: Asif Rehman <asifr.rehman@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
1 parent ebedd0c commit f37fec8

File tree

6 files changed

+310
-1
lines changed

6 files changed

+310
-1
lines changed

doc/src/sgml/func.sgml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
35513551
</para></entry>
35523552
</row>
35533553

3554+
<row>
3555+
<entry role="func_table_entry"><para role="func_signature">
3556+
<indexterm>
3557+
<primary>unistr</primary>
3558+
</indexterm>
3559+
<function>unistr</function> ( <type>text</type> )
3560+
<returnvalue>text</returnvalue>
3561+
</para>
3562+
<para>
3563+
Evaluate escaped Unicode characters in the argument. Unicode characters
3564+
can be specified as
3565+
<literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
3566+
digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
3567+
hexadecimal digits),
3568+
<literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
3569+
digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
3570+
(8 hexadecimal digits). To specify a backslash, write two
3571+
backslashes. All other characters are taken literally.
3572+
</para>
3573+
3574+
<para>
3575+
If the server encoding is not UTF-8, the Unicode code point identified
3576+
by one of these escape sequences is converted to the actual server
3577+
encoding; an error is reported if that's not possible.
3578+
</para>
3579+
3580+
<para>
3581+
This function provides a (non-standard) alternative to string
3582+
constants with Unicode escapes (see <xref
3583+
linkend="sql-syntax-strings-uescape"/>).
3584+
</para>
3585+
3586+
<para>
3587+
<literal>unistr('\0441\043B\043E\043D')</literal>
3588+
<returnvalue>слон</returnvalue>
3589+
</para>
3590+
<para>
3591+
<literal>unistr('d\0061t\+000061')</literal>
3592+
<returnvalue>data</returnvalue>
3593+
</para>
3594+
<para>
3595+
<literal>unistr('d\u0061t\U00000061')</literal>
3596+
<returnvalue>data</returnvalue>
3597+
</para></entry>
3598+
</row>
3599+
35543600
</tbody>
35553601
</tgroup>
35563602
</table>

src/backend/utils/adt/varlena.c

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
63806380

63816381
PG_RETURN_BOOL(result);
63826382
}
6383+
6384+
/*
 * isxdigits_n
 *		Report whether each of the first n bytes at instr is a hexadecimal
 *		digit.
 *
 * The caller must guarantee that at least n bytes are readable at instr.
 * Returns true when n is zero, since there is nothing to reject.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		/* cast first: isxdigit() must not be handed a negative char value */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
6396+
6397+
static unsigned int
6398+
hexval(unsigned char c)
6399+
{
6400+
if (c >= '0' && c <= '9')
6401+
return c - '0';
6402+
if (c >= 'a' && c <= 'f')
6403+
return c - 'a' + 0xA;
6404+
if (c >= 'A' && c <= 'F')
6405+
return c - 'A' + 0xA;
6406+
elog(ERROR, "invalid hexadecimal digit");
6407+
return 0; /* not reached */
6408+
}
6409+
6410+
/*
 * hexval_n
 *		Convert the first n hexadecimal digits at instr into a number,
 *		most significant digit first.
 *
 * Each digit is decoded with hexval(), so a non-hex character raises the
 * error that hexval() reports.  The callers visible in this file pass n of
 * 4, 6 or 8, all of which fit in an unsigned int.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *stop = instr + n;

	/* shift in one nibble per digit, working left to right */
	for (; instr < stop; instr++)
		value = (value << 4) + hexval(*instr);

	return value;
}
6423+
6424+
/*
6425+
* Replaces Unicode escape sequences by Unicode characters
6426+
*/
6427+
Datum
6428+
unistr(PG_FUNCTION_ARGS)
6429+
{
6430+
text *input_text = PG_GETARG_TEXT_PP(0);
6431+
char *instr;
6432+
int len;
6433+
StringInfoData str;
6434+
text *result;
6435+
pg_wchar pair_first = 0;
6436+
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6437+
6438+
instr = VARDATA_ANY(input_text);
6439+
len = VARSIZE_ANY_EXHDR(input_text);
6440+
6441+
initStringInfo(&str);
6442+
6443+
while (len > 0)
6444+
{
6445+
if (instr[0] == '\\')
6446+
{
6447+
if (len >= 2 &&
6448+
instr[1] == '\\')
6449+
{
6450+
if (pair_first)
6451+
goto invalid_pair;
6452+
appendStringInfoChar(&str, '\\');
6453+
instr += 2;
6454+
len -= 2;
6455+
}
6456+
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6457+
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6458+
{
6459+
pg_wchar unicode;
6460+
int offset = instr[1] == 'u' ? 2 : 1;
6461+
6462+
unicode = hexval_n(instr + offset, 4);
6463+
6464+
if (!is_valid_unicode_codepoint(unicode))
6465+
ereport(ERROR,
6466+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6467+
errmsg("invalid Unicode code point: %04X", unicode));
6468+
6469+
if (pair_first)
6470+
{
6471+
if (is_utf16_surrogate_second(unicode))
6472+
{
6473+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6474+
pair_first = 0;
6475+
}
6476+
else
6477+
goto invalid_pair;
6478+
}
6479+
else if (is_utf16_surrogate_second(unicode))
6480+
goto invalid_pair;
6481+
6482+
if (is_utf16_surrogate_first(unicode))
6483+
pair_first = unicode;
6484+
else
6485+
{
6486+
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6487+
appendStringInfoString(&str, cbuf);
6488+
}
6489+
6490+
instr += 4 + offset;
6491+
len -= 4 + offset;
6492+
}
6493+
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6494+
{
6495+
pg_wchar unicode;
6496+
6497+
unicode = hexval_n(instr + 2, 6);
6498+
6499+
if (!is_valid_unicode_codepoint(unicode))
6500+
ereport(ERROR,
6501+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6502+
errmsg("invalid Unicode code point: %04X", unicode));
6503+
6504+
if (pair_first)
6505+
{
6506+
if (is_utf16_surrogate_second(unicode))
6507+
{
6508+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6509+
pair_first = 0;
6510+
}
6511+
else
6512+
goto invalid_pair;
6513+
}
6514+
else if (is_utf16_surrogate_second(unicode))
6515+
goto invalid_pair;
6516+
6517+
if (is_utf16_surrogate_first(unicode))
6518+
pair_first = unicode;
6519+
else
6520+
{
6521+
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6522+
appendStringInfoString(&str, cbuf);
6523+
}
6524+
6525+
instr += 8;
6526+
len -= 8;
6527+
}
6528+
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6529+
{
6530+
pg_wchar unicode;
6531+
6532+
unicode = hexval_n(instr + 2, 8);
6533+
6534+
if (!is_valid_unicode_codepoint(unicode))
6535+
ereport(ERROR,
6536+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6537+
errmsg("invalid Unicode code point: %04X", unicode));
6538+
6539+
if (pair_first)
6540+
{
6541+
if (is_utf16_surrogate_second(unicode))
6542+
{
6543+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6544+
pair_first = 0;
6545+
}
6546+
else
6547+
goto invalid_pair;
6548+
}
6549+
else if (is_utf16_surrogate_second(unicode))
6550+
goto invalid_pair;
6551+
6552+
if (is_utf16_surrogate_first(unicode))
6553+
pair_first = unicode;
6554+
else
6555+
{
6556+
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6557+
appendStringInfoString(&str, cbuf);
6558+
}
6559+
6560+
instr += 10;
6561+
len -= 10;
6562+
}
6563+
else
6564+
ereport(ERROR,
6565+
(errcode(ERRCODE_SYNTAX_ERROR),
6566+
errmsg("invalid Unicode escape"),
6567+
errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6568+
}
6569+
else
6570+
{
6571+
if (pair_first)
6572+
goto invalid_pair;
6573+
6574+
appendStringInfoChar(&str, *instr++);
6575+
len--;
6576+
}
6577+
}
6578+
6579+
/* unfinished surrogate pair? */
6580+
if (pair_first)
6581+
goto invalid_pair;
6582+
6583+
result = cstring_to_text_with_len(str.data, str.len);
6584+
pfree(str.data);
6585+
6586+
PG_RETURN_TEXT_P(result);
6587+
6588+
invalid_pair:
6589+
ereport(ERROR,
6590+
(errcode(ERRCODE_SYNTAX_ERROR),
6591+
errmsg("invalid Unicode surrogate pair")));
6592+
}

src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 202103266
56+
#define CATALOG_VERSION_NO 202103291
5757

5858
#endif

src/include/catalog/pg_proc.dat

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11527,6 +11527,10 @@
1152711527
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
1152811528
prosrc => 'unicode_is_normalized' },
1152911529

11530+
{ oid => '9822', descr => 'unescape Unicode characters',
11531+
proname => 'unistr', prorettype => 'text', proargtypes => 'text',
11532+
prosrc => 'unistr' },
11533+
1153011534
{ oid => '4596', descr => 'I/O',
1153111535
proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
1153211536
proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },

src/test/regress/expected/strings.out

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
22342234
15
22352235
(1 row)
22362236

2237+
SELECT unistr('\0064at\+0000610');
2238+
unistr
2239+
--------
2240+
data0
2241+
(1 row)
2242+
2243+
SELECT unistr('d\u0061t\U000000610');
2244+
unistr
2245+
--------
2246+
data0
2247+
(1 row)
2248+
2249+
SELECT unistr('a\\b');
2250+
unistr
2251+
--------
2252+
a\b
2253+
(1 row)
2254+
2255+
-- errors:
2256+
SELECT unistr('wrong: \db99');
2257+
ERROR: invalid Unicode surrogate pair
2258+
SELECT unistr('wrong: \db99\0061');
2259+
ERROR: invalid Unicode surrogate pair
2260+
SELECT unistr('wrong: \+00db99\+000061');
2261+
ERROR: invalid Unicode surrogate pair
2262+
SELECT unistr('wrong: \+2FFFFF');
2263+
ERROR: invalid Unicode code point: 2FFFFF
2264+
SELECT unistr('wrong: \udb99\u0061');
2265+
ERROR: invalid Unicode surrogate pair
2266+
SELECT unistr('wrong: \U0000db99\U00000061');
2267+
ERROR: invalid Unicode surrogate pair
2268+
SELECT unistr('wrong: \U002FFFFF');
2269+
ERROR: invalid Unicode code point: 2FFFFF
2270+
SELECT unistr('wrong: \xyz');
2271+
ERROR: invalid Unicode escape
2272+
HINT: Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.

src/test/regress/sql/strings.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
746746
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
747747

748748
SELECT bit_count('\x1234567890'::bytea);
749+
750+
SELECT unistr('\0064at\+0000610');
751+
SELECT unistr('d\u0061t\U000000610');
752+
SELECT unistr('a\\b');
753+
-- errors:
754+
SELECT unistr('wrong: \db99');
755+
SELECT unistr('wrong: \db99\0061');
756+
SELECT unistr('wrong: \+00db99\+000061');
757+
SELECT unistr('wrong: \+2FFFFF');
758+
SELECT unistr('wrong: \udb99\u0061');
759+
SELECT unistr('wrong: \U0000db99\U00000061');
760+
SELECT unistr('wrong: \U002FFFFF');
761+
SELECT unistr('wrong: \xyz');

0 commit comments

Comments
 (0)