Skip to content

Commit d8406b9

Browse files
committed
Ignore XML declaration in xpath_internal(), for UTF8 databases.
When a value contained an XML declaration naming some other encoding, this function interpreted UTF8 bytes as the named encoding, yielding mojibake. xml_parse() already has similar logic. This would be necessary but not sufficient for non-UTF8 databases, so preserve behavior there until the xpath facility can support such databases comprehensively. Back-patch to 9.3 (all supported versions).

Pavel Stehule and Noah Misch

Discussion: https://postgr.es/m/CAFj8pRC-dM=tT=QkGi+Achkm+gwPmjyOayGuUfXVumCxkDgYWg@mail.gmail.com
1 parent 6290646 commit d8406b9

File tree

5 files changed

+142
-1
lines changed

5 files changed

+142
-1
lines changed

src/backend/utils/adt/xml.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3792,6 +3792,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
37923792
int32 xpath_len;
37933793
xmlChar *string;
37943794
xmlChar *xpath_expr;
3795+
size_t xmldecl_len = 0;
37953796
int i;
37963797
int ndim;
37973798
Datum *ns_names_uris;
@@ -3852,6 +3853,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
38523853
memcpy(xpath_expr, VARDATA(xpath_expr_text), xpath_len);
38533854
xpath_expr[xpath_len] = '\0';
38543855

3856+
/*
3857+
* In a UTF8 database, skip any xml declaration, which might assert
3858+
* another encoding. Ignore parse_xml_decl() failure, letting
3859+
* xmlCtxtReadMemory() report parse errors. Documentation disclaims
3860+
* xpath() support for non-ASCII data in non-UTF8 databases, so leave
3861+
* those scenarios bug-compatible with historical behavior.
3862+
*/
3863+
if (GetDatabaseEncoding() == PG_UTF8)
3864+
parse_xml_decl(string, &xmldecl_len, NULL, NULL, NULL);
3865+
38553866
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
38563867

38573868
PG_TRY();
@@ -3866,7 +3877,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
38663877
if (ctxt == NULL || xmlerrcxt->err_occurred)
38673878
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
38683879
"could not allocate parser context");
3869-
doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
3880+
doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
3881+
len - xmldecl_len, NULL, NULL, 0);
38703882
if (doc == NULL || xmlerrcxt->err_occurred)
38713883
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
38723884
"could not parse XML document");

src/test/regress/expected/xml.out

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,37 @@ SELECT xpath('/nosuchtag', '<root/>');
653653
{}
654654
(1 row)
655655

656+
-- Round-trip non-ASCII data through xpath().
657+
DO $$
658+
DECLARE
659+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
660+
degree_symbol text;
661+
res xml[];
662+
BEGIN
663+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
664+
-- the server encoding is not UTF8. The EXCEPTION block below,
665+
-- currently dead code, will be relevant if we remove this limitation.
666+
IF current_setting('server_encoding') <> 'UTF8' THEN
667+
RAISE LOG 'skip: encoding % unsupported for xml',
668+
current_setting('server_encoding');
669+
RETURN;
670+
END IF;
671+
672+
degree_symbol := convert_from('\xc2b0', 'UTF8');
673+
res := xpath('text()', (xml_declaration ||
674+
'<x>' || degree_symbol || '</x>')::xml);
675+
IF degree_symbol <> res[1]::text THEN
676+
RAISE 'expected % (%), got % (%)',
677+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
678+
res[1], convert_to(res[1]::text, 'UTF8');
679+
END IF;
680+
EXCEPTION
681+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
682+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
683+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
684+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
685+
END
686+
$$;
656687
-- Test xmlexists and xpath_exists
657688
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
658689
xmlexists

src/test/regress/expected/xml_1.out

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>');
564564
^
565565
DETAIL: This functionality requires the server to be built with libxml support.
566566
HINT: You need to rebuild PostgreSQL using --with-libxml.
567+
-- Round-trip non-ASCII data through xpath().
568+
DO $$
569+
DECLARE
570+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
571+
degree_symbol text;
572+
res xml[];
573+
BEGIN
574+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
575+
-- the server encoding is not UTF8. The EXCEPTION block below,
576+
-- currently dead code, will be relevant if we remove this limitation.
577+
IF current_setting('server_encoding') <> 'UTF8' THEN
578+
RAISE LOG 'skip: encoding % unsupported for xml',
579+
current_setting('server_encoding');
580+
RETURN;
581+
END IF;
582+
583+
degree_symbol := convert_from('\xc2b0', 'UTF8');
584+
res := xpath('text()', (xml_declaration ||
585+
'<x>' || degree_symbol || '</x>')::xml);
586+
IF degree_symbol <> res[1]::text THEN
587+
RAISE 'expected % (%), got % (%)',
588+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
589+
res[1], convert_to(res[1]::text, 'UTF8');
590+
END IF;
591+
EXCEPTION
592+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
593+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
594+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
595+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
596+
END
597+
$$;
598+
ERROR: unsupported XML feature
599+
DETAIL: This functionality requires the server to be built with libxml support.
600+
HINT: You need to rebuild PostgreSQL using --with-libxml.
601+
CONTEXT: PL/pgSQL function inline_code_block line 17 at assignment
567602
-- Test xmlexists and xpath_exists
568603
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
569604
ERROR: unsupported XML feature

src/test/regress/expected/xml_2.out

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,37 @@ SELECT xpath('/nosuchtag', '<root/>');
635635
{}
636636
(1 row)
637637

638+
-- Round-trip non-ASCII data through xpath().
639+
DO $$
640+
DECLARE
641+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
642+
degree_symbol text;
643+
res xml[];
644+
BEGIN
645+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
646+
-- the server encoding is not UTF8. The EXCEPTION block below,
647+
-- currently dead code, will be relevant if we remove this limitation.
648+
IF current_setting('server_encoding') <> 'UTF8' THEN
649+
RAISE LOG 'skip: encoding % unsupported for xml',
650+
current_setting('server_encoding');
651+
RETURN;
652+
END IF;
653+
654+
degree_symbol := convert_from('\xc2b0', 'UTF8');
655+
res := xpath('text()', (xml_declaration ||
656+
'<x>' || degree_symbol || '</x>')::xml);
657+
IF degree_symbol <> res[1]::text THEN
658+
RAISE 'expected % (%), got % (%)',
659+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
660+
res[1], convert_to(res[1]::text, 'UTF8');
661+
END IF;
662+
EXCEPTION
663+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
664+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
665+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
666+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
667+
END
668+
$$;
638669
-- Test xmlexists and xpath_exists
639670
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
640671
xmlexists

src/test/regress/sql/xml.sql

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>');
186186
SELECT xpath('name(/*)', '<root><sub/><sub/></root>');
187187
SELECT xpath('/nosuchtag', '<root/>');
188188

189+
-- Round-trip non-ASCII data through xpath().
190+
DO $$
191+
DECLARE
192+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
193+
degree_symbol text;
194+
res xml[];
195+
BEGIN
196+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
197+
-- the server encoding is not UTF8. The EXCEPTION block below,
198+
-- currently dead code, will be relevant if we remove this limitation.
199+
IF current_setting('server_encoding') <> 'UTF8' THEN
200+
RAISE LOG 'skip: encoding % unsupported for xml',
201+
current_setting('server_encoding');
202+
RETURN;
203+
END IF;
204+
205+
degree_symbol := convert_from('\xc2b0', 'UTF8');
206+
res := xpath('text()', (xml_declaration ||
207+
'<x>' || degree_symbol || '</x>')::xml);
208+
IF degree_symbol <> res[1]::text THEN
209+
RAISE 'expected % (%), got % (%)',
210+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
211+
res[1], convert_to(res[1]::text, 'UTF8');
212+
END IF;
213+
EXCEPTION
214+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
215+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
216+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
217+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
218+
END
219+
$$;
220+
189221
-- Test xmlexists and xpath_exists
190222
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
191223
SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');

0 commit comments

Comments
 (0)