@@ -141,6 +141,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp,
141
141
xmlChar * * version , xmlChar * * encoding , int * standalone );
142
142
static bool print_xml_decl (StringInfo buf , const xmlChar * version ,
143
143
pg_enc encoding , int standalone );
144
+ static bool xml_doctype_in_content (const xmlChar * str );
144
145
static xmlDocPtr xml_parse (text * data , XmlOptionType xmloption_arg ,
145
146
bool preserve_whitespace , int encoding );
146
147
static text * xml_xmlnodetoxmltype (xmlNodePtr cur , PgXmlErrorContext * xmlerrcxt );
@@ -1243,8 +1244,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp,
1243
1244
if (xmlStrncmp (p , (xmlChar * ) "<?xml" , 5 ) != 0 )
1244
1245
goto finished ;
1245
1246
1246
- /* if next char is name char, it's a PI like <?xml-stylesheet ...?> */
1247
- utf8len = strlen ((const char * ) (p + 5 ));
1247
+ /*
1248
+ * If next char is a name char, it's a PI like <?xml-stylesheet ...?>
1249
+ * rather than an XMLDecl, so we have done what we came to do and found no
1250
+ * XMLDecl.
1251
+ *
1252
+ * We need an input length value for xmlGetUTF8Char, but there's no need
1253
+ * to count the whole document size, so use strnlen not strlen.
1254
+ */
1255
+ utf8len = strnlen ((const char * ) (p + 5 ), MAX_MULTIBYTE_CHAR_LEN );
1248
1256
utf8char = xmlGetUTF8Char (p + 5 , & utf8len );
1249
1257
if (PG_XMLISNAMECHAR (utf8char ))
1250
1258
goto finished ;
@@ -1415,6 +1423,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version,
1415
1423
return false;
1416
1424
}
1417
1425
1426
+ /*
1427
+ * Test whether an input that is to be parsed as CONTENT contains a DTD.
1428
+ *
1429
+ * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not
1430
+ * satisfied by a document with a DTD, which is a bit of a wart, as it means
1431
+ * the CONTENT type is not a proper superset of DOCUMENT. SQL/XML:2006 and
1432
+ * later fix that, by redefining content with reference to the "more
1433
+ * permissive" Document Node of the XQuery/XPath Data Model, such that any
1434
+ * DOCUMENT value is indeed also a CONTENT value. That definition is more
1435
+ * useful, as CONTENT becomes usable for parsing input of unknown form (think
1436
+ * pg_restore).
1437
+ *
1438
+ * As used below in parse_xml when parsing for CONTENT, libxml does not give
1439
+ * us the 2006+ behavior, but only the 2003; it will choke if the input has
1440
+ * a DTD. But we can provide the 2006+ definition of CONTENT easily enough,
1441
+ * by detecting this case first and simply doing the parse as DOCUMENT.
1442
+ *
1443
+ * A DTD can be found arbitrarily far in, but that would be a contrived case;
1444
+ * it will ordinarily start within a few dozen characters. The only things
1445
+ * that can precede it are an XMLDecl (here, the caller will have called
1446
+ * parse_xml_decl already), whitespace, comments, and processing instructions.
1447
+ * This function need only return true if it sees a valid sequence of such
1448
+ * things leading to <!DOCTYPE. It can simply return false in any other
1449
+ * cases, including malformed input; that will mean the input gets parsed as
1450
+ * CONTENT as originally planned, with libxml reporting any errors.
1451
+ *
1452
+ * This is only to be called from xml_parse, when pg_xml_init has already
1453
+ * been called. The input is already in UTF8 encoding.
1454
+ */
1455
+ static bool
1456
+ xml_doctype_in_content (const xmlChar * str )
1457
+ {
1458
+ const xmlChar * p = str ;
1459
+
1460
+ for (;;)
1461
+ {
1462
+ const xmlChar * e ;
1463
+
1464
+ SKIP_XML_SPACE (p );
1465
+ if (* p != '<' )
1466
+ return false;
1467
+ p ++ ;
1468
+
1469
+ if (* p == '!' )
1470
+ {
1471
+ p ++ ;
1472
+
1473
+ /* if we see <!DOCTYPE, we can return true */
1474
+ if (xmlStrncmp (p , (xmlChar * ) "DOCTYPE" , 7 ) == 0 )
1475
+ return true;
1476
+
1477
+ /* otherwise, if it's not a comment, fail */
1478
+ if (xmlStrncmp (p , (xmlChar * ) "--" , 2 ) != 0 )
1479
+ return false;
1480
+ /* find end of comment: find -- and a > must follow */
1481
+ p = xmlStrstr (p + 2 , (xmlChar * ) "--" );
1482
+ if (!p || p [2 ] != '>' )
1483
+ return false;
1484
+ /* advance over comment, and keep scanning */
1485
+ p += 3 ;
1486
+ continue ;
1487
+ }
1488
+
1489
+ /* otherwise, if it's not a PI <?target something?>, fail */
1490
+ if (* p != '?' )
1491
+ return false;
1492
+ p ++ ;
1493
+
1494
+ /* find end of PI (the string ?> is forbidden within a PI) */
1495
+ e = xmlStrstr (p , (xmlChar * ) "?>" );
1496
+ if (!e )
1497
+ return false;
1498
+
1499
+ /* we don't check PIs carefully, but do reject "xml" target */
1500
+ if (e - p >= 3 && xmlStrncasecmp (p , (xmlChar * ) "xml" , 3 ) == 0 )
1501
+ return false;
1502
+
1503
+ /* advance over PI, keep scanning */
1504
+ p = e + 2 ;
1505
+ }
1506
+ }
1507
+
1418
1508
1419
1509
/*
1420
1510
* Convert a C string to XML internal representation
@@ -1450,14 +1540,38 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
1450
1540
/* Use a TRY block to ensure we clean up correctly */
1451
1541
PG_TRY ();
1452
1542
{
1543
+ bool parse_as_document = false;
1544
+ int res_code ;
1545
+ size_t count = 0 ;
1546
+ xmlChar * version = NULL ;
1547
+ int standalone = 0 ;
1548
+
1453
1549
xmlInitParser ();
1454
1550
1455
1551
ctxt = xmlNewParserCtxt ();
1456
1552
if (ctxt == NULL || xmlerrcxt -> err_occurred )
1457
1553
xml_ereport (xmlerrcxt , ERROR , ERRCODE_OUT_OF_MEMORY ,
1458
1554
"could not allocate parser context" );
1459
1555
1556
+ /* Decide whether to parse as document or content */
1460
1557
if (xmloption_arg == XMLOPTION_DOCUMENT )
1558
+ parse_as_document = true;
1559
+ else
1560
+ {
1561
+ /* Parse and skip over the XML declaration, if any */
1562
+ res_code = parse_xml_decl (utf8string ,
1563
+ & count , & version , NULL , & standalone );
1564
+ if (res_code != 0 )
1565
+ xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1566
+ "invalid XML content: invalid XML declaration" ,
1567
+ res_code );
1568
+
1569
+ /* Is there a DOCTYPE element? */
1570
+ if (xml_doctype_in_content (utf8string + count ))
1571
+ parse_as_document = true;
1572
+ }
1573
+
1574
+ if (parse_as_document )
1461
1575
{
1462
1576
/*
1463
1577
* Note, that here we try to apply DTD defaults
@@ -1472,23 +1586,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
1472
1586
XML_PARSE_NOENT | XML_PARSE_DTDATTR
1473
1587
| (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS ));
1474
1588
if (doc == NULL || xmlerrcxt -> err_occurred )
1475
- xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1476
- "invalid XML document" );
1589
+ {
1590
+ /* Use original option to decide which error code to throw */
1591
+ if (xmloption_arg == XMLOPTION_DOCUMENT )
1592
+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1593
+ "invalid XML document" );
1594
+ else
1595
+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_CONTENT ,
1596
+ "invalid XML content" );
1597
+ }
1477
1598
}
1478
1599
else
1479
1600
{
1480
- int res_code ;
1481
- size_t count ;
1482
- xmlChar * version ;
1483
- int standalone ;
1484
-
1485
- res_code = parse_xml_decl (utf8string ,
1486
- & count , & version , NULL , & standalone );
1487
- if (res_code != 0 )
1488
- xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1489
- "invalid XML content: invalid XML declaration" ,
1490
- res_code );
1491
-
1492
1601
doc = xmlNewDoc (version );
1493
1602
Assert (doc -> encoding == NULL );
1494
1603
doc -> encoding = xmlStrdup ((const xmlChar * ) "UTF-8" );
0 commit comments