@@ -139,6 +139,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp,
139
139
xmlChar * * version , xmlChar * * encoding , int * standalone );
140
140
static bool print_xml_decl (StringInfo buf , const xmlChar * version ,
141
141
pg_enc encoding , int standalone );
142
+ static bool xml_doctype_in_content (const xmlChar * str );
142
143
static xmlDocPtr xml_parse (text * data , XmlOptionType xmloption_arg ,
143
144
bool preserve_whitespace , int encoding );
144
145
static text * xml_xmlnodetoxmltype (xmlNodePtr cur , PgXmlErrorContext * xmlerrcxt );
@@ -1154,8 +1155,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp,
1154
1155
if (xmlStrncmp (p , (xmlChar * ) "<?xml" , 5 ) != 0 )
1155
1156
goto finished ;
1156
1157
1157
- /* if next char is name char, it's a PI like <?xml-stylesheet ...?> */
1158
- utf8len = strlen ((const char * ) (p + 5 ));
1158
+ /*
1159
+ * If next char is a name char, it's a PI like <?xml-stylesheet ...?>
1160
+ * rather than an XMLDecl, so we have done what we came to do and found no
1161
+ * XMLDecl.
1162
+ *
1163
+ * We need an input length value for xmlGetUTF8Char, but there's no need
1164
+ * to count the whole document size, so use strnlen not strlen.
1165
+ */
1166
+ utf8len = strnlen ((const char * ) (p + 5 ), MAX_MULTIBYTE_CHAR_LEN );
1159
1167
utf8char = xmlGetUTF8Char (p + 5 , & utf8len );
1160
1168
if (PG_XMLISNAMECHAR (utf8char ))
1161
1169
goto finished ;
@@ -1326,6 +1334,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version,
1326
1334
return false;
1327
1335
}
1328
1336
1337
+ /*
1338
+ * Test whether an input that is to be parsed as CONTENT contains a DTD.
1339
+ *
1340
+ * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not
1341
+ * satisfied by a document with a DTD, which is a bit of a wart, as it means
1342
+ * the CONTENT type is not a proper superset of DOCUMENT. SQL/XML:2006 and
1343
+ * later fix that, by redefining content with reference to the "more
1344
+ * permissive" Document Node of the XQuery/XPath Data Model, such that any
1345
+ * DOCUMENT value is indeed also a CONTENT value. That definition is more
1346
+ * useful, as CONTENT becomes usable for parsing input of unknown form (think
1347
+ * pg_restore).
1348
+ *
1349
+ * As used below in parse_xml when parsing for CONTENT, libxml does not give
1350
+ * us the 2006+ behavior, but only the 2003; it will choke if the input has
1351
+ * a DTD. But we can provide the 2006+ definition of CONTENT easily enough,
1352
+ * by detecting this case first and simply doing the parse as DOCUMENT.
1353
+ *
1354
+ * A DTD can be found arbitrarily far in, but that would be a contrived case;
1355
+ * it will ordinarily start within a few dozen characters. The only things
1356
+ * that can precede it are an XMLDecl (here, the caller will have called
1357
+ * parse_xml_decl already), whitespace, comments, and processing instructions.
1358
+ * This function need only return true if it sees a valid sequence of such
1359
+ * things leading to <!DOCTYPE. It can simply return false in any other
1360
+ * cases, including malformed input; that will mean the input gets parsed as
1361
+ * CONTENT as originally planned, with libxml reporting any errors.
1362
+ *
1363
+ * This is only to be called from xml_parse, when pg_xml_init has already
1364
+ * been called. The input is already in UTF8 encoding.
1365
+ */
1366
+ static bool
1367
+ xml_doctype_in_content (const xmlChar * str )
1368
+ {
1369
+ const xmlChar * p = str ;
1370
+
1371
+ for (;;)
1372
+ {
1373
+ const xmlChar * e ;
1374
+
1375
+ SKIP_XML_SPACE (p );
1376
+ if (* p != '<' )
1377
+ return false;
1378
+ p ++ ;
1379
+
1380
+ if (* p == '!' )
1381
+ {
1382
+ p ++ ;
1383
+
1384
+ /* if we see <!DOCTYPE, we can return true */
1385
+ if (xmlStrncmp (p , (xmlChar * ) "DOCTYPE" , 7 ) == 0 )
1386
+ return true;
1387
+
1388
+ /* otherwise, if it's not a comment, fail */
1389
+ if (xmlStrncmp (p , (xmlChar * ) "--" , 2 ) != 0 )
1390
+ return false;
1391
+ /* find end of comment: find -- and a > must follow */
1392
+ p = xmlStrstr (p + 2 , (xmlChar * ) "--" );
1393
+ if (!p || p [2 ] != '>' )
1394
+ return false;
1395
+ /* advance over comment, and keep scanning */
1396
+ p += 3 ;
1397
+ continue ;
1398
+ }
1399
+
1400
+ /* otherwise, if it's not a PI <?target something?>, fail */
1401
+ if (* p != '?' )
1402
+ return false;
1403
+ p ++ ;
1404
+
1405
+ /* find end of PI (the string ?> is forbidden within a PI) */
1406
+ e = xmlStrstr (p , (xmlChar * ) "?>" );
1407
+ if (!e )
1408
+ return false;
1409
+
1410
+ /* we don't check PIs carefully, but do reject "xml" target */
1411
+ if (e - p >= 3 && xmlStrncasecmp (p , (xmlChar * ) "xml" , 3 ) == 0 )
1412
+ return false;
1413
+
1414
+ /* advance over PI, keep scanning */
1415
+ p = e + 2 ;
1416
+ }
1417
+ }
1418
+
1329
1419
1330
1420
/*
1331
1421
* Convert a C string to XML internal representation
@@ -1361,14 +1451,38 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
1361
1451
/* Use a TRY block to ensure we clean up correctly */
1362
1452
PG_TRY ();
1363
1453
{
1454
+ bool parse_as_document = false;
1455
+ int res_code ;
1456
+ size_t count = 0 ;
1457
+ xmlChar * version = NULL ;
1458
+ int standalone = 0 ;
1459
+
1364
1460
xmlInitParser ();
1365
1461
1366
1462
ctxt = xmlNewParserCtxt ();
1367
1463
if (ctxt == NULL || xmlerrcxt -> err_occurred )
1368
1464
xml_ereport (xmlerrcxt , ERROR , ERRCODE_OUT_OF_MEMORY ,
1369
1465
"could not allocate parser context" );
1370
1466
1467
+ /* Decide whether to parse as document or content */
1371
1468
if (xmloption_arg == XMLOPTION_DOCUMENT )
1469
+ parse_as_document = true;
1470
+ else
1471
+ {
1472
+ /* Parse and skip over the XML declaration, if any */
1473
+ res_code = parse_xml_decl (utf8string ,
1474
+ & count , & version , NULL , & standalone );
1475
+ if (res_code != 0 )
1476
+ xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1477
+ "invalid XML content: invalid XML declaration" ,
1478
+ res_code );
1479
+
1480
+ /* Is there a DOCTYPE element? */
1481
+ if (xml_doctype_in_content (utf8string + count ))
1482
+ parse_as_document = true;
1483
+ }
1484
+
1485
+ if (parse_as_document )
1372
1486
{
1373
1487
/*
1374
1488
* Note, that here we try to apply DTD defaults
@@ -1383,23 +1497,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
1383
1497
XML_PARSE_NOENT | XML_PARSE_DTDATTR
1384
1498
| (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS ));
1385
1499
if (doc == NULL || xmlerrcxt -> err_occurred )
1386
- xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1387
- "invalid XML document" );
1500
+ {
1501
+ /* Use original option to decide which error code to throw */
1502
+ if (xmloption_arg == XMLOPTION_DOCUMENT )
1503
+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1504
+ "invalid XML document" );
1505
+ else
1506
+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_CONTENT ,
1507
+ "invalid XML content" );
1508
+ }
1388
1509
}
1389
1510
else
1390
1511
{
1391
- int res_code ;
1392
- size_t count ;
1393
- xmlChar * version ;
1394
- int standalone ;
1395
-
1396
- res_code = parse_xml_decl (utf8string ,
1397
- & count , & version , NULL , & standalone );
1398
- if (res_code != 0 )
1399
- xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1400
- "invalid XML content: invalid XML declaration" ,
1401
- res_code );
1402
-
1403
1512
doc = xmlNewDoc (version );
1404
1513
Assert (doc -> encoding == NULL );
1405
1514
doc -> encoding = xmlStrdup ((const xmlChar * ) "UTF-8" );
0 commit comments