Skip to content

bpo-17088: Fix handling of XML attributes when serializing with default namespace #11050

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 78 additions & 6 deletions Lib/test/test_xml_etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def assertEqualElements(self, alice, bob):
# --------------------------------------------------------------------
# element tree tests

class ElementTreeTest(unittest.TestCase):
class ElementTreeTest(ElementTestCase, unittest.TestCase):

def serialize_check(self, elem, expected):
self.assertEqual(serialize(elem), expected)
Expand Down Expand Up @@ -972,6 +972,79 @@ def test_qname(self):
self.assertNotEqual(q1, 'ns:tag')
self.assertEqual(q1, '{ns}tag')

def test_namespace_attribs(self):
# Unprefixed attributes are unqualified even if a default
# namespace is in effect. (This is a little unclear in some
# versions of the XML TR but is clarified in errata and other
# versions.) See bugs.python.org issue 17088.
#
# The reasoning behind this, alluded to in the spec, is that
# attribute meanings already depend on the element they're
# attached to; attributes have always lived in per-element
# namespaces even before explicit XML namespaces were
# introduced. For that reason qualified attribute names are
# only really needed when one XML module defines attributes
# that can be placed on elements defined in a different module
# (such as happens with XLINK or, for that matter, the XML
# namespace spec itself).
e = ET.XML(
'<pf:elt xmlns:pf="space1" xmlns:pf2="space2" foo="value">'
'<pf:foo foo="value2" pf2:foo="value3"/>'
'<pf2:foo foo="value4" pf:foo="value5" pf2:foo="value6"/>'
'<foo foo="value7" pf:foo="value8"/>'
'</pf:elt>')
self.assertEqual(e.tag, '{space1}elt')
self.assertEqual(e.get('foo'), 'value')
self.assertIsNone(e.get('{space1}foo'))
self.assertIsNone(e.get('{space2}foo'))
self.assertEqual(e[0].tag, '{space1}foo')
self.assertEqual(e[0].attrib, { 'foo': 'value2',
'{space2}foo': 'value3' })
self.assertEqual(e[1].tag, '{space2}foo')
self.assertEqual(e[1].attrib, { 'foo': 'value4',
'{space1}foo': 'value5',
'{space2}foo': 'value6' })
self.assertEqual(e[2].tag, 'foo')
self.assertEqual(e[2].attrib, { 'foo': 'value7',
'{space1}foo': 'value8' })

serialized1 = (
'<ns0:elt xmlns:ns0="space1" xmlns:ns1="space2" foo="value">'
'<ns0:foo foo="value2" ns1:foo="value3" />'
'<ns1:foo foo="value4" ns0:foo="value5" ns1:foo="value6" />'
'<foo foo="value7" ns0:foo="value8" />'
'</ns0:elt>')
self.assertEqual(serialize(e), serialized1)
self.assertEqualElements(e, ET.XML(serialized1))

# Test writing with a default namespace.
with self.assertRaisesRegex(ValueError,
'cannot use non-qualified name.* with default_namespace option'):
serialize(e, default_namespace="space1")

# Remove the unqualified element from the tree so we can test
# further.
del e[2]

# Serialization can require a namespace prefix to be declared for
# space1 even if no elements use that prefix, in order to
# write an attribute name in that namespace.
serialized2 = (
'<ns0:elt xmlns="space2" xmlns:ns0="space1" xmlns:ns1="space2" foo="value">'
'<ns0:foo foo="value2" ns1:foo="value3" />'
'<foo foo="value4" ns0:foo="value5" ns1:foo="value6" />'
'</ns0:elt>')
self.assertEqual(serialize(e, default_namespace="space2"), serialized2)
self.assertEqualElements(e, ET.XML(serialized2))

serialized3 = (
'<elt xmlns="space1" xmlns:ns0="space2" xmlns:ns1="space1" foo="value">'
'<foo foo="value2" ns0:foo="value3" />'
'<ns0:foo foo="value4" ns1:foo="value5" ns0:foo="value6" />'
'</elt>')
self.assertEqual(serialize(e, default_namespace="space1"), serialized3)
self.assertEqualElements(e, ET.XML(serialized3))

def test_doctype_public(self):
# Test PUBLIC doctype.

Expand Down Expand Up @@ -1619,18 +1692,17 @@ def test_bug_200709_default_namespace(self):
s = ET.SubElement(e, "{default}elem")
s = ET.SubElement(e, "{not-default}elem")
self.assertEqual(serialize(e, default_namespace="default"), # 2
'<elem xmlns="default" xmlns:ns1="not-default">'
'<elem xmlns="default" xmlns:ns0="not-default">'
'<elem />'
'<ns1:elem />'
'<ns0:elem />'
'</elem>')

e = ET.Element("{default}elem")
s = ET.SubElement(e, "{default}elem")
s = ET.SubElement(e, "elem") # unprefixed name
with self.assertRaises(ValueError) as cm:
with self.assertRaisesRegex(ValueError,
'cannot use non-qualified name.* with default_namespace option'):
serialize(e, default_namespace="default") # 3
self.assertEqual(str(cm.exception),
'cannot use non-qualified names with default_namespace option')

def test_bug_200709_register_namespace(self):
e = ET.Element("{http://namespace.invalid/does/not/exist/}title")
Expand Down
96 changes: 56 additions & 40 deletions Lib/xml/etree/ElementTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,63 +836,81 @@ def _get_writer(file_or_filename, encoding):
def _namespaces(elem, default_namespace=None):
# identify namespaces used in this tree

# maps qnames to *encoded* prefix:local names
qnames = {None: None}
# maps qnames to *encoded* prefix:local names,
# value is a pair of prefix:local strings,
# second value is for attribute names
qnames = {None: (None, None)}

# maps uri:s to prefixes
namespaces = {}
if default_namespace:
namespaces[default_namespace] = ""

def add_qname(qname):
def serialize_qname(qname, is_attr):
# calculate serialized qname representation
try:
if qname[:1] == "{":
uri, tag = qname[1:].rsplit("}", 1)
if qname[:1] == "{":
uri, local = qname[1:].rsplit("}", 1)
if not is_attr and uri == default_namespace:
prefix = ""
else:
prefix = namespaces.get(uri)
if prefix is None:
prefix = _namespace_map.get(uri)
if prefix is None:
prefix = "ns%d" % len(namespaces)
if prefix != "xml":
namespaces[uri] = prefix
if prefix:
qnames[qname] = "%s:%s" % (prefix, tag)
else:
qnames[qname] = tag # default element
if prefix:
return "%s:%s" % (prefix, local)
else:
if default_namespace:
# FIXME: can this be handled in XML 1.0?
raise ValueError(
"cannot use non-qualified names with "
"default_namespace option"
)
qnames[qname] = qname
except TypeError:
_raise_serialization_error(qname)
Comment on lines -871 to -872
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where's this part gone? Wouldn't this leak a TypeError to the user side?

return local # default element
else:
if not is_attr and default_namespace:
# FIXME: can this be handled in XML 1.0?
raise ValueError(
"cannot use non-qualified name (<%s>) with "
"default_namespace option" % qname
)
return qname

def add_qname(qname, is_attr=False):
ser_tag, ser_attr = qnames.get(qname, (None, None))
if is_attr:
if ser_attr is None:
ser_attr = serialize_qname(qname, True)
if not default_namespace:
ser_tag = ser_attr
Comment on lines +879 to +880
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems worth a comment in the code (likewise below).

qnames[qname] = (ser_tag, ser_attr)
else:
if ser_tag is None:
ser_tag = serialize_qname(qname, False)
if not default_namespace:
ser_attr = ser_tag
qnames[qname] = (ser_tag, ser_attr)

# populate qname and namespaces table
for elem in elem.iter():
tag = elem.tag
if isinstance(tag, QName):
if tag.text not in qnames:
add_qname(tag.text)
add_qname(tag.text)
elif isinstance(tag, str):
if tag not in qnames:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These conditions might have been there for performance reasons. Not sure if it matters, but that's up to some benchmarking I think.

add_qname(tag)
add_qname(tag)
elif tag is not None and tag is not Comment and tag is not PI:
_raise_serialization_error(tag)
for key, value in elem.items():
if isinstance(key, QName):
key = key.text
if key not in qnames:
add_qname(key)
if isinstance(value, QName) and value.text not in qnames:
elif not isinstance(key, str):
_raise_serialization_error(key)
add_qname(key, is_attr=True)
if isinstance(value, QName):
add_qname(value.text)
text = elem.text
if isinstance(text, QName) and text.text not in qnames:
if isinstance(text, QName):
add_qname(text.text)
return qnames, namespaces

prefix_map = {prefix: ns for ns, prefix in namespaces.items()}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prefixes don't have to be globally unique, so this might introduce bugs. (Suggests to me that there might be missing tests.)

if default_namespace:
prefix_map[""] = default_namespace
return qnames, prefix_map

def _serialize_xml(write, elem, qnames, namespaces,
short_empty_elements, **kwargs):
Expand All @@ -903,7 +921,7 @@ def _serialize_xml(write, elem, qnames, namespaces,
elif tag is ProcessingInstruction:
write("<?%s?>" % text)
else:
tag = qnames[tag]
tag = qnames[tag][0]
if tag is None:
if text:
write(_escape_cdata(text))
Expand All @@ -915,8 +933,7 @@ def _serialize_xml(write, elem, qnames, namespaces,
items = list(elem.items())
if items or namespaces:
if namespaces:
for v, k in sorted(namespaces.items(),
key=lambda x: x[1]): # sort on prefix
for k, v in sorted(namespaces.items()):
if k:
k = ":" + k
write(" xmlns%s=\"%s\"" % (
Expand All @@ -927,10 +944,10 @@ def _serialize_xml(write, elem, qnames, namespaces,
if isinstance(k, QName):
k = k.text
if isinstance(v, QName):
v = qnames[v.text]
v = qnames[v.text][0]
else:
v = _escape_attrib(v)
write(" %s=\"%s\"" % (qnames[k], v))
write(" %s=\"%s\"" % (qnames[k][1], v))
if text or len(elem) or not short_empty_elements:
write(">")
if text:
Expand Down Expand Up @@ -960,7 +977,7 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
elif tag is ProcessingInstruction:
write("<?%s?>" % _escape_cdata(text))
else:
tag = qnames[tag]
tag = qnames[tag][0]
if tag is None:
if text:
write(_escape_cdata(text))
Expand All @@ -971,8 +988,7 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
items = list(elem.items())
if items or namespaces:
if namespaces:
for v, k in sorted(namespaces.items(),
key=lambda x: x[1]): # sort on prefix
for k, v in sorted(namespaces.items()):
if k:
k = ":" + k
write(" xmlns%s=\"%s\"" % (
Expand All @@ -983,11 +999,11 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
if isinstance(k, QName):
k = k.text
if isinstance(v, QName):
v = qnames[v.text]
v = qnames[v.text][0]
else:
v = _escape_attrib_html(v)
# FIXME: handle boolean attributes
write(" %s=\"%s\"" % (qnames[k], v))
write(" %s=\"%s\"" % (qnames[k][1], v))
write(">")
ltag = tag.lower()
if text:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ElementTree's serialization of attribute names when a default namespace is
passed was corrected. Previously, unqualified attribute names would be
rejected unnecessarily, while attributes in the default namespace would have
their prefix stripped, which goes against the rules for namespace defaulting
and uniqueness of attributes as specified in the XML namespaces spec.