Skip to content

Commit e834288

Browse files
committed
Work with lxml entities (based on patch by fantasai)
1 parent 6fdd5d7 commit e834288

File tree

5 files changed

+78
-5
lines changed

5 files changed

+78
-5
lines changed

src/html5lib/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,8 @@
508508
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
509509
)
510510

511+
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
512+
511513
entities = {
512514
"AElig;": u"\u00C6",
513515
"AElig": u"\u00C6",

src/html5lib/serializer/htmlserializer.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
_ = gettext.gettext
99

1010
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
11-
from html5lib.constants import rcdataElements
11+
from html5lib.constants import rcdataElements, entities, xmlEntities
1212

1313
from xml.sax.saxutils import escape
1414

@@ -54,26 +54,32 @@ def encode(text, encoding):
5454

5555
class HTMLSerializer(object):
5656

57+
# attribute quoting options
5758
quote_attr_values = False
5859
quote_char = '"'
5960
use_best_quote_char = True
60-
minimize_boolean_attributes = True
6161

62+
# tag syntax options
63+
omit_optional_tags = True
64+
minimize_boolean_attributes = True
6265
use_trailing_solidus = False
6366
space_before_trailing_solidus = True
67+
68+
# escaping options
6469
escape_lt_in_attrs = False
6570
escape_rcdata = False
71+
resolve_entities = True
6672

73+
# miscellaneous options
6774
inject_meta_charset = True
6875
strip_whitespace = False
6976
sanitize = False
70-
omit_optional_tags = True
7177

7278
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
7379
"minimize_boolean_attributes", "use_trailing_solidus",
7480
"space_before_trailing_solidus", "omit_optional_tags",
7581
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
76-
"escape_rcdata", 'use_trailing_solidus', "sanitize")
82+
"escape_rcdata", "resolve_entities", "sanitize")
7783

7884
def __init__(self, **kwargs):
7985
if kwargs.has_key('quote_char'):
@@ -214,6 +220,19 @@ def serialize(self, treewalker, encoding=None):
214220
comment = comment.encode(encoding, unicode_encode_errors)
215221
yield comment
216222

223+
elif type == "Entity":
224+
name = token["name"]
225+
key = name + ";"
226+
if not key in entities:
227+
self.serializeError(_("Entity %s not recognized" % name))
228+
if self.resolve_entities and key not in xmlEntities:
229+
data = entities[key]
230+
else:
231+
data = u"&%s;" % name
232+
if encoding:
233+
data = data.encode(encoding, unicode_encode_errors)
234+
yield data
235+
217236
else:
218237
self.serializeError(token["data"])
219238

src/html5lib/treewalkers/_base.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ def doctype(self, name, publicId=None, systemId=None, correct=True):
6464
"systemId": systemId,
6565
"correct": correct}
6666

67+
def entity(self, name):
    """Build the token dict that represents an entity reference.

    ``name`` is converted with ``unicode`` and stored under the ``"name"``
    key; the token's ``"type"`` is always ``"Entity"``.  The lxml tree
    walker passes the reference text with the leading ``&`` and trailing
    ``;`` already stripped.
    """
    token = {"type": "Entity"}
    token["name"] = unicode(name)
    return token
69+
6770
def unknown(self, nodeType):
6871
return self.error(_("Unknown node type: ") + nodeType)
6972

@@ -89,6 +92,7 @@ def element(self, node, namespace, name, attrs, hasChildren):
8992
TEXT = Node.TEXT_NODE
9093
ELEMENT = Node.ELEMENT_NODE
9194
COMMENT = Node.COMMENT_NODE
95+
ENTITY = Node.ENTITY_NODE
9296
UNKNOWN = "<#UNKNOWN#>"
9397

9498
class NonRecursiveTreeWalker(TreeWalker):
@@ -133,6 +137,9 @@ def __iter__(self):
133137
elif type == COMMENT:
134138
yield self.comment(details[0])
135139

140+
elif type == ENTITY:
141+
yield self.entity(details[0])
142+
136143
elif type == DOCUMENT:
137144
hasChildren = True
138145

src/html5lib/treewalkers/lxmletree.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ def getNodeDetails(self, node):
126126
elif node.tag == etree.Comment:
127127
return _base.COMMENT, node.text
128128

129+
elif node.tag == etree.Entity:
130+
return _base.ENTITY, node.text[1:-1] # strip &;
131+
129132
else:
130133
#This is assumed to be an ordinary element
131134
match = tag_regexp.match(node.tag)

tests/test_serializer.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,18 @@
22
import unittest
33
from support import simplejson, html5lib_test_files
44

5+
import html5lib
56
from html5lib import html5parser, serializer, constants
67
from html5lib.treewalkers._base import TreeWalker
78

9+
optionals_loaded = []
10+
11+
try:
12+
from lxml import etree
13+
optionals_loaded.append("lxml")
14+
except ImportError:
15+
pass
16+
817
default_namespace = constants.namespaces["html"]
918

1019
class JsonWalker(TreeWalker):
@@ -80,7 +89,32 @@ def serialize_xhtml(self, input, options):
8089
return u''.join(serializer.XHTMLSerializer(**options).
8190
serialize(JsonWalker(input),options.get("encoding",None)))
8291

83-
def buildTestSuite():
92+
class LxmlTestCase(unittest.TestCase):
    """Serializer tests for entity references found in lxml trees.

    Documents are parsed with ``resolve_entities=False`` so that entity
    references are still present in the tree handed to the serializer
    (see the lxml ``XMLParser`` documentation for that option).
    """

    def setUp(self):
        self.parser = etree.XMLParser(resolve_entities=False)
        # Not used by the tests below; kept so the fixture's attributes
        # stay available to any code that extends this test case.
        self.treewalker = html5lib.getTreeWalker("lxml")
        self.serializer = serializer.HTMLSerializer()

    def _serialize(self, doc, **options):
        """Parse ``doc`` with the fixture parser and serialize it as HTML.

        Extra keyword arguments are forwarded to ``serializer.serialize``
        as serializer options.  Optional tags are always kept so the
        output can be compared with the input markup directly.
        """
        tree = etree.fromstring(doc, parser=self.parser).getroottree()
        return serializer.serialize(tree, tree="lxml",
                                    omit_optional_tags=False, **options)

    def testEntityReplacement(self):
        # By default a named HTML entity is replaced by its character.
        doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
        result = self._serialize(doc)
        self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)

    def testEntityXML(self):
        # The predefined XML entities must stay as references.
        doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
        result = self._serialize(doc)
        self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)

    def testEntityNoResolve(self):
        # With resolve_entities=False the reference is emitted unchanged.
        doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
        result = self._serialize(doc, resolve_entities=False)
        self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
116+
117+
def buildBasicTestSuite():
84118
for filename in html5lib_test_files('serializer', '*.test'):
85119
test_name = os.path.basename(filename).replace('.test','')
86120
tests = simplejson.load(file(filename))
@@ -92,6 +126,14 @@ def buildTestSuite():
92126
test.get("options", {}))
93127
return unittest.TestLoader().loadTestsFromTestCase(TestCase)
94128

129+
def buildTestSuite():
    """Assemble the complete serializer test suite.

    The basic suite is always included.  The ``LxmlTestCase`` tests are
    added only when ``"lxml"`` was recorded in ``optionals_loaded``.

    Returns a ``unittest.TestSuite`` holding the collected suites.
    """
    suite = unittest.TestSuite()
    suite.addTest(buildBasicTestSuite())
    if "lxml" in optionals_loaded:
        loader = unittest.TestLoader()
        suite.addTest(loader.loadTestsFromTestCase(LxmlTestCase))
    return suite
135+
136+
95137
def main():
96138
buildTestSuite()
97139
unittest.main()

0 commit comments

Comments
 (0)