Skip to content

Commit 6a29350

Browse files
committed
Merge
2 parents 0eacde6 + b88d31d commit 6a29350

File tree

9 files changed

+277
-291
lines changed

9 files changed

+277
-291
lines changed

html5lib/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,8 @@
483483
"area",
484484
"col",
485485
"input",
486-
"source"
486+
"source",
487+
"track"
487488
))
488489

489490
cdataElements = frozenset(('title', 'textarea'))

html5lib/html5parser.py

Lines changed: 126 additions & 101 deletions
Large diffs are not rendered by default.

html5lib/sanitizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,11 @@ def sanitize_css(self, style):
245245

246246
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
247247
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
248-
lowercaseElementName=False, lowercaseAttrName=False):
248+
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
249249
#Change case matching defaults as we only output lowercase html anyway
250250
#This solution doesn't seem ideal...
251251
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
252-
lowercaseElementName, lowercaseAttrName)
252+
lowercaseElementName, lowercaseAttrName, parser=parser)
253253

254254
def __iter__(self):
255255
for token in HTMLTokenizer.__iter__(self):

html5lib/tests/test_parser.py

Lines changed: 62 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
import sys
33
import traceback
44
import StringIO
5-
import unittest
65
import warnings
76
import re
87

98
warnings.simplefilter("error")
109

11-
from support import html5lib_test_files, TestData, convert, convertExpected
10+
from support import html5lib_test_files as data_files
11+
from support import TestData, convert, convertExpected
1212
import html5lib
1313
from html5lib import html5parser, treebuilders, constants
1414

@@ -70,94 +70,71 @@ def convertTreeDump(data):
7070

7171
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
7272

73-
class TestCase(unittest.TestCase):
74-
def runParserTest(self, innerHTML, input, expected, errors, treeClass,
75-
namespaceHTMLElements):
76-
#XXX - move this out into the setup function
77-
#concatenate all consecutive character tokens into a single token
78-
try:
79-
p = html5parser.HTMLParser(tree = treeClass,
80-
namespaceHTMLElements=namespaceHTMLElements)
81-
except constants.DataLossWarning:
82-
return
83-
84-
try:
85-
if innerHTML:
86-
document = p.parseFragment(input, innerHTML)
87-
else:
88-
try:
89-
document = p.parse(input)
90-
except constants.DataLossWarning:
91-
return
92-
except:
93-
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
94-
u"\nTraceback:", traceback.format_exc()])
95-
self.assertTrue(False, errorMsg.encode("utf8"))
96-
97-
output = convertTreeDump(p.tree.testSerializer(document))
98-
99-
expected = convertExpected(expected)
100-
if namespaceHTMLElements:
101-
expected = namespaceExpected(r"\1<html \2>", expected)
102-
103-
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
104-
u"\nReceived:", output])
105-
self.assertEquals(expected, output, errorMsg.encode("utf8"))
106-
errStr = [u"Line: %i Col: %i %s"%(line, col,
107-
constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
108-
((line,col), errorcode, datavars) in p.errors]
109-
110-
errorMsg2 = u"\n".join([u"\n\nInput:", input,
111-
u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
112-
u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
113-
if checkParseErrors:
114-
self.assertEquals(len(p.errors), len(errors), errorMsg2.encode("utf-8"))
11573

116-
def buildTestSuite():
117-
sys.stdout.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
118-
119-
for treeName, treeCls in treeTypes.iteritems():
120-
files = html5lib_test_files('tree-construction')
121-
for filename in files:
122-
testName = os.path.basename(filename).replace(".dat","")
74+
def runParserTest(innerHTML, input, expected, errors, treeClass,
75+
namespaceHTMLElements):
76+
#XXX - move this out into the setup function
77+
#concatenate all consecutive character tokens into a single token
78+
try:
79+
p = html5parser.HTMLParser(tree = treeClass,
80+
namespaceHTMLElements=namespaceHTMLElements)
81+
except constants.DataLossWarning:
82+
return
12383

124-
tests = TestData(filename, "data")
84+
try:
85+
if innerHTML:
86+
document = p.parseFragment(input, innerHTML)
87+
else:
88+
try:
89+
document = p.parse(input)
90+
except constants.DataLossWarning:
91+
return
92+
except:
93+
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
94+
u"\nTraceback:", traceback.format_exc()])
95+
assert False, errorMsg.encode("utf8")
96+
97+
output = convertTreeDump(p.tree.testSerializer(document))
98+
99+
expected = convertExpected(expected)
100+
if namespaceHTMLElements:
101+
expected = namespaceExpected(r"\1<html \2>", expected)
102+
103+
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
104+
u"\nReceived:", output])
105+
assert expected == output, errorMsg.encode("utf8")
106+
errStr = [u"Line: %i Col: %i %s"%(line, col,
107+
constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
108+
((line,col), errorcode, datavars) in p.errors]
109+
110+
errorMsg2 = u"\n".join([u"\n\nInput:", input,
111+
u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
112+
u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
113+
if checkParseErrors:
114+
assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
115+
116+
def test_parser():
117+
sys.stderr.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
118+
files = data_files('tree-construction')
119+
120+
for filename in files:
121+
testName = os.path.basename(filename).replace(".dat","")
125122

126-
for index, test in enumerate(tests):
127-
input, errors, innerHTML, expected = [test[key] for key in
123+
tests = TestData(filename, "data")
124+
125+
for index, test in enumerate(tests):
126+
input, errors, innerHTML, expected = [test[key] for key in
128127
'data', 'errors',
129128
'document-fragment',
130129
'document']
131-
if errors:
132-
errors = errors.split("\n")
133-
130+
if errors:
131+
errors = errors.split("\n")
132+
133+
for treeName, treeCls in treeTypes.iteritems():
134134
for namespaceHTMLElements in (True, False):
135-
def testFunc(self, innerHTML=innerHTML, input=input,
136-
expected=expected, errors=errors, treeCls=treeCls,
137-
namespaceHTMLElements=namespaceHTMLElements):
138-
return self.runParserTest(innerHTML, input, expected,
139-
errors, treeCls,
140-
namespaceHTMLElements)
141-
testFunc.__name__ = "test_%s_%d_%s_%s" % (testName,index+1,treeName, namespaceHTMLElements and "namespaced" or "no_html_namespace")
142-
setattr(TestCase, testFunc.__name__,
143-
testFunc)
135+
print input
136+
yield (runParserTest, innerHTML, input, expected, errors, treeCls,
137+
namespaceHTMLElements)
144138
break
145-
146-
return unittest.TestLoader().loadTestsFromTestCase(TestCase)
147-
148-
def main():
149-
# the following is temporary while the unit tests for parse errors are
150-
# still in flux
151-
if '-p' in sys.argv: # suppress check for parse errors
152-
sys.argv.remove('-p')
153-
global checkParseErrors
154-
checkParseErrors = False
155-
buildTestSuite()
156-
try:
157-
unittest.main()
158-
except SystemExit:
159-
pass
160-
161-
if __name__ == "__main__":
162-
print sys.argv
163-
main()
139+
140+

html5lib/tests/test_sanitizer.py

Lines changed: 61 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -7,92 +7,70 @@
77
except ImportError:
88
import simplejson as json
99

10-
from support import html5lib_test_files
1110
from html5lib import html5parser, sanitizer, constants
1211

13-
class SanitizeTest(unittest.TestCase):
14-
def addTest(cls, name, expected, input):
15-
def test(self, expected=expected, input=input):
16-
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
17-
parseFragment(expected).childNodes])
18-
expected = json.loads(json.dumps(expected))
19-
self.assertEqual(expected, self.sanitize_html(input))
20-
setattr(cls, name, test)
21-
addTest = classmethod(addTest)
12+
def runSanitizerTest(name, expected, input):
13+
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
14+
parseFragment(expected).childNodes])
15+
expected = json.loads(json.dumps(expected))
16+
assert expected == sanitize_html(input)
2217

23-
def sanitize_html(self,stream):
18+
def sanitize_html(stream):
2419
return ''.join([token.toxml() for token in
25-
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
26-
parseFragment(stream).childNodes])
27-
28-
def test_should_handle_astral_plane_characters(self):
29-
self.assertEqual(u"<p>\U0001d4b5 \U0001d538</p>",
30-
self.sanitize_html("<p>&#x1d4b5; &#x1d538;</p>"))
31-
32-
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
33-
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: continue ### TODO
34-
if tag_name != tag_name.lower(): continue ### TODO
35-
if tag_name == 'image':
36-
SanitizeTest.addTest("test_should_allow_%s_tag" % tag_name,
37-
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
38-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
39-
elif tag_name == 'br':
40-
SanitizeTest.addTest("test_should_allow_%s_tag" % tag_name,
41-
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
20+
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
21+
parseFragment(stream).childNodes])
22+
23+
def test_should_handle_astral_plane_characters():
24+
assert u"<p>\U0001d4b5 \U0001d538</p>" == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
25+
26+
def test_sanitizer():
27+
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
28+
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
29+
continue ### TODO
30+
if tag_name != tag_name.lower():
31+
continue ### TODO
32+
if tag_name == 'image':
33+
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
34+
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
35+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
36+
elif tag_name == 'br':
37+
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
38+
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
39+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
40+
elif tag_name in constants.voidElements:
41+
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
42+
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
43+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
44+
else:
45+
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
46+
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name,tag_name),
47+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
48+
49+
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
50+
tag_name = tag_name.upper()
51+
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
52+
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name,tag_name),
4253
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
43-
elif tag_name in constants.voidElements:
44-
SanitizeTest.addTest("test_should_allow_%s_tag" % tag_name,
45-
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
46-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
47-
else:
48-
SanitizeTest.addTest("test_should_allow_%s_tag" % tag_name,
49-
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name,tag_name),
50-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
51-
52-
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
53-
tag_name = tag_name.upper()
54-
SanitizeTest.addTest("test_should_forbid_%s_tag" % tag_name,
55-
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name,tag_name),
56-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
57-
58-
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
59-
if attribute_name != attribute_name.lower(): continue ### TODO
60-
if attribute_name == 'style': continue
61-
SanitizeTest.addTest("test_should_allow_%s_attribute" % attribute_name,
62-
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
63-
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
64-
65-
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
66-
attribute_name = attribute_name.upper()
67-
SanitizeTest.addTest("test_should_forbid_%s_attribute" % attribute_name,
68-
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
69-
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
70-
71-
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
72-
SanitizeTest.addTest("test_should_allow_%s_uris" % protocol,
73-
"<a href=\"%s\">foo</a>" % protocol,
74-
"""<a href="%s">foo</a>""" % protocol)
75-
76-
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
77-
SanitizeTest.addTest("test_should_allow_uppercase_%s_uris" % protocol,
78-
"<a href=\"%s\">foo</a>" % protocol,
79-
"""<a href="%s">foo</a>""" % protocol)
80-
81-
def buildTestSuite():
82-
for filename in html5lib_test_files("sanitizer"):
83-
for test in json.load(file(filename)):
84-
SanitizeTest.addTest('test_' + test['name'], test['output'], test['input'])
85-
86-
return unittest.TestLoader().loadTestsFromTestCase(SanitizeTest)
87-
88-
def sanitize_html(stream):
89-
return ''.join([token.toxml() for token in
90-
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
91-
parseFragment(stream).childNodes])
92-
93-
def main():
94-
buildTestSuite()
95-
unittest.main()
9654

97-
if __name__ == "__main__":
98-
main()
55+
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
56+
if attribute_name != attribute_name.lower(): continue ### TODO
57+
if attribute_name == 'style': continue
58+
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
59+
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
60+
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
61+
62+
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
63+
attribute_name = attribute_name.upper()
64+
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
65+
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
66+
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
67+
68+
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
69+
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
70+
"<a href=\"%s\">foo</a>" % protocol,
71+
"""<a href="%s">foo</a>""" % protocol)
72+
73+
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
74+
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
75+
"<a href=\"%s\">foo</a>" % protocol,
76+
"""<a href="%s">foo</a>""" % protocol)

html5lib/tokenizer.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
for e in entities:
2424
entitiesByFirstChar.setdefault(e[0], []).append(e)
2525

26-
class HTMLTokenizer:
26+
class HTMLTokenizer(object):
2727
""" This class takes care of tokenizing HTML.
2828
2929
* self.currentToken
@@ -36,8 +36,6 @@ class HTMLTokenizer:
3636
Points to HTMLInputStream object.
3737
"""
3838

39-
# XXX need to fix documentation
40-
4139
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
4240
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
4341

@@ -56,6 +54,7 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
5654

5755
# The current token being created
5856
self.currentToken = None
57+
super(HTMLTokenizer, self).__init__()
5958

6059
def __iter__(self):
6160
""" This is where the magic happens.
@@ -1151,7 +1150,7 @@ def markupDeclarationOpenState(self):
11511150
return True
11521151
elif (charStack[-1] == "[" and
11531152
self.parser is not None and
1154-
self.parser.phase == self.parser.phases["inForeignContent"] and
1153+
self.parser.tree.openElements and
11551154
self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
11561155
matched = True
11571156
for expected in ["C", "D", "A", "T", "A", "["]:

0 commit comments

Comments
 (0)