Skip to content

Commit 221b909

Browse files
author
Mark Pilgrim
committed
refactored validator filter, added attribute tests for input element
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40968
1 parent 14ee2b6 commit 221b909

File tree

1 file changed

+238
-81
lines changed

1 file changed

+238
-81
lines changed

src/html5lib/filters/validator.py

Lines changed: 238 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -9,84 +9,196 @@
99
>>> p.parse('<!doctype html>\n<html foo=bar></html>')
1010
<<class 'html5lib.treebuilders.simpletree.Document'> None>
1111
>>> p.errors
12-
[((2, 14), 'unrecognized-attribute', {'attributeName': u'foo', 'tagName': u'html'})]
12+
[((2, 14), 'unknown-attribute', {'attributeName': u'foo', 'tagName': u'html'})]
1313
"""
1414

15+
try:
16+
frozenset
17+
except NameError:
18+
# Import from the sets module for python 2.3
19+
from sets import Set as set
20+
from sets import ImmutableSet as frozenset
1521
import _base
1622
from html5lib.constants import E
1723
from html5lib import tokenizer
1824
import gettext
1925
_ = gettext.gettext
2026

2127
# Register the conformance checker's error codes in the shared message table.
# Each value is a %-style template; the placeholders name the keys that the
# matching ParseError token must carry in its "datavars" dict.
E.update({
    # Closing delimiter used to be a stray apostrophe: <%(tagName)s'
    "unknown-start-tag":
        _(u"Unknown start tag <%(tagName)s>"),
    "unknown-attribute":
        _(u"Unknown '%(attributeName)s' attribute on <%(tagName)s>"),
    "missing-required-attribute":
        _(u"Missing required '%(attributeName)s' attribute on <%(tagName)s>"),
    "unknown-input-type":
        _(u"Unknown value for input type: '%(inputType)s'"),
    "attribute-not-allowed-on-this-input-type":
        _(u"'%(attributeName)s' attribute is not allowed on <input type='%(inputType)s'>"),
})
2739

28-
# Attribute names that checkStartTagUnknownAttributes accepts on any element,
# in addition to the element-specific names in allowedAttributeMap.
globalAttributes = frozenset("""
    class contenteditable contextmenu dir draggable id irrelevant lang ref
    tabindex template title
    onabort onbeforeunload onblur onchange onclick oncontextmenu ondblclick
    ondrag ondragend ondragenter ondragleave ondragover ondragstart ondrop
    onerror onfocus onkeydown onkeypress onkeyup onload onmessage
    onmousedown onmousemove onmouseout onmouseover onmouseup onmousewheel
    onresize onscroll onselect onsubmit onunload
""".split())
3648
# XXX lang in HTML only, xml:lang in XHTML only
3749

3850
# Element name -> element-specific attribute names (global attributes are
# listed separately in globalAttributes).  A tag name that is not a key of
# this map is reported as "unknown-start-tag" by checkUnknownStartTag.

# Shared value for elements that take no element-specific attributes.
_noAttrs = frozenset(())
# <video> and <audio> accept the same attribute set.
_mediaAttrs = frozenset(('src', 'autoplay', 'start', 'loopstart', 'loopend',
                         'end', 'loopcount', 'controls'))

allowedAttributeMap = {
    'html': frozenset(('xmlns',)),
    'head': _noAttrs,
    'title': _noAttrs,
    'base': frozenset(('href', 'target')),
    'link': frozenset(('href', 'rel', 'media', 'hreflang', 'type')),
    # XXX charset in HTML only
    'meta': frozenset(('name', 'http-equiv', 'content', 'charset')),
    'style': frozenset(('media', 'type', 'scoped')),
    'body': _noAttrs,
    'section': _noAttrs,
    'nav': _noAttrs,
    'article': _noAttrs,
    'blockquote': frozenset(('cite',)),
    'aside': _noAttrs,
    'h1': _noAttrs,
    'h2': _noAttrs,
    'h3': _noAttrs,
    'h4': _noAttrs,
    'h5': _noAttrs,
    'h6': _noAttrs,
    'header': _noAttrs,
    'footer': _noAttrs,
    'address': _noAttrs,
    'p': _noAttrs,
    'hr': _noAttrs,
    'br': _noAttrs,
    'dialog': _noAttrs,
    'pre': _noAttrs,
    'ol': frozenset(('start',)),
    'ul': _noAttrs,
    # XXX depends on parent
    'li': frozenset(('value',)),
    'dl': _noAttrs,
    'dt': _noAttrs,
    'dd': _noAttrs,
    'a': frozenset(('href', 'target', 'ping', 'rel', 'media', 'hreflang',
                    'type')),
    'q': frozenset(('cite',)),
    'cite': _noAttrs,
    'em': _noAttrs,
    'strong': _noAttrs,
    'small': _noAttrs,
    'm': _noAttrs,
    'dfn': _noAttrs,
    'abbr': _noAttrs,
    'time': frozenset(('datetime',)),
    'meter': frozenset(('value', 'min', 'low', 'high', 'max', 'optimum')),
    'progress': frozenset(('value', 'max')),
    'code': _noAttrs,
    'var': _noAttrs,
    'samp': _noAttrs,
    'kbd': _noAttrs,
    'sup': _noAttrs,
    'sub': _noAttrs,
    'span': _noAttrs,
    'i': _noAttrs,
    'b': _noAttrs,
    'bdo': _noAttrs,
    'ins': frozenset(('cite', 'datetime')),
    'del': frozenset(('cite', 'datetime')),
    'figure': _noAttrs,
    # XXX ismap depends on parent
    'img': frozenset(('alt', 'src', 'usemap', 'ismap', 'height', 'width')),
    'iframe': frozenset(('src',)),
    # <embed> handled separately
    'object': frozenset(('data', 'type', 'usemap', 'height', 'width')),
    'param': frozenset(('name', 'value')),
    'video': _mediaAttrs,
    'audio': _mediaAttrs,
    'source': frozenset(('src', 'type', 'media')),
    'canvas': frozenset(('height', 'width')),
    'map': _noAttrs,
    'area': frozenset(('alt', 'coords', 'shape', 'href', 'target', 'ping',
                       'rel', 'media', 'hreflang', 'type')),
    'table': _noAttrs,
    'caption': _noAttrs,
    # XXX only if element contains no <col> elements
    'colgroup': frozenset(('span',)),
    'col': frozenset(('span',)),
    'tbody': _noAttrs,
    'thead': _noAttrs,
    'tfoot': _noAttrs,
    'tr': _noAttrs,
    'td': frozenset(('colspan', 'rowspan')),
    'th': frozenset(('colspan', 'rowspan', 'scope')),
    # 'form': frozenset(('action', 'method', 'enctype', 'accept', 'name', 'onsubmit',
    #                    'onreset', 'accept-charset', 'data', 'replace')),
    # all possible <input> attributes are listed here but <input> is really
    # handled separately
    'input': frozenset(('accept', 'accesskey', 'action', 'alt',
                        'autocomplete', 'autofocus', 'checked', 'disabled',
                        'enctype', 'form', 'inputmode', 'list', 'maxlength',
                        'method', 'min', 'max', 'name', 'pattern', 'step',
                        'readonly', 'replace', 'required', 'size', 'src',
                        'tabindex', 'target', 'template', 'value')),
    # 'button': frozenset(('name', 'value', 'type', 'disabled', 'form', 'autofocus')),
    # 'select': frozenset(('name', 'size', 'multiple', 'disabled', 'data', 'accesskey',
    #                      'form', 'autofocus')),
    # 'optgroup': frozenset(('disabled', 'label', 'form', 'autofocus')),
    # 'option': frozenset(('selected', 'disabled', 'label', 'value', 'form', 'autofocus')),
    # 'textarea': frozenset(('name', 'rows', 'cols', 'disabled', 'readonly', 'required',
    #                        'form', 'autofocus', 'wrap', 'accept')),
    # 'label': frozenset(('for', 'accesskey', 'form')),
    # 'fieldset': frozenset(('disabled', 'form')),
    # 'output': frozenset(('form', 'name', 'for', 'onforminput', 'onformchange')),
    # 'datalist': frozenset(('data',)),
    # # XXX repetition model for repeating form controls
    'script': frozenset(('src', 'defer', 'async', 'type')),
    'noscript': _noAttrs,
    'noembed': _noAttrs,
    'event-source': frozenset(('src',)),
    'details': frozenset(('open',)),
    'datagrid': frozenset(('multiple', 'disabled')),
    'command': frozenset(('type', 'label', 'icon', 'hidden', 'disabled',
                          'checked', 'radiogroup', 'default')),
    'menu': frozenset(('type', 'label', 'autosubmit')),
    'datatemplate': _noAttrs,
    'rule': _noAttrs,
    'nest': _noAttrs,
    'legend': _noAttrs,
    'div': _noAttrs,
    'font': frozenset(('style',)),
}
80165

81166
# Element name -> attribute names that must be present on its start tag
# (consulted by checkStartTagRequiredAttributes).
requiredAttributeMap = dict(
    link=frozenset(('href', 'rel')),
    bdo=frozenset(('dir',)),
    img=frozenset(('src',)),
    embed=frozenset(('src',)),
    # XXX one of 'data' or 'type' is required
    object=frozenset(()),
    param=frozenset(('name', 'value')),
    source=frozenset(('src',)),
    map=frozenset(('id',)),
)
176+
177+
# Value of <input type=...> -> attribute names allowed for that type
# (consulted by validateStartTagInput).  Types whose whitelists are identical
# share a single frozenset.

# button-like controls: button, reset, remove, move-up, move-down
_inputButtonAttrs = frozenset((
    'accesskey', 'autofocus', 'disabled', 'form', 'name', 'tabindex',
    'value'))
# checkbox and radio
_inputToggleAttrs = frozenset((
    'accesskey', 'autofocus', 'checked', 'disabled', 'form', 'name',
    'required', 'tabindex', 'value'))
# date/time and numeric controls
_inputRangedAttrs = frozenset((
    'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form', 'list',
    'min', 'max', 'name', 'step', 'readonly', 'required', 'tabindex',
    'value'))
# email and url
_inputAddressAttrs = frozenset((
    'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
    'inputmode', 'list', 'maxlength', 'name', 'pattern', 'readonly',
    'required', 'tabindex', 'value'))

inputTypeAllowedAttributeMap = {
    'text': frozenset((
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'list', 'maxlength', 'name', 'pattern', 'readonly',
        'required', 'size', 'tabindex', 'value')),
    'password': frozenset((
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'maxlength', 'name', 'pattern', 'readonly', 'required',
        'size', 'tabindex', 'value')),
    'checkbox': _inputToggleAttrs,
    'radio': _inputToggleAttrs,
    'button': _inputButtonAttrs,
    'submit': frozenset((
        'accesskey', 'action', 'autofocus', 'disabled', 'enctype', 'form',
        'method', 'name', 'replace', 'tabindex', 'target', 'value')),
    'reset': _inputButtonAttrs,
    'add': frozenset((
        'accesskey', 'autofocus', 'disabled', 'form', 'name', 'tabindex',
        'template', 'value')),
    'remove': _inputButtonAttrs,
    'move-up': _inputButtonAttrs,
    'move-down': _inputButtonAttrs,
    'file': frozenset((
        'accept', 'accesskey', 'autofocus', 'disabled', 'form', 'min', 'max',
        'name', 'required', 'tabindex')),
    'hidden': frozenset(('disabled', 'form', 'name', 'value')),
    'image': frozenset((
        'accesskey', 'action', 'alt', 'autofocus', 'disabled', 'enctype',
        'form', 'method', 'name', 'replace', 'src', 'tabindex', 'target')),
    'datetime': _inputRangedAttrs,
    'datetime-local': _inputRangedAttrs,
    'date': _inputRangedAttrs,
    'month': _inputRangedAttrs,
    'week': _inputRangedAttrs,
    'time': _inputRangedAttrs,
    'number': _inputRangedAttrs,
    'range': _inputRangedAttrs,
    'email': _inputAddressAttrs,
    'url': _inputAddressAttrs,
}
91203

92204
class HTMLConformanceChecker(_base.Filter):
@@ -96,31 +208,76 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
96208

97209
def __iter__(self):
    """Yield each token of the wrapped stream, preceded by its ParseErrors.

    For every token a validator method is looked up by name on this object:
    first "validate<Type><Name>" (name capitalized, e.g.
    validateStartTagInput), then the generic "validate<Type>".  Whatever the
    chosen validator produces is yielded before the token itself; tokens
    with no matching validator pass through untouched.
    """
    for token in _base.Filter.__iter__(self):
        # "-" stands in for a missing type/name so the lookup simply fails.
        tokenType = token.get("type", "-")
        tokenName = token.get("name", "-").capitalize()
        validator = getattr(
            self, "validate%s%s" % (tokenType, tokenName), None)
        if not validator:
            # no element-specific validator; fall back to the per-type one
            validator = getattr(self, "validate%s" % (tokenType,), None)
        if validator:
            for error in validator(token) or []:
                yield error
        yield token
221+
222+
def validateStartTag(self, token):
    """Run the generic start-tag checks on token, yielding their errors.

    Checks run in order: unknown tag name, missing required attributes,
    unknown attributes.
    """
    checks = (
        self.checkUnknownStartTag,
        self.checkStartTagRequiredAttributes,
        self.checkStartTagUnknownAttributes,
    )
    for check in checks:
        for error in check(token) or []:
            yield error
226+
227+
def validateStartTagEmbed(self, token):
    """Yield errors for an <embed> start tag (required attributes only)."""
    # spec says "any attributes w/o namespace" are allowed on <embed>,
    # so checkStartTagUnknownAttributes is deliberately not called here
    missing = self.checkStartTagRequiredAttributes(token) or []
    for error in missing:
        yield error
231+
232+
def validateStartTagInput(self, token):
    """Yield ParseError tokens for problems with an <input> start tag.

    The "type" attribute (taken as "text" when absent) selects the per-type
    whitelist in inputTypeAllowedAttributeMap.  Yields:

    - "unknown-input-type" when the type is not a key of that map;
    - "unknown-attribute" for names outside allowedAttributeMap['input'];
    - "attribute-not-allowed-on-this-input-type" for names that are valid
      on <input> in general but not for this type.

    token -- StartTag token; token["data"] is iterated as (name, value)
             pairs.  Attribute names are lower-cased before checking.
    """
    attrDict = dict([(name.lower(), value) for name, value in token["data"]])
    inputType = attrDict.get("type", "text")
    if inputType not in inputTypeAllowedAttributeMap:
        # The "unknown-input-type" message interpolates %(inputType)s, so
        # that key has to be supplied; previously only "attrValue" was,
        # which cannot satisfy the template.  "attrValue" is still included
        # for any consumer that reads the old key.
        yield {"type": "ParseError",
               "data": "unknown-input-type",
               "datavars": {"inputType": inputType,
                            "attrValue": inputType}}
    # An unknown type gets an empty whitelist, so every attribute that is
    # otherwise valid on <input> is then reported as not allowed.
    allowedAttributes = inputTypeAllowedAttributeMap.get(inputType,
                                                         frozenset(()))
    # NOTE(review): globalAttributes is not consulted here, so e.g. class/id
    # on <input> are reported as unknown -- confirm whether that is intended.
    knownAttributes = allowedAttributeMap['input']
    for attrName in attrDict:
        if attrName not in knownAttributes:
            yield {"type": "ParseError",
                   "data": "unknown-attribute",
                   "datavars": {"tagName": "input",
                                "attributeName": attrName}}
        elif attrName not in allowedAttributes:
            yield {"type": "ParseError",
                   "data": "attribute-not-allowed-on-this-input-type",
                   "datavars": {"attributeName": attrName,
                                "inputType": inputType}}
251+
252+
def checkUnknownStartTag(self, token):
    """Yield an "unknown-start-tag" error unless the tag name is recognized.

    A name is recognized when its lower-cased form is a key of
    allowedAttributeMap.
    """
    tagName = token["name"].lower()
    if tagName in allowedAttributeMap:
        return
    yield {"type": "ParseError",
           "data": "unknown-start-tag",
           "datavars": {"tagName": tagName}}
259+
260+
def checkStartTagRequiredAttributes(self, token):
    """Yield a "missing-required-attribute" error per absent attribute.

    The required names come from requiredAttributeMap, keyed by the
    lower-cased tag name; elements without an entry produce nothing.

    token -- StartTag token; token["data"] is iterated as (name, value)
             pairs.
    """
    name = token["name"].lower()
    requiredAttributes = requiredAttributeMap.get(name)
    if not requiredAttributes:
        return
    # Lower-case the names before comparing, as the unknown-attribute and
    # <input> checks do, so a mixed-case attribute still counts as present.
    attrsPresent = frozenset([attrName.lower()
                              for attrName, attrValue in token["data"]])
    for attrName in requiredAttributes:
        if attrName not in attrsPresent:
            yield {"type": "ParseError",
                   "data": "missing-required-attribute",
                   "datavars": {"tagName": name,
                                "attributeName": attrName}}
272+
273+
def checkStartTagUnknownAttributes(self, token):
    """Yield an "unknown-attribute" error for each unrecognized attribute.

    An attribute is recognized when its lower-cased name is in
    globalAttributes or in the element's entry of allowedAttributeMap
    (elements without an entry only get the global set).  The error reports
    the attribute name as it appeared in the token.
    """
    tagName = token["name"].lower()
    elementSpecific = allowedAttributeMap.get(tagName, frozenset(()))
    recognized = globalAttributes | elementSpecific
    for attrName, _attrValue in token["data"]:
        if attrName.lower() in recognized:
            continue
        yield {"type": "ParseError",
               "data": "unknown-attribute",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
283+

0 commit comments

Comments
 (0)