9
9
>>> p.parse('<!doctype html>\n <html foo=bar></html>')
10
10
<<class 'html5lib.treebuilders.simpletree.Document'> None>
11
11
>>> p.errors
12
- [((2, 14), 'unrecognized -attribute', {'attributeName': u'foo', 'tagName': u'html'})]
12
+ [((2, 14), 'unknown -attribute', {'attributeName': u'foo', 'tagName': u'html'})]
13
13
"""
14
14
15
+ try :
16
+ frozenset
17
+ except NameError :
18
+ # Import from the sets module for python 2.3
19
+ from sets import Set as set
20
+ from sets import ImmutableSet as frozenset
15
21
import _base
16
22
from html5lib .constants import E
17
23
from html5lib import tokenizer
18
24
import gettext
19
25
_ = gettext .gettext
20
26
21
27
# Register the conformance checker's messages in the message table imported
# from html5lib.constants.  The keys are the "data" codes carried by the
# ParseError tokens that HTMLConformanceChecker yields; the %(...)s
# placeholders are expected to name keys of that token's "datavars" dict.
E.update({
    "unknown-start-tag":
        # Fixed: the template used to end with a stray "'" instead of ">".
        _(u"Unknown start tag <%(tagName)s>"),
    "unknown-attribute":
        _(u"Unknown '%(attributeName)s' attribute on <%(tagName)s>"),
    "missing-required-attribute":
        _(u"Missing required '%(attributeName)s' attribute on <%(tagName)s>"),
    "unknown-input-type":
        _(u"Unknown value for input type: '%(inputType)s'"),
    "attribute-not-allowed-on-this-input-type":
        _(u"'%(attributeName)s' attribute is not allowed on <input type='%(inputType)s'>"),
})
27
39
28
# Attribute names that checkStartTagUnknownAttributes accepts on any element,
# in addition to the element-specific entries of allowedAttributeMap.
# XXX lang in HTML only, xml:lang in XHTML only
globalAttributes = frozenset("""
    class contenteditable contextmenu dir draggable id irrelevant lang ref
    tabindex template title
    onabort onbeforeunload onblur onchange onclick oncontextmenu ondblclick
    ondrag ondragend ondragenter ondragleave ondragover ondragstart ondrop
    onerror onfocus onkeydown onkeypress onkeyup onload onmessage
    onmousedown onmousemove onmouseout onmouseover onmouseup onmousewheel
    onresize onscroll onselect onsubmit onunload
""".split())
37
49
38
50
def _attrs(*names):
    """Return the immutable set of the given attribute names."""
    return frozenset(names)


# Maps every element name the checker recognises to the set of
# element-specific attributes it may carry.  checkUnknownStartTag reports a
# tag that is missing from this map; checkStartTagUnknownAttributes accepts
# these names plus globalAttributes.
allowedAttributeMap = {
    'html': _attrs('xmlns'),
    'head': _attrs(),
    'title': _attrs(),
    'base': _attrs('href', 'target'),
    'link': _attrs('href', 'rel', 'media', 'hreflang', 'type'),
    'meta': _attrs('name', 'http-equiv', 'content', 'charset'),  # XXX charset in HTML only
    'style': _attrs('media', 'type', 'scoped'),
    'body': _attrs(),
    'section': _attrs(),
    'nav': _attrs(),
    'article': _attrs(),
    'blockquote': _attrs('cite'),
    'aside': _attrs(),
    'h1': _attrs(),
    'h2': _attrs(),
    'h3': _attrs(),
    'h4': _attrs(),
    'h5': _attrs(),
    'h6': _attrs(),
    'header': _attrs(),
    'footer': _attrs(),
    'address': _attrs(),
    'p': _attrs(),
    'hr': _attrs(),
    'br': _attrs(),
    'dialog': _attrs(),
    'pre': _attrs(),
    'ol': _attrs('start'),
    'ul': _attrs(),
    'li': _attrs('value'),  # XXX depends on parent
    'dl': _attrs(),
    'dt': _attrs(),
    'dd': _attrs(),
    'a': _attrs('href', 'target', 'ping', 'rel', 'media', 'hreflang', 'type'),
    'q': _attrs('cite'),
    'cite': _attrs(),
    'em': _attrs(),
    'strong': _attrs(),
    'small': _attrs(),
    'm': _attrs(),
    'dfn': _attrs(),
    'abbr': _attrs(),
    'time': _attrs('datetime'),
    'meter': _attrs('value', 'min', 'low', 'high', 'max', 'optimum'),
    'progress': _attrs('value', 'max'),
    'code': _attrs(),
    'var': _attrs(),
    'samp': _attrs(),
    'kbd': _attrs(),
    'sup': _attrs(),
    'sub': _attrs(),
    'span': _attrs(),
    'i': _attrs(),
    'b': _attrs(),
    'bdo': _attrs(),
    'ins': _attrs('cite', 'datetime'),
    'del': _attrs('cite', 'datetime'),
    'figure': _attrs(),
    'img': _attrs('alt', 'src', 'usemap', 'ismap', 'height', 'width'),  # XXX ismap depends on parent
    'iframe': _attrs('src'),
    # <embed> handled separately
    'object': _attrs('data', 'type', 'usemap', 'height', 'width'),
    'param': _attrs('name', 'value'),
    'video': _attrs('src', 'autoplay', 'start', 'loopstart', 'loopend',
                    'end', 'loopcount', 'controls'),
    'audio': _attrs('src', 'autoplay', 'start', 'loopstart', 'loopend',
                    'end', 'loopcount', 'controls'),
    'source': _attrs('src', 'type', 'media'),
    'canvas': _attrs('height', 'width'),
    'map': _attrs(),
    'area': _attrs('alt', 'coords', 'shape', 'href', 'target', 'ping',
                   'rel', 'media', 'hreflang', 'type'),
    'table': _attrs(),
    'caption': _attrs(),
    'colgroup': _attrs('span'),  # XXX only if element contains no <col> elements
    'col': _attrs('span'),
    'tbody': _attrs(),
    'thead': _attrs(),
    'tfoot': _attrs(),
    'tr': _attrs(),
    'td': _attrs('colspan', 'rowspan'),
    'th': _attrs('colspan', 'rowspan', 'scope'),
    # 'form': frozenset(('action', 'method', 'enctype', 'accept', 'name', 'onsubmit',
    #     'onreset', 'accept-charset', 'data', 'replace')),
    # all possible <input> attributes are listed here but <input> is really handled separately
    'input': _attrs('accept', 'accesskey', 'action', 'alt', 'autocomplete',
                    'autofocus', 'checked', 'disabled', 'enctype', 'form',
                    'inputmode', 'list', 'maxlength', 'method', 'min', 'max',
                    'name', 'pattern', 'step', 'readonly', 'replace',
                    'required', 'size', 'src', 'tabindex', 'target',
                    'template', 'value'),
    # 'button': frozenset(('name', 'value', 'type', 'disabled', 'form', 'autofocus')),
    # 'select': frozenset(('name', 'size', 'multiple', 'disabled', 'data', 'accesskey',
    #     'form', 'autofocus')),
    # 'optgroup': frozenset(('disabled', 'label', 'form', 'autofocus')),
    # 'option': frozenset(('selected', 'disabled', 'label', 'value', 'form', 'autofocus')),
    # 'textarea': frozenset(('name', 'rows', 'cols', 'disabled', 'readonly', 'required',
    #     'form', 'autofocus', 'wrap', 'accept')),
    # 'label': frozenset(('for', 'accesskey', 'form')),
    # 'fieldset': frozenset(('disabled', 'form')),
    # 'output': frozenset(('form', 'name', 'for', 'onforminput', 'onformchange')),
    # 'datalist': frozenset(('data')),
    # # XXX repetition model for repeating form controls
    'script': _attrs('src', 'defer', 'async', 'type'),
    'noscript': _attrs(),
    'noembed': _attrs(),
    'event-source': _attrs('src'),
    'details': _attrs('open'),
    'datagrid': _attrs('multiple', 'disabled'),
    'command': _attrs('type', 'label', 'icon', 'hidden', 'disabled',
                      'checked', 'radiogroup', 'default'),
    'menu': _attrs('type', 'label', 'autosubmit'),
    'datatemplate': _attrs(),
    'rule': _attrs(),
    'nest': _attrs(),
    'legend': _attrs(),
    'div': _attrs(),
    'font': _attrs('style'),
}
80
165
81
166
# Maps an element name to the attributes that must be present on its start
# tag; checkStartTagRequiredAttributes reports each one that is missing.
# Elements that are not listed have no required attributes.
requiredAttributeMap = {
    'link': frozenset(['href', 'rel']),
    'bdo': frozenset(['dir']),
    'img': frozenset(['src']),
    'embed': frozenset(['src']),
    'object': frozenset([]),  # XXX one of 'data' or 'type' is required
    'param': frozenset(['name', 'value']),
    'source': frozenset(['src']),
    'map': frozenset(['id']),
}
176
+
177
# Attribute sets that are identical for several <input> types.  They are
# immutable, so the same object can safely be shared between map entries.
_inputToggleAttrs = frozenset((
    'accesskey', 'autofocus', 'checked', 'disabled', 'form', 'name',
    'required', 'tabindex', 'value'))
_inputPlainButtonAttrs = frozenset((
    'accesskey', 'autofocus', 'disabled', 'form', 'name', 'tabindex',
    'value'))
_inputRangedValueAttrs = frozenset((
    'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form', 'list',
    'min', 'max', 'name', 'step', 'readonly', 'required', 'tabindex',
    'value'))
_inputAddressAttrs = frozenset((
    'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
    'inputmode', 'list', 'maxlength', 'name', 'pattern', 'readonly',
    'required', 'tabindex', 'value'))

# Maps each known value of <input type=...> to the attributes allowed on an
# input of that type.  validateStartTagInput reports a type that is missing
# from this map as "unknown-input-type".
inputTypeAllowedAttributeMap = {
    'text': frozenset((
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'list', 'maxlength', 'name', 'pattern', 'readonly',
        'required', 'size', 'tabindex', 'value')),
    'password': frozenset((
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'maxlength', 'name', 'pattern', 'readonly', 'required',
        'size', 'tabindex', 'value')),
    'checkbox': _inputToggleAttrs,
    'radio': _inputToggleAttrs,
    'button': _inputPlainButtonAttrs,
    'submit': frozenset((
        'accesskey', 'action', 'autofocus', 'disabled', 'enctype', 'form',
        'method', 'name', 'replace', 'tabindex', 'target', 'value')),
    'reset': _inputPlainButtonAttrs,
    'add': frozenset((
        'accesskey', 'autofocus', 'disabled', 'form', 'name', 'tabindex',
        'template', 'value')),
    'remove': _inputPlainButtonAttrs,
    'move-up': _inputPlainButtonAttrs,
    'move-down': _inputPlainButtonAttrs,
    'file': frozenset((
        'accept', 'accesskey', 'autofocus', 'disabled', 'form', 'min', 'max',
        'name', 'required', 'tabindex')),
    'hidden': frozenset(('disabled', 'form', 'name', 'value')),
    'image': frozenset((
        'accesskey', 'action', 'alt', 'autofocus', 'disabled', 'enctype',
        'form', 'method', 'name', 'replace', 'src', 'tabindex', 'target')),
    'datetime': _inputRangedValueAttrs,
    'datetime-local': _inputRangedValueAttrs,
    'date': _inputRangedValueAttrs,
    'month': _inputRangedValueAttrs,
    'week': _inputRangedValueAttrs,
    'time': _inputRangedValueAttrs,
    'number': _inputRangedValueAttrs,
    'range': _inputRangedValueAttrs,
    'email': _inputAddressAttrs,
    'url': _inputAddressAttrs,
}
91
203
92
204
class HTMLConformanceChecker (_base .Filter ):
@@ -96,31 +208,76 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
96
208
97
209
def __iter__(self):
    """Yield the tokens of the underlying filter, preceded by check results.

    For each token produced by ``_base.Filter.__iter__`` a handler is looked
    up on ``self``: first ``validate<Type><Name>`` (the token name with only
    its first letter upper-cased, e.g. ``validateStartTagInput``), then the
    generic ``validate<Type>``.  Whatever the first handler found yields is
    emitted immediately before the token itself; a handler returning
    ``None`` contributes nothing.
    """
    for token in _base.Filter.__iter__(self):
        # "-" stands in for a missing type/name so the lookups below simply
        # find no handler.
        kind = token.get("type", "-")
        tagName = token.get("name", "-").capitalize()
        handler = getattr(self, "validate%s%s" % (kind, tagName), None)
        if not handler:
            handler = getattr(self, "validate%s" % (kind,), None)
        if handler:
            for problem in handler(token) or []:
                yield problem
        yield token
221
+
222
def validateStartTag(self, token):
    """Run the generic start-tag checks and yield everything they report.

    The checks run in a fixed order: unknown tag name, missing required
    attributes, then unknown attributes.  A check that returns ``None`` is
    treated as reporting nothing.
    """
    checks = (self.checkUnknownStartTag,
              self.checkStartTagRequiredAttributes,
              self.checkStartTagUnknownAttributes)
    for check in checks:
        for problem in check(token) or []:
            yield problem
226
+
227
def validateStartTagEmbed(self, token):
    """Yield the problems found on an <embed> start tag.

    Only the required-attribute check is run.
    """
    # spec says "any attributes w/o namespace"
    # so don't call checkStartTagUnknownAttributes
    missing = self.checkStartTagRequiredAttributes(token) or []
    for problem in missing:
        yield problem
231
+
232
def validateStartTagInput(self, token):
    """Check the attributes of an <input> start tag against its type.

    ``token["data"]`` is read as a sequence of (name, value) pairs; names
    are lower-cased before any comparison.  The input type is the value of
    the ``type`` attribute, or "text" when it is absent.

    Yields ParseError tokens with these "data" codes:

    * "unknown-input-type" when the type is not a key of
      inputTypeAllowedAttributeMap;
    * "unknown-attribute" for an attribute that is neither listed in
      allowedAttributeMap['input'] nor a global attribute;
    * "attribute-not-allowed-on-this-input-type" for an <input> attribute
      that is not allowed on this particular type.  For an unknown type
      this is reported for every <input>-specific attribute.
    """
    attrDict = dict([(name.lower(), value) for name, value in token["data"]])
    inputType = attrDict.get("type", "text")
    if inputType not in inputTypeAllowedAttributeMap:
        # The datavars key must be "inputType": that is the placeholder the
        # "unknown-input-type" message template uses.  The previous
        # "attrValue" key made formatting the message fail with KeyError.
        yield {"type": "ParseError",
               "data": "unknown-input-type",
               "datavars": {"inputType": inputType}}
    allowedAttributes = inputTypeAllowedAttributeMap.get(inputType,
                                                         frozenset(()))
    inputAttributes = allowedAttributeMap['input']
    for attrName in attrDict.keys():
        if attrName == "type":
            # The type attribute selects the attribute set and has already
            # been validated above; it is listed in none of the sets, so it
            # must not be reported as unknown.
            continue
        if attrName in inputAttributes:
            # Attributes in the <input> list (including tabindex and
            # template, which are also global) stay subject to the
            # per-type restriction.
            if attrName not in allowedAttributes:
                yield {"type": "ParseError",
                       "data": "attribute-not-allowed-on-this-input-type",
                       "datavars": {"attributeName": attrName,
                                    "inputType": inputType}}
        elif attrName not in globalAttributes:
            # Remaining global attributes (id, class, event handlers, ...)
            # are accepted on <input> as on any other element.
            yield {"type": "ParseError",
                   "data": "unknown-attribute",
                   "datavars": {"tagName": "input",
                                "attributeName": attrName}}
251
+
252
def checkUnknownStartTag(self, token):
    """Yield an "unknown-start-tag" ParseError for an unrecognised tag.

    A tag is recognised when its lower-cased name is a key of
    allowedAttributeMap; nothing is yielded in that case.
    """
    tagName = token["name"].lower()
    if tagName in allowedAttributeMap:
        return
    yield {"type": "ParseError",
           "data": "unknown-start-tag",
           "datavars": {"tagName": tagName}}
259
+
260
def checkStartTagRequiredAttributes(self, token):
    """Yield a ParseError for each required attribute the tag lacks.

    The required names come from requiredAttributeMap, keyed by the
    lower-cased tag name; tags without an entry (or with an empty one)
    yield nothing.  ``token["data"]`` is read as a sequence of
    (name, value) pairs.  Each missing attribute is reported as a
    "missing-required-attribute" ParseError whose datavars carry the tag
    name and the attribute name.
    """
    name = token["name"].lower()
    requiredAttributes = requiredAttributeMap.get(name)
    if not requiredAttributes:
        return
    # Lower-case the names that are present, as the other attribute checks
    # do, so that e.g. SRC satisfies a required 'src'.
    attrsPresent = frozenset([attrName.lower() for attrName, attrValue
                              in token["data"]])
    for attrName in requiredAttributes:
        if attrName not in attrsPresent:
            yield {"type": "ParseError",
                   "data": "missing-required-attribute",
                   "datavars": {"tagName": name,
                                "attributeName": attrName}}
272
+
273
def checkStartTagUnknownAttributes(self, token):
    """Yield an "unknown-attribute" ParseError per unrecognised attribute.

    An attribute is recognised when its lower-cased name is in
    globalAttributes or in the allowedAttributeMap entry for the lower-cased
    tag name (tags without an entry only get the global attributes).  The
    reported attribute name keeps the spelling found in the token.
    """
    tagName = token["name"].lower()
    recognised = globalAttributes | allowedAttributeMap.get(tagName,
                                                            frozenset(()))
    for attrName, attrValue in token["data"]:
        if attrName.lower() in recognised:
            continue
        yield {"type": "ParseError",
               "data": "unknown-attribute",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
283
+
0 commit comments