|
| 1 | +"""HTML 5 conformance checker |
| 2 | +
|
| 3 | +Warning: this module is experimental, incomplete, and subject to removal at any time. |
| 4 | +
|
| 5 | +Usage: |
| 6 | +>>> from html5lib.html5parser import HTMLParser |
| 7 | +>>> from html5lib.filters.validator import HTMLConformanceChecker |
| 8 | +>>> p = HTMLParser(tokenizer=HTMLConformanceChecker) |
| 9 | +>>> p.parse('<!doctype html>\n<html foo=bar></html>') |
| 10 | +<<class 'html5lib.treebuilders.simpletree.Document'> None> |
| 11 | +>>> p.errors |
| 12 | +[((2, 14), 'unrecognized-attribute', {'attributeName': u'foo', 'tagName': u'html'})] |
| 13 | +""" |
| 14 | + |
| 15 | +import _base |
| 16 | +from html5lib.constants import E |
| 17 | +from html5lib import tokenizer |
| 18 | +import gettext |
_ = gettext.gettext

# Register this module's error message in E (imported from html5lib.constants).
# The placeholder names must match the keys of the "datavars" dict carried by
# the ParseError tokens that HTMLConformanceChecker yields, i.e. "tagName" and
# "attributeName"; a mismatched name would presumably raise KeyError when the
# message is interpolated with those datavars.
E.update({
    "unrecognized-attribute":
        _(u"Unrecognized attribute '%(attributeName)s' in <%(tagName)s>"),
})
| 25 | + |
class HTMLConformanceChecker(_base.Filter):
    """Filter that adds conformance ParseError tokens to a token stream.

    The filter builds its own ``tokenizer.HTMLTokenizer`` from the constructor
    arguments and iterates over it, so every token of the wrapped tokenizer
    is passed through unchanged.
    """

    def __init__(self, stream, encoding, parseMeta, **kwargs):
        """Create the wrapped tokenizer and use it as this filter's source.

        All arguments are forwarded as-is to ``tokenizer.HTMLTokenizer``.
        """
        source = tokenizer.HTMLTokenizer(stream, encoding, parseMeta, **kwargs)
        _base.Filter.__init__(self, source)

    def __iter__(self):
        """Yield each source token, preceded by any errors detected for it.

        For a ``StartTag`` token whose lower-cased name is ``html``, one
        ``ParseError`` token with data ``"unrecognized-attribute"`` is
        yielded for every attribute whose lower-cased name is not ``xmlns``.
        The original token is always yielded afterwards.
        """
        for token in _base.Filter.__iter__(self):
            if token["type"] == "StartTag":
                tagName = token["name"].lower()
                if tagName == 'html':
                    for attrName, _attrValue in token["data"]:
                        # xmlns is the only attribute accepted on <html> here.
                        if attrName.lower() == 'xmlns':
                            continue
                        yield {
                            "type": "ParseError",
                            "data": "unrecognized-attribute",
                            "datavars": {
                                "tagName": tagName,
                                "attributeName": attrName,
                            },
                        }

            yield token
0 commit comments