Skip to content

Commit 8d18c09

Browse files
author
Mark Pilgrim
committed
initial checkin of conformance checker wrapper
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40964
1 parent 88c2483 commit 8d18c09

File tree

2 files changed

+84
-5
lines changed

2 files changed

+84
-5
lines changed

src/html5lib/filters/validator.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,31 @@
2020

2121
# Registers the conformance checker's message template in the error table E
# under the code "unrecognized-attribute".  The text is passed through _()
# (defined outside this hunk; presumably a gettext-style translation hook).
E.update({
2222
"unrecognized-attribute":
23-
# Removed line: its %(attrName)s placeholder does not match any key of the
# datavars dict yielded by HTMLConformanceChecker.__iter__ ("tagName" and
# "attributeName").
_(u"Unrecognized attribute '%(attrName)s' in <%(tagName)s>"),
23+
# Added line: the placeholder is now %(attributeName)s, matching the
# "attributeName" datavars key.
_(u"Unrecognized attribute '%(attributeName)s' in <%(tagName)s>"),
2424
})
2525

26+
# Attribute names shared between element allow-lists; at present they are only
# merged into the 'html' entry below.
globalAttributes = ['id', 'title', 'lang', 'dir', 'class', 'irrelevant']
27+
# Maps a lowercased start-tag name to the list of (lowercase) attribute names
# HTMLConformanceChecker accepts on that tag.  Tags without an entry are not
# checked at all.
allowedAttributeMap = {
28+
'html': globalAttributes + ['xmlns']
29+
}
30+
2631
# Token filter layered on an HTMLTokenizer.  While passing tokens through, it
# yields an extra "unrecognized-attribute" ParseError token for each start-tag
# attribute that is not in the tag's allow-list.
# NOTE: this span is a diff rendering; lines following an "NN-" marker are the
# removed (pre-commit) code and lines following an "NN+" marker are the added
# (post-commit) code.
class HTMLConformanceChecker(_base.Filter):
2732
# Builds an HTMLTokenizer from the given arguments and passes it to
# _base.Filter.__init__ (presumably as the token source being wrapped).
def __init__(self, stream, encoding, parseMeta, **kwargs):
28-
_base.Filter.__init__(self, tokenizer.HTMLTokenizer(stream, encoding, parseMeta, **kwargs))
33+
_base.Filter.__init__(self, tokenizer.HTMLTokenizer(
34+
stream, encoding, parseMeta, **kwargs))
2935

3036
# Yields every token produced by _base.Filter.__iter__ unchanged.  For a
# "StartTag" token whose lowercased name has an entry in allowedAttributeMap,
# one ParseError token per disallowed attribute is yielded *before* the start
# tag itself.
def __iter__(self):
3137
for token in _base.Filter.__iter__(self):
3238
# NOTE: `type` shadows the builtin of the same name inside this method.
type = token["type"]
3339
if type == "StartTag":
3440
name = token["name"].lower()
35-
# Removed: the check was hard-coded to the <html> element.
if name == 'html':
41+
# Added: any tag listed in allowedAttributeMap is checked against its list.
if name in allowedAttributeMap.keys():
42+
allowedAttributes = allowedAttributeMap[name]
3643
# token["data"] is iterated as (name, value) pairs; the value is not used.
for attrName, attrValue in token["data"]:
37-
# Removed: 'xmlns' was the only attribute accepted.
if attrName.lower() != 'xmlns':
38-
yield {"type": "ParseError", "data": "unrecognized-attribute", "datavars": {"tagName": name, "attributeName": attrName}}
44+
# Added: the comparison is case-insensitive on the attribute name.
if attrName.lower() not in allowedAttributes:
45+
# The error carries the lowercased tag name and the attribute name exactly
# as it appeared in the token (not lowercased).
yield {"type": "ParseError",
46+
"data": "unrecognized-attribute",
47+
"datavars": {"tagName": name,
48+
"attributeName": attrName}}
3949

4050
# The original token always follows any errors reported for it.
yield token

validate.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env python
2+
"""usage: %prog [options] url-or-filename
3+
4+
Validate an HTML5 document using a non-schema-based conformance checker"""
5+
#RELEASE move ./examples/
6+
7+
import sys
8+
import os
9+
from optparse import OptionParser
10+
11+
#RELEASE remove
12+
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
13+
#END RELEASE
14+
from html5lib import html5parser#, liberalxmlparser
15+
from html5lib import treebuilders
16+
from html5lib import constants
17+
from html5lib.filters import validator
18+
19+
def parse():
    """Validate the document named by the last command-line argument.

    The argument is treated as a URL when it starts with ``http://``, as
    standard input when it is ``-``, and as a file-system path otherwise.
    For a URL, the ``charset`` parameter of the Content-Type response
    header (if present) is passed to the parser as the encoding.  The
    parsed document and the parser, which holds the collected errors, are
    handed to printOutput().

    Exits with status 1, after writing a message to stderr, when no
    argument is given or when the URL or file cannot be opened.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    encoding = None

    try:
        source = args[-1]
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    if source.startswith('http://'):
        # Imported lazily: only needed for remote documents.
        import urllib
        import cgi
        try:
            f = urllib.urlopen(source)
        except IOError:
            # Previously swallowed, which made the URL text itself get parsed
            # as the document.  sys.exc_info() keeps this compatible with
            # both old and new "except" syntax.
            sys.stderr.write("Unable to open URL %s: %s\n"
                             % (source, sys.exc_info()[1]))
            sys.exit(1)
        try:
            contentType = f.headers.get('content-type')
            if contentType:
                mediaType, params = cgi.parse_header(contentType)
                encoding = params.get('charset')
        except Exception:
            # Charset sniffing is best-effort: on any failure leave the
            # encoding unset rather than abort the validation.
            encoding = None
    elif source == '-':
        f = sys.stdin
    else:
        try:
            f = open(source)
        except IOError:
            # Previously swallowed, which made the file *name* get parsed as
            # the document.
            sys.stderr.write("Unable to open file %s: %s\n"
                             % (source, sys.exc_info()[1]))
            sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder("simpleTree")

    # An XML mode (liberalxmlparser.XHTMLParser selected by an opts.xml flag)
    # is not wired up yet, so the HTML parser is always used.
    p = html5parser.HTMLParser(tree=treebuilder,
                               tokenizer=validator.HTMLConformanceChecker)

    try:
        document = p.parse(f, encoding=encoding)
    finally:
        # Release the URL/file handle we opened; stdin is not ours to close.
        if f is not sys.stdin:
            f.close()
    printOutput(p, document, opts)
57+
58+
def printOutput(parser, document, opts):
    """Write the errors collected by *parser* to standard output.

    Each entry of ``parser.errors`` is unpacked as ``(position, code,
    datavars)``.  ``position`` fills the ``Line %i Col %i`` prefix, ``code``
    selects a message template from ``constants.E`` (falling back to an
    "Unknown error" text), and the template is %-formatted with
    ``datavars``.

    *document* and *opts* are accepted but not used here.
    """
    errList = []
    for pos, errorcode, datavars in parser.errors:
        location = "Line %i Col %i" % pos
        template = constants.E.get(errorcode,
                                   'Unknown error "%s"' % errorcode)
        errList.append(location + " " + template % datavars)
    sys.stdout.write("\nValidation errors:\n" + "\n".join(errList) + "\n")
63+
64+
def getOptParser():
    """Return the command-line option parser.

    The parser defines no options of its own; its usage text is this
    module's docstring.
    """
    return OptionParser(usage=__doc__)
67+
68+
# Run the validator only when this file is executed as a script.
if __name__ == "__main__":
    parse()

0 commit comments

Comments
 (0)