Skip to content

Commit c12357b

Browse files
author
James Graham
committed
Add file for converting the tokenizer tests to tree tests (assuming html5lib works with them correctly)
1 parent 55bfda0 commit c12357b

File tree

1 file changed

+64
-0
lines changed

1 file changed

+64
-0
lines changed

html5lib/tests/tokenizertotree.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import sys
2+
import os
3+
import json
4+
import re
5+
6+
import html5lib
7+
import support
8+
import test_parser
9+
import test_tokenizer
10+
11+
# Shared parser instance: make_test() uses it to parse each test input and
# to serialize the resulting tree through p.tree.testSerializer.
p = html5lib.HTMLParser()
12+
13+
# Bound ``sub`` of a multiline regex that matches, at the start of a line,
# optional whitespace followed by "<html NAME>".  make_test() calls it with
# r"\1<\2>" to drop the "html " prefix while keeping the whitespace and NAME.
unnamespaceExpected = re.compile(r"^(\s*)<html (\S+)>", re.M).sub
14+
15+
def main(out_path):
    """Convert every tokenizer test file into a tree-test file under out_path.

    out_path must already exist; if it does not, a message is written to
    stderr and the process exits with status 1.  Otherwise each file yielded
    by support.html5lib_test_files('tokenizer', '*.test') is passed to
    run_file together with out_path.
    """
    if not os.path.exists(out_path):
        # End the message with a newline, matching the "Failed to load"
        # message in run_file, so the shell prompt is not glued to it.
        sys.stderr.write("Path %s does not exist\n" % out_path)
        sys.exit(1)

    for filename in support.html5lib_test_files('tokenizer', '*.test'):
        run_file(filename, out_path)
22+
23+
def run_file(filename, out_path):
    """Write the tree-test counterpart of one tokenizer test file.

    filename is parsed as JSON.  If parsing raises ValueError, a message is
    written to stderr and nothing else happens.  Otherwise the file
    "tokenizer_<name>.dat" is created in out_path, where <name> is the base
    name of filename without its extension.  For every entry under the
    top-level "tests" key, one make_test() result is written per
    "Data state" entry in its "initialStates" list; entries without
    "initialStates" are treated as ["Data state"].  Other initial states are
    skipped.
    """
    try:
        # Use open() in a with-block: the file() builtin is Python-2-only and
        # the original never closed the handle.
        with open(filename) as source:
            tests_data = json.load(source)
    except ValueError:
        sys.stderr.write("Failed to load %s\n" % filename)
        return

    name = os.path.splitext(os.path.split(filename)[1])[0]
    out_name = os.path.join(out_path, "tokenizer_%s.dat" % name)

    # The with-block closes the output even if make_test() raises.  The file
    # is created even when there is no "tests" key, as before.
    with open(out_name, "w") as output_file:
        if 'tests' in tests_data:
            for test_data in tests_data['tests']:
                # setdefault stores the default on test_data, exactly as the
                # original explicit assignment did.
                initial_states = test_data.setdefault("initialStates",
                                                      ["Data state"])
                for initial_state in initial_states:
                    if initial_state != "Data state":
                        # don't support this yet
                        continue
                    output_file.write(make_test(test_data))
45+
46+
def make_test(test_data):
    """Render one tokenizer test as a tree-construction test entry.

    If test_data has a 'doubleEscaped' key it is first passed through
    test_tokenizer.unescape_test.  The result is a newline-joined string made
    of "#data", the UTF-8 encoded input, "#errors" (with no entries),
    "#document", the UTF-8 encoded tree dump produced by the module-level
    parser, and a final empty item so the entry ends with a newline.
    """
    if 'doubleEscaped' in test_data:
        test_data = test_tokenizer.unescape_test(test_data)

    source = test_data["input"]
    encoded_input = source.encode("utf8")

    # Parse, dump the tree, sort attributes, then strip the "html " prefix
    # from element names in the dump.
    document = p.parse(source)
    dump = test_parser.convertTreeDump(p.tree.testSerializer(document))
    dump = test_parser.attrlist.sub(test_parser.sortattrs, dump)
    dump = unnamespaceExpected(r"\1<\2>", dump)

    sections = [
        "#data",
        encoded_input,
        "#errors",
        "#document",
        dump.encode("utf8"),
        "",
    ]
    return "\n".join(sections)
62+
63+
if __name__ == "__main__":
    # The first command-line argument is the directory passed to main().
    # Without it, print a usage line instead of failing with an IndexError.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <output directory>\n" % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])

0 commit comments

Comments
 (0)