|
| 1 | +import sys |
| 2 | +import os |
| 3 | +import json |
| 4 | +import re |
| 5 | + |
| 6 | +import html5lib |
| 7 | +import support |
| 8 | +import test_parser |
| 9 | +import test_tokenizer |
| 10 | + |
# Single parser instance shared by every call to make_test.
p = html5lib.HTMLParser()

# Bound ``sub`` method: called as unnamespaceExpected(replacement, text).
# The pattern matches "<html NAME>" at the start of any line (after optional
# leading whitespace) and captures the whitespace and NAME as groups 1 and 2.
unnamespaceExpected = re.compile(r"^(\s*)<html (\S+)>", re.MULTILINE).sub
| 14 | + |
def main(out_path):
    """Generate tree-construction tests for every tokenizer test file.

    ``out_path`` must already exist. If it does not, an error message is
    written to stderr and the process exits with status 1. Otherwise each
    file returned by ``support.html5lib_test_files('tokenizer', '*.test')``
    is passed to ``run_file`` together with ``out_path``.
    """
    if not os.path.exists(out_path):
        # Terminate the message with a newline, like the other stderr
        # message in this file, so it does not run into following output.
        sys.stderr.write("Path %s does not exist\n" % out_path)
        sys.exit(1)

    for filename in support.html5lib_test_files('tokenizer', '*.test'):
        run_file(filename, out_path)
| 22 | + |
def run_file(filename, out_path):
    """Convert one tokenizer test file into a ``.dat`` file of parser tests.

    ``filename`` is loaded as JSON. For every entry under its ``"tests"``
    key, one test (built by ``make_test``) is written for each initial state
    equal to ``"Data state"``; any other initial state is skipped. Entries
    without an ``"initialStates"`` key are given ``["Data state"]``.

    The output is written to
    ``<out_path>/tokenizer_<name>.dat`` where ``<name>`` is the base name of
    ``filename`` without its extension. The output file is created even when
    no test is written to it.

    If ``filename`` cannot be parsed as JSON, a message is written to stderr
    and the function returns without creating an output file.
    """
    try:
        # Context manager so the input handle is closed even when parsing
        # fails (the bare file() call previously leaked it).
        with open(filename) as input_file:
            tests_data = json.load(input_file)
    except ValueError:
        sys.stderr.write("Failed to load %s\n" % filename)
        return

    name = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(out_path, "tokenizer_%s.dat" % name)

    # Context manager so the output is flushed and closed even if building or
    # writing a test raises part-way through.
    with open(output_path, "w") as output_file:
        if 'tests' in tests_data:
            for test_data in tests_data['tests']:
                # Stores the default on the test dict, as the original did.
                initial_states = test_data.setdefault("initialStates",
                                                      ["Data state"])
                for initial_state in initial_states:
                    if initial_state != "Data state":
                        # Other initial states are not supported yet.
                        continue
                    output_file.write(make_test(test_data))
| 45 | + |
def make_test(test_data):
    """Return the text of one parser test built from a tokenizer test.

    If ``test_data`` has a ``"doubleEscaped"`` key it is first passed through
    ``test_tokenizer.unescape_test``. The ``"input"`` string is parsed with
    the module-level parser ``p`` and the resulting tree is serialized,
    converted with ``test_parser.convertTreeDump``, run through the
    ``test_parser.attrlist`` / ``test_parser.sortattrs`` substitution, and
    finally has ``<html NAME>`` line prefixes rewritten to ``<NAME>``.

    The result has the sections ``#data`` (the UTF-8 encoded input),
    ``#errors`` (left empty) and ``#document`` (the UTF-8 encoded dump),
    joined with newlines and ending in a newline.
    """
    if 'doubleEscaped' in test_data:
        test_data = test_tokenizer.unescape_test(test_data)

    source = test_data["input"]
    # Encode before parsing, matching the original order of operations.
    encoded_source = source.encode("utf8")

    document = p.parse(source)
    dump = p.tree.testSerializer(document)
    dump = test_parser.convertTreeDump(dump)
    dump = test_parser.attrlist.sub(test_parser.sortattrs, dump)
    dump = unnamespaceExpected(r"\1<\2>", dump)

    sections = [
        "#data",
        encoded_source,
        "#errors",
        "#document",
        dump.encode("utf8"),
        "",  # makes the joined text end with a newline
    ]
    return "\n".join(sections)
| 62 | + |
if __name__ == "__main__":
    # The output directory is a required argument; report its absence with a
    # usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <output directory>\n" % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])
0 commit comments