diff --git a/README.rst b/README.rst
index 9e0a0f74..d0b5ecf7 100644
--- a/README.rst
+++ b/README.rst
@@ -1,3 +1,8 @@
+Cratejoy html5lib
+=================
+
+This Cratejoy fork of html5lib adds syntax checking for Jinja templates.
+
 html5lib
 ========
 
diff --git a/html5lib/constants.py b/html5lib/constants.py
index e7089846..5735d7b6 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -3085,7 +3085,15 @@
     "EndTag": 4,
     "EmptyTag": 5,
     "Comment": 6,
-    "ParseError": 7
+    "ParseError": 7,
+    "JinjaStatementStartTag": 8,
+    "JinjaStatementEndTag": 9,
+    "JinjaStatementTag": 10,
+    "JinjaVariableStartTag": 11,
+    "JinjaVariableEndTag": 12,
+    "JinjaVariable": 13,
+    "JinjaFilter": 14,
+    "JinjaPipe": 15
 }
 
 tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 5b9ce7d7..91a5ae7b 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import with_metaclass
 
+import logging
+
 import types
 
 from . import inputstream
@@ -20,6 +22,9 @@
 from .constants import adjustForeignAttributes as adjustForeignAttributesMap
 
 
+log = logging.getLogger(u"html5lib")
+
+
 def parse(doc, treebuilder="etree", encoding=None,
           namespaceHTMLElements=True):
     """Parse a string or file-like object into a tree"""
@@ -163,6 +168,7 @@ def mainLoop(self):
         ParseErrorToken = tokenTypes["ParseError"]
 
         for token in self.normalizedTokens():
+            #log.debug(u"Token {}".format(token))
             new_token = token
             while new_token is not None:
                 currentNode = self.tree.openElements[-1] if self.tree.openElements else None
@@ -175,7 +181,8 @@ def mainLoop(self):
                     self.parseError(new_token["data"], new_token.get("datavars", {}))
                     new_token = None
                 else:
-                    if (len(self.tree.openElements) == 0 or
+                    if (
+                        len(self.tree.openElements) == 0 or
                         currentNodeNamespace == self.tree.defaultNamespace or
                         (self.isMathMLTextIntegrationPoint(currentNode) and
                          ((type == StartTagToken and
@@ -475,6 +482,32 @@ def processCharacters(self, token):
     def processSpaceCharacters(self, token):
         self.tree.insertText(token["data"])
 
+    # Placeholder handlers for the Jinja token types; each is currently a no-op.
+    def processJinjaStatementStartTag(self, token):
+        pass
+
+    def processJinjaStatementEndTag(self, token):
+        pass
+
+    def processJinjaStatementTag(self, token):
+        pass
+
+    def processJinjaVariableStartTag(self, token):
+        pass
+
+    def processJinjaVariableEndTag(self, token):
+        pass
+
+    def processJinjaVariable(self, token):
+        pass
+
+    def processJinjaPipe(self, token):
+        pass
+
+    # NOTE(review): constants.tokenTypes calls this token "JinjaFilter"; confirm the intended name.
+    def processJinjaFilterTag(self, token):
+        pass
+
     def processStartTag(self, token):
         return self.startTagHandler[token["name"]](token)
 
diff --git a/html5lib/tests/test_jinja.py b/html5lib/tests/test_jinja.py
new file mode 100644
index 00000000..63dad144
--- /dev/null
+++ b/html5lib/tests/test_jinja.py
@@ -0,0 +1,59 @@
+import html5lib
+import unittest
+import logging
+
+log = logging.getLogger(__name__)
+
+
+def dump(tree, tabs=0):
+    """Log tag, child count, 'value' attribute and text of tree, then recurse into its children."""
+    log.debug(u"{}Tag '{}' - {} children - Value = {} - Text = {}".format(
+        "\t" * tabs, tree.tag, len(tree), tree.attrib.get('value'), tree.text))
+
+    for child in tree:
+        dump(child, tabs + 1)
+
+
+class JinjaTestCase(unittest.TestCase):
+    def setUp(self):
+        self.parser = html5lib.HTMLParser(strict=True, namespaceHTMLElements=False, tree=html5lib.treebuilders.getTreeBuilder("etree", fullTree=True))
+
+    def assertTree(self, root, spec):
+        """Assert that the children of root match the list of dicts in spec, recursing into 'children'."""
+        self.assertEqual(len(root), len(spec))
+
+        for child, spec_child in zip(root, spec):
+            self.assertEqual(child.tag, spec_child['tag'])
+
+            if 'text' in spec_child:
+                self.assertEqual(child.text, spec_child['text'])
+
+            if 'value' in spec_child:
+                self.assertEqual(child.attrib['value'], spec_child['value'])
+
+            if 'children' in spec_child:
+                self.assertTree(child, spec_child['children'])
+            else:
+                self.assertEqual(len(child), 0)
+
+            if 'attrs' in spec_child:
+                for k, v in spec_child['attrs'].items():
+                    self.assertIn(k, child.attrib)
+                    self.assertEqual(v, child.attrib[k])
+
+    def test_open_block(self):
+        html_string = """
+
+        """
+        tree = self.parser.parseFragment(html_string)
+        dump(tree)
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index 79774578..b1267cd5 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -5,6 +5,10 @@
 except NameError:
     pass
 
+
+import sys
+import logging
+
 from collections import deque
 
 from .constants import spaceCharacters
@@ -20,6 +24,8 @@
 
 entitiesTrie = Trie(entities)
 
+log = logging.getLogger(u"html5lib")
+
 
 class HTMLTokenizer(object):
     """ This class takes care of tokenizing HTML.
@@ -254,6 +260,8 @@ def dataState(self):
             self.state = self.entityDataState
         elif data == "<":
             self.state = self.tagOpenState
+        elif data == "{":
+            self.state = self.jinjaOpenState
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
@@ -282,6 +290,188 @@ def entityDataState(self):
         self.state = self.dataState
         return True
 
+    def jinjaOpenState(self):
+        """Dispatch on the character that follows a "{" read in the data state."""
+        data = self.stream.char()
+
+        if data == "{":
+            #self.currentToken = {
+                #"type": tokenTypes["JinjaVariableStartTag"],
+                #"name": "{{", "data": {},
+                #"selfClosing": False
+            #}
+
+            #self.tokenQueue.append(self.currentToken)
+
+            self.state = self.jinjaVariableState
+        elif data == "%":
+            #self.tokenQueue.append({
+                #"type": tokenTypes["JinjaStatementStartTag"],
+                #"name": "{%", "data": {},
+                #"selfClosing": False
+            #})
+
+            self.state = self.jinjaStatementState
+        elif data == "#":
+            #self.tokenQueue.append({
+                #"type": tokenTypes["JinjaStatementStartTag"],
+                #"name": "{%", "data": {},
+                #"selfClosing": False
+            #})
+
+            self.state = self.jinjaCommentState
+        else:
+            # Not a Jinja delimiter: emit the "{" as ordinary text and hand
+            # this character back to the data state.  Without this branch the
+            # "{" was dropped and the tokenizer stayed in this state, even at
+            # EOF.
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "{"})
+            self.stream.unget(data)
+            self.state = self.dataState
+
+        #self.state = self.dataState
+        return True
+
+    def jinjaStatementEndState(self):
+        """Handle the character after a "%" seen inside a {% ... %} statement."""
+        data = self.stream.char()
+
+        if data == "}":
+            #self.tokenQueue.append({
+                #"type": tokenTypes["JinjaStatementEndTag"],
+                #"name": "%}", "data": [],
+                #"selfClosing": False
+            #})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-jinja-statement-closing-tag-but-got-eof",
+                                    "datavars": {"data": data}})
+            self.state = self.dataState
+        else:
+            # Not the closing brace: reprocess this character as statement
+            # content so a doubled "%" (as in "%%}") can still close the tag.
+            self.stream.unget(data)
+            self.state = self.jinjaStatementState
+
+        #self.state = self.dataState
+        return True
+
+    def jinjaVariableEndState(self):
+        """Handle the character after a "}" seen inside a {{ ... }} expression."""
+        data = self.stream.char()
+
+        if data == "}":
+            #self.tokenQueue.append({
+                #"type": tokenTypes["JinjaVariableEndTag"],
+                #"name": "}}", "data": [],
+                #"selfClosing": False
+            #})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-jinja-variable-closing-tag-but-got-eof",
+                                    "datavars": {"data": data}})
+            self.state = self.dataState
+        else:
+            # Not the closing brace: reprocess this character as variable
+            # content (the statement state would wait for "%}" instead).
+            self.stream.unget(data)
+            self.state = self.jinjaVariableState
+
+        #self.state = self.dataState
+        return True
+
+    def jinjaCommentEndState(self):
+        """Handle the character after a "#" seen inside a {# ... #} comment."""
+        data = self.stream.char()
+
+        if data == "}":
+            #self.tokenQueue.append({
+                #"type": tokenTypes["JinjaVariableEndTag"],
+                #"name": "}}", "data": [],
+                #"selfClosing": False
+            #})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-jinja-comment-closing-tag-but-got-eof",
+                                    "datavars": {"data": data}})
+            self.state = self.dataState
+        else:
+            # Not the closing brace: reprocess this character as comment
+            # content (the statement state would wait for "%}" instead).
+            self.stream.unget(data)
+            self.state = self.jinjaCommentState
+
+        #self.state = self.dataState
+        return True
+
+    def jinjaStatementState(self):
+        """Consume the body of a {% ... %} statement up to the next "%"."""
+        data = self.stream.char()
+
+        if data == "%":
+            self.state = self.jinjaStatementEndState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "missing-jinja-closing-brace"})
+            self.state = self.dataState
+        else:
+            chars = self.stream.charsUntil(("%", "\u0000"))
+            #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data":
+                                    #data + chars})
+
+        return True
+
+    def jinjaCommentState(self):
+        """Consume the body of a {# ... #} comment up to the next "#"."""
+        data = self.stream.char()
+
+        #log.debug(u"Jinja comment state '{}'".format(data))
+
+        if data == "#":
+            self.state = self.jinjaCommentEndState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "missing-jinja-comment-closing-brace"})
+            self.state = self.dataState
+        else:
+            chars = self.stream.charsUntil(("#", "\u0000"))
+            #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data":
+                                    #data + chars})
+
+        return True
+
+    def jinjaVariableState(self):
+        """Consume the body of a {{ ... }} expression up to the next "}"."""
+        data = self.stream.char()
+
+        if data == "}":
+            self.state = self.jinjaVariableEndState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "missing-jinja-closing-brace"})
+            self.state = self.dataState
+        elif data in spaceCharacters:
+            # Skip spaces
+            pass
+        elif data == "|":
+            pass
+        # If this is the first token after the variable start tag
+        else:
+            chars = self.stream.charsUntil(frozenset(("}", "\u0000")) | spaceCharacters)
+            #self.currentToken = {"type": tokenTypes["JinjaFilterTag"], "data":
+                                 #data + chars}
+            #self.tokenQueue.append(self.currentToken)
+        #else:
+            #chars = self.stream.charsUntil(("}", "\u0000"))
+            #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data":
+                                    #data + chars})
+
+        return True
+
     def rcdataState(self):
         data = self.stream.char()
         if data == "&":
diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py
index 8b97cc11..6e5c2561 100644
--- a/html5lib/treebuilders/_base.py
+++ b/html5lib/treebuilders/_base.py
@@ -3,6 +3,11 @@
 
 from ..constants import scopingElements, tableInsertModeElements, namespaces
 
+import logging
+
+log = logging.getLogger("html5lib")
+
+
 # The scope markers are inserted when entering object elements,
 # marquees, table cells, and table captions, and are used to prevent formatting
 # from "leaking" into tables, object elements, and marquees.
@@ -269,6 +274,13 @@ def createElement(self, token):
         element.attributes = token["data"]
         return element
 
+    def createElementWithoutNamespace(self, token):
+        """Create an element from token without passing a namespace; it is not inserted anywhere"""
+        name = token["name"]
+        element = self.elementClass(name)
+        element.attributes = token["data"]
+        return element
+
     def _getInsertFromTable(self):
         return self._insertFromTable
 
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
index 2c8ed19f..5d68fcd8 100644
--- a/html5lib/treebuilders/etree.py
+++ b/html5lib/treebuilders/etree.py
@@ -3,6 +3,8 @@
 
 import re
 
+import logging
+
 from . import _base
 from .. import ihatexml
 from .. import constants
@@ -11,6 +13,8 @@
 
 tag_regexp = re.compile("{([^}]*)}(.*)")
 
+log = logging.getLogger("html5lib")
+
 
 def getETreeBuilder(ElementTreeImplementation, fullTree=False):
     ElementTree = ElementTreeImplementation