From 25555a8f425df601ab069509188d46e583816d83 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 29 Jun 2014 20:19:31 -0500 Subject: [PATCH 1/8] Added in jinja parsing --- html5lib/constants.py | 8 +++- html5lib/html5parser.py | 41 +++++++++++++++++++ html5lib/tokenizer.py | 91 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index e7089846..fb24a32f 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3085,7 +3085,13 @@ "EndTag": 4, "EmptyTag": 5, "Comment": 6, - "ParseError": 7 + "ParseError": 7, + "JinjaStatementStartTag": 8, + "JinjaStatementEndTag": 9, + "JinjaStatementTag": 10, + "JinjaVariableStartTag": 11, + "JinjaVariableEndTag": 12, + "JinjaVariableTag": 13 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 5b9ce7d7..6c441bfe 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,6 +1,8 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass +import logging + import types from . import inputstream @@ -20,6 +22,9 @@ from .constants import adjustForeignAttributes as adjustForeignAttributesMap +log = logging.getLogger(u"html5lib") + + def parse(doc, treebuilder="etree", encoding=None, namespaceHTMLElements=True): """Parse a string or file-like object into a tree""" @@ -161,6 +166,12 @@ def mainLoop(self): CommentToken = tokenTypes["Comment"] DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] + JinjaStatementStartTag = tokenTypes["JinjaStatementStartTag"] + JinjaStatementEndTag = tokenTypes["JinjaStatementEndTag"] + JinjaStatementTag = tokenTypes["JinjaStatementTag"] + JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] + JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] + JinjaVariableTag = tokenTypes["JinjaVariableTag"] for token in self.normalizedTokens(): new_token = token @@ -202,6 +213,18 @@ def mainLoop(self): new_token = phase.processComment(new_token) elif type == DoctypeToken: new_token = phase.processDoctype(new_token) + elif type == JinjaStatementStartTag: + new_token = phase.processJinjaStatementStartTag(new_token) + elif type == JinjaStatementEndTag: + new_token = phase.processJinjaStatementEndTag(new_token) + elif type == JinjaStatementTag: + new_token = phase.processJinjaStatementTag(new_token) + elif type == JinjaVariableStartTag: + new_token = phase.processJinjaVariableStartTag(new_token) + elif type == JinjaVariableEndTag: + new_token = phase.processJinjaVariableEndTag(new_token) + elif type == JinjaVariableTag: + new_token = phase.processJinjaVariableTag(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -475,6 +498,24 @@ def processCharacters(self, token): def processSpaceCharacters(self, token): self.tree.insertText(token["data"]) + def processJinjaStatementStartTag(self, token): + pass + + def processJinjaStatementEndTag(self, token): + pass + + def processJinjaStatementTag(self, token): + pass + + def processJinjaVariableStartTag(self, token): + pass + + def processJinjaVariableEndTag(self, token): + pass + + def processJinjaVariableTag(self, token): + pass + def processStartTag(self, token): return self.startTagHandler[token["name"]](token) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 79774578..6498e7ba 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -5,6 +5,10 @@ except NameError: pass + +import sys +import logging + from collections import deque from .constants import spaceCharacters @@ -20,6 +24,8 @@ entitiesTrie = Trie(entities) +log = logging.getLogger(u"html5lib") + class HTMLTokenizer(object): """ This class takes care of tokenizing HTML. @@ -254,6 +260,8 @@ def dataState(self): self.state = self.entityDataState elif data == "<": self.state = self.tagOpenState + elif data == "{": + self.state = self.jinjaOpenState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) @@ -282,6 +290,89 @@ def entityDataState(self): self.state = self.dataState return True + def jinjaOpenState(self): + data = self.stream.char() + + if data == "{": + self.tokenQueue.append({ + "type": tokenTypes["JinjaVariableStartTag"], + "name": "{{", "data": [], + "selfClosing": False + }) + + self.state = self.jinjaVariableState + elif data == "%": + self.tokenQueue.append({ + "type": tokenTypes["JinjaStatementStartTag"], + "name": "{%", "data": [], + "selfClosing": False + }) + + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + + def jinjaStatementEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + self.tokenQueue.append({ + "type": tokenTypes["JinjaStatementEndTag"], + "name": "%}", "data": [], + "selfClosing": False + }) + self.state = self.dataState + + #self.state = self.dataState + return True + + def jinjaVariableEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + self.tokenQueue.append({ + "type": tokenTypes["JinjaVariableEndTag"], + "name": "}}", "data": [], + "selfClosing": False + }) + self.state = self.dataState + + #self.state = self.dataState + return True + + def jinjaStatementState(self): + data = self.stream.char() + + if data == "%": + self.state = self.jinjaStatementEndState + elif data is EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil(("%", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + data + chars}) + + return True + + def jinjaVariableState(self): + data = self.stream.char() + + if data == "}": + self.state = self.jinjaVariableEndState + elif data is EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil(("}", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + data + chars}) + + return True + def rcdataState(self): data = self.stream.char() if data == "&": From fdde76416be476af83e10b18ac938dd6d431bfde Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 29 Jun 2014 20:51:38 -0500 Subject: [PATCH 2/8] Syntax errors in jinja --- html5lib/html5parser.py | 9 +++++++++ html5lib/tokenizer.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 6c441bfe..dbd7dd5b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -176,6 +176,7 @@ def mainLoop(self): for token in self.normalizedTokens(): new_token = token while new_token is not None: + log.debug(u"Token {} Phase = {}".format(new_token, self.phase)) currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None @@ -421,6 +422,7 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break + log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -438,6 +440,7 @@ def parseRCDataRawtext(self, token, contentType): self.originalPhase = self.phase + log.debug(u"Changing phase to text") self.phase = self.phases["text"] @@ -825,6 +828,8 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s" % node.name + log = logging.getLogger(u"html5lib") + log.debug(u"Switching phase to afterHead") self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): @@ -835,6 +840,8 @@ def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): + log = logging.getLogger(u"html5lib") + log.debug(u"Implied end head tag") self.endTagHead(impliedTagToken("head")) # XXX If we implement a parser for which scripting is disabled we need to @@ -905,6 +912,8 @@ def endTagOther(self, token): def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) + log = logging.getLogger(u"html5lib") + log.debug(u"Changing phase to body") self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 6498e7ba..9d458f67 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -324,6 +324,17 @@ def jinjaStatementEndState(self): "selfClosing": False }) self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-statement-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-statement-closing-tag-but-got-char", + "datavars": {"data": data}}) + self.stream.unget(data) + self.state = self.bogusCommentState #self.state = self.dataState return True @@ -339,6 +350,17 @@ def jinjaVariableEndState(self): "selfClosing": False }) self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-variable-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-variable-closing-tag-but-got-char", + "datavars": {"data": data}}) + self.stream.unget(data) + self.state = self.bogusCommentState #self.state = self.dataState return True @@ -349,8 +371,9 @@ def jinjaStatementState(self): if data == "%": self.state = self.jinjaStatementEndState elif data is EOF: - # Tokenization ends. - return False + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-jinja-statement"}) + self.state = self.dataState else: chars = self.stream.charsUntil(("%", "\u0000")) self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": @@ -364,8 +387,9 @@ def jinjaVariableState(self): if data == "}": self.state = self.jinjaVariableEndState elif data is EOF: - # Tokenization ends. - return False + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-jinja-variable"}) + self.state = self.dataState else: chars = self.stream.charsUntil(("}", "\u0000")) self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": From 53a0132c900575c45907a4082f3149b358641014 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 29 Jun 2014 21:04:10 -0500 Subject: [PATCH 3/8] Readme --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index 9e0a0f74..d0b5ecf7 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ +Cratejoy html5lib +================= + +Cratejoy fork of html5lib adds syntax checking for jinja templates + html5lib ======== From 5246944d9f68e15567682604b1159d23fbe0e44e Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Thu, 3 Jul 2014 17:46:02 -0500 Subject: [PATCH 4/8] WIP - Jinja parsing --- html5lib/constants.py | 3 +- html5lib/html5parser.py | 55 ++++++++++++++++++++++++++++++---- html5lib/tokenizer.py | 44 ++++++++++++++++++++++----- html5lib/treebuilders/_base.py | 12 ++++++++ html5lib/treebuilders/etree.py | 5 ++++ 5 files changed, 105 insertions(+), 14 deletions(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index fb24a32f..e9e8fab8 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3091,7 +3091,8 @@ "JinjaStatementTag": 10, "JinjaVariableStartTag": 11, "JinjaVariableEndTag": 12, - "JinjaVariableTag": 13 + "JinjaVariable": 13, + "JinjaFilter": 14 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index dbd7dd5b..40e4dd7e 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -171,7 +171,8 @@ def mainLoop(self): JinjaStatementTag = tokenTypes["JinjaStatementTag"] JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] - JinjaVariableTag = tokenTypes["JinjaVariableTag"] + JinjaVariable = tokenTypes["JinjaVariable"] + JinjaFilter = tokenTypes["JinjaFilter"] for token in self.normalizedTokens(): new_token = token @@ -187,7 +188,11 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if (len(self.tree.openElements) == 0 or + if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter): + log.debug(u"Type is a jinja tag") + phase = self.phases["inJinjaVariable"] + elif ( + len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and ((type == StartTagToken and @@ -224,8 +229,10 @@ def mainLoop(self): new_token = phase.processJinjaVariableStartTag(new_token) elif type == JinjaVariableEndTag: new_token = phase.processJinjaVariableEndTag(new_token) - elif type == JinjaVariableTag: - new_token = phase.processJinjaVariableTag(new_token) + elif type == JinjaVariable: + new_token = phase.processJinjaVariable(new_token) + elif type == JinjaFilter: + new_token = phase.processJinjaFilter(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -516,7 +523,10 @@ def processJinjaVariableStartTag(self, token): def processJinjaVariableEndTag(self, token): pass - def processJinjaVariableTag(self, token): + def processJinjaVariable(self, token): + pass + + def processJinjaFilterTag(self, token): pass def processStartTag(self, token): @@ -535,6 +545,40 @@ def startTagHtml(self, token): def processEndTag(self, token): return self.endTagHandler[token["name"]](token) + class InJinjaVariablePhase(Phase): + def processJinjaVariableStartTag(self, token): + log = logging.getLogger('html5lib') + log.debug(u"InJinja: Start Tag") + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + + def processJinjaVariableEndTag(self, token): + log = logging.getLogger('html5lib') + log.debug(u"InJinja: End Tag {}".format(token["name"])) + for node in self.tree.openElements[::-1]: + log.debug(u"InJinja: Open tag {} token {}".format(node, token)) + if node.name == token["name"]: + self.tree.generateImpliedEndTags(exclude=token["name"]) + log.debug(u"InJinja: Implied end tag {} {}".format(self.tree.openElements[-1].name, token["name"])) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + while self.tree.openElements.pop() != node: + pass + break + else: + if node.nameTuple in specialElements: + log.debug(u"Nametuple {} in {}".format(node.nameTuple, specialElements)) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + break + + def processJinjaVariable(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + def processJinjaFilter(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + class InitialPhase(Phase): def processSpaceCharacters(self, token): pass @@ -2739,6 +2783,7 @@ def processEndTag(self, token): "inHead": InHeadPhase, # XXX "inHeadNoscript": InHeadNoScriptPhase, "afterHead": AfterHeadPhase, + "inJinjaVariable": InJinjaVariablePhase, "inBody": InBodyPhase, "text": TextPhase, "inTable": InTablePhase, diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 9d458f67..4670d260 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -294,17 +294,21 @@ def jinjaOpenState(self): data = self.stream.char() if data == "{": - self.tokenQueue.append({ + self.currentToken = { "type": tokenTypes["JinjaVariableStartTag"], - "name": "{{", "data": [], + "name": u"jinjavariabletag", "data": {}, + "namespace": None, "selfClosing": False - }) + } + + self.tokenQueue.append(self.currentToken) self.state = self.jinjaVariableState elif data == "%": self.tokenQueue.append({ "type": tokenTypes["JinjaStatementStartTag"], - "name": "{%", "data": [], + "name": "{%", "data": {}, + "namespace": None, "selfClosing": False }) @@ -346,7 +350,7 @@ def jinjaVariableEndState(self): if data == "}": self.tokenQueue.append({ "type": tokenTypes["JinjaVariableEndTag"], - "name": "}}", "data": [], + "name": u"jinjavariabletag", "data": [], "selfClosing": False }) self.state = self.dataState @@ -390,10 +394,34 @@ def jinjaVariableState(self): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-jinja-variable"}) self.state = self.dataState + elif data in spaceCharacters: + # Skip spaces + pass + elif data == "|": + pass + # If this is the first token after the variable start tag + elif self.currentToken['type'] == tokenTypes["JinjaVariableStartTag"]: + #log.debug(u"Got start tag {}".format(("|", "}", "\u0000") | spaceCharacters)) + + chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + self.currentToken = {"type": tokenTypes["JinjaVariable"], + "name": "jinjavariable", "selfClosing": True, "data": { + "value": data + chars, + "position": self.stream.position(), + }} + self.tokenQueue.append(self.currentToken) else: - chars = self.stream.charsUntil(("}", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": - data + chars}) + chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + self.currentToken = {"type": tokenTypes["JinjaFilter"], + "name": "jinjafilter", "selfClosing": True, "data": { + "value": data + chars, + "position": self.stream.position(), + }} + self.tokenQueue.append(self.currentToken) + #else: + #chars = self.stream.charsUntil(("}", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + #data + chars}) return True diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py index 8b97cc11..6e5c2561 100644 --- a/html5lib/treebuilders/_base.py +++ b/html5lib/treebuilders/_base.py @@ -3,6 +3,11 @@ from ..constants import scopingElements, tableInsertModeElements, namespaces +import logging + +log = logging.getLogger("html5lib") + + # The scope markers are inserted when entering object elements, # marquees, table cells, and table captions, and are used to prevent formatting # from "leaking" into tables, object elements, and marquees. @@ -269,6 +274,13 @@ def createElement(self, token): element.attributes = token["data"] return element + def createElementWithoutNamespace(self, token): + """Create an element but don't insert it anywhere""" + name = token["name"] + element = self.elementClass(name) + element.attributes = token["data"] + return element + def _getInsertFromTable(self): return self._insertFromTable diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 2c8ed19f..03d51275 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -3,6 +3,8 @@ import re +import logging + from . import _base from .. import ihatexml from .. import constants @@ -11,6 +13,8 @@ tag_regexp = re.compile("{([^}]*)}(.*)") +log = logging.getLogger("html5lib") + def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementation @@ -59,6 +63,7 @@ def _getAttributes(self): return self._element.attrib def _setAttributes(self, attributes): + log.debug(u"Attributes {}".format(attributes)) # Delete existing attributes first # XXX - there may be a better way to do this... for key in list(self._element.attrib.keys()): From f756cab03476ddb0e128ab08477e9c3a1e8b94b0 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Fri, 4 Jul 2014 09:06:48 -0500 Subject: [PATCH 5/8] WIP - Jinja parsing --- html5lib/constants.py | 3 ++- html5lib/html5parser.py | 14 ++++++++++++-- html5lib/tokenizer.py | 24 ++++++++++++++++-------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index e9e8fab8..5735d7b6 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3092,7 +3092,8 @@ "JinjaVariableStartTag": 11, "JinjaVariableEndTag": 12, "JinjaVariable": 13, - "JinjaFilter": 14 + "JinjaFilter": 14, + "JinjaPipe": 15 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 40e4dd7e..9d836e16 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -172,6 +172,7 @@ def mainLoop(self): JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] JinjaVariable = tokenTypes["JinjaVariable"] + JinjaPipe = tokenTypes["JinjaPipe"] JinjaFilter = tokenTypes["JinjaFilter"] for token in self.normalizedTokens(): @@ -188,7 +189,7 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter): + if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter, JinjaPipe): log.debug(u"Type is a jinja tag") phase = self.phases["inJinjaVariable"] elif ( @@ -231,6 +232,8 @@ def mainLoop(self): new_token = phase.processJinjaVariableEndTag(new_token) elif type == JinjaVariable: new_token = phase.processJinjaVariable(new_token) + elif type == JinjaPipe: + new_token = phase.processJinjaPipe(new_token) elif type == JinjaFilter: new_token = phase.processJinjaFilter(new_token) @@ -429,7 +432,7 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break - log.debug(u"Changing phase to {}".format(new_phase)) + #log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -526,6 +529,9 @@ def processJinjaVariableEndTag(self, token): def processJinjaVariable(self, token): pass + def processJinjaPipe(self, token): + pass + def processJinjaFilterTag(self, token): pass @@ -575,6 +581,10 @@ def processJinjaVariable(self, token): element = self.tree.createElementWithoutNamespace(token) self.tree.openElements[-1].appendChild(element) + def processJinjaPipe(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + def processJinjaFilter(self, token): element = self.tree.createElementWithoutNamespace(token) self.tree.openElements[-1].appendChild(element) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 4670d260..09e705ff 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -390,6 +390,8 @@ def jinjaVariableState(self): if data == "}": self.state = self.jinjaVariableEndState + #elif data == "(": + #self.state = self.jinjaArgState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-jinja-variable"}) @@ -398,30 +400,36 @@ def jinjaVariableState(self): # Skip spaces pass elif data == "|": - pass + self.currentToken = {"type": tokenTypes["JinjaPipe"], + "name": "jinjapipe", "selfClosing": True, "data": { + "value": data, + "position": self.stream.position(), + }} + self.tokenQueue.append(self.currentToken) # If this is the first token after the variable start tag elif self.currentToken['type'] == tokenTypes["JinjaVariableStartTag"]: #log.debug(u"Got start tag {}".format(("|", "}", "\u0000") | spaceCharacters)) - chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) self.currentToken = {"type": tokenTypes["JinjaVariable"], "name": "jinjavariable", "selfClosing": True, "data": { "value": data + chars, "position": self.stream.position(), }} self.tokenQueue.append(self.currentToken) - else: - chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + elif self.currentToken['type'] == tokenTypes["JinjaPipe"]: + chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) self.currentToken = {"type": tokenTypes["JinjaFilter"], "name": "jinjafilter", "selfClosing": True, "data": { "value": data + chars, "position": self.stream.position(), }} self.tokenQueue.append(self.currentToken) - #else: - #chars = self.stream.charsUntil(("}", "\u0000")) - #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": - #data + chars}) + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-pipe-got-character"}) + self.stream.unget(data) + self.state = self.bogusCommentState return True From 2e6aaf62f183f4891579f1c72dfea665f9fc2d01 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Fri, 4 Jul 2014 09:37:58 -0500 Subject: [PATCH 6/8] Removed unecessary work around jinja, now we just toss stuff to make html parsing work --- html5lib/html5parser.py | 77 +----------------------- html5lib/tokenizer.py | 103 ++++++++++++--------------------- html5lib/treebuilders/etree.py | 1 - 3 files changed, 37 insertions(+), 144 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 9d836e16..300b2737 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -166,19 +166,10 @@ def mainLoop(self): CommentToken = tokenTypes["Comment"] DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] - JinjaStatementStartTag = tokenTypes["JinjaStatementStartTag"] - JinjaStatementEndTag = tokenTypes["JinjaStatementEndTag"] - JinjaStatementTag = tokenTypes["JinjaStatementTag"] - JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] - JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] - JinjaVariable = tokenTypes["JinjaVariable"] - JinjaPipe = tokenTypes["JinjaPipe"] - JinjaFilter = tokenTypes["JinjaFilter"] for token in self.normalizedTokens(): new_token = token while new_token is not None: - log.debug(u"Token {} Phase = {}".format(new_token, self.phase)) currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None @@ -189,10 +180,7 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter, JinjaPipe): - log.debug(u"Type is a jinja tag") - phase = self.phases["inJinjaVariable"] - elif ( + if ( len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and @@ -220,22 +208,6 @@ def mainLoop(self): new_token = phase.processComment(new_token) elif type == DoctypeToken: new_token = phase.processDoctype(new_token) - elif type == JinjaStatementStartTag: - new_token = phase.processJinjaStatementStartTag(new_token) - elif type == JinjaStatementEndTag: - new_token = phase.processJinjaStatementEndTag(new_token) - elif type == JinjaStatementTag: - new_token = phase.processJinjaStatementTag(new_token) - elif type == JinjaVariableStartTag: - new_token = phase.processJinjaVariableStartTag(new_token) - elif type == JinjaVariableEndTag: - new_token = phase.processJinjaVariableEndTag(new_token) - elif type == JinjaVariable: - new_token = phase.processJinjaVariable(new_token) - elif type == JinjaPipe: - new_token = phase.processJinjaPipe(new_token) - elif type == JinjaFilter: - new_token = phase.processJinjaFilter(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -432,7 +404,6 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break - #log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -450,7 +421,6 @@ def parseRCDataRawtext(self, token, contentType): self.originalPhase = self.phase - log.debug(u"Changing phase to text") self.phase = self.phases["text"] @@ -551,44 +521,6 @@ def startTagHtml(self, token): def processEndTag(self, token): return self.endTagHandler[token["name"]](token) - class InJinjaVariablePhase(Phase): - def processJinjaVariableStartTag(self, token): - log = logging.getLogger('html5lib') - log.debug(u"InJinja: Start Tag") - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - - def processJinjaVariableEndTag(self, token): - log = logging.getLogger('html5lib') - log.debug(u"InJinja: End Tag {}".format(token["name"])) - for node in self.tree.openElements[::-1]: - log.debug(u"InJinja: Open tag {} token {}".format(node, token)) - if node.name == token["name"]: - self.tree.generateImpliedEndTags(exclude=token["name"]) - log.debug(u"InJinja: Implied end tag {} {}".format(self.tree.openElements[-1].name, token["name"])) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - while self.tree.openElements.pop() != node: - pass - break - else: - if node.nameTuple in specialElements: - log.debug(u"Nametuple {} in {}".format(node.nameTuple, specialElements)) - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - break - - def processJinjaVariable(self, token): - element = self.tree.createElementWithoutNamespace(token) - self.tree.openElements[-1].appendChild(element) - - def processJinjaPipe(self, token): - element = self.tree.createElementWithoutNamespace(token) - self.tree.openElements[-1].appendChild(element) - - def processJinjaFilter(self, token): - element = self.tree.createElementWithoutNamespace(token) - self.tree.openElements[-1].appendChild(element) - class InitialPhase(Phase): def processSpaceCharacters(self, token): pass @@ -882,8 +814,6 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s" % node.name - log = logging.getLogger(u"html5lib") - log.debug(u"Switching phase to afterHead") self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): @@ -894,8 +824,6 @@ def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): - log = logging.getLogger(u"html5lib") - log.debug(u"Implied end head tag") self.endTagHead(impliedTagToken("head")) # XXX If we implement a parser for which scripting is disabled we need to @@ -966,8 +894,6 @@ def endTagOther(self, token): def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) - log = logging.getLogger(u"html5lib") - log.debug(u"Changing phase to body") self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True @@ -2793,7 +2719,6 @@ def processEndTag(self, token): "inHead": InHeadPhase, # XXX "inHeadNoscript": InHeadNoScriptPhase, "afterHead": AfterHeadPhase, - "inJinjaVariable": InJinjaVariablePhase, "inBody": InBodyPhase, "text": TextPhase, "inTable": InTablePhase, diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 09e705ff..425c4d92 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -294,23 +294,21 @@ def jinjaOpenState(self): data = self.stream.char() if data == "{": - self.currentToken = { - "type": tokenTypes["JinjaVariableStartTag"], - "name": u"jinjavariabletag", "data": {}, - "namespace": None, - "selfClosing": False - } + #self.currentToken = { + #"type": tokenTypes["JinjaVariableStartTag"], + #"name": "{{", "data": {}, + #"selfClosing": False + #} - self.tokenQueue.append(self.currentToken) + #self.tokenQueue.append(self.currentToken) self.state = self.jinjaVariableState elif data == "%": - self.tokenQueue.append({ - "type": tokenTypes["JinjaStatementStartTag"], - "name": "{%", "data": {}, - "namespace": None, - "selfClosing": False - }) + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementStartTag"], + #"name": "{%", "data": {}, + #"selfClosing": False + #}) self.state = self.jinjaStatementState @@ -322,11 +320,11 @@ def jinjaStatementEndState(self): data = self.stream.char() if data == "}": - self.tokenQueue.append({ - "type": tokenTypes["JinjaStatementEndTag"], - "name": "%}", "data": [], - "selfClosing": False - }) + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementEndTag"], + #"name": "%}", "data": [], + #"selfClosing": False + #}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -334,11 +332,7 @@ def jinjaStatementEndState(self): "datavars": {"data": data}}) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-jinja-statement-closing-tag-but-got-char", - "datavars": {"data": data}}) - self.stream.unget(data) - self.state = self.bogusCommentState + self.state = self.jinjaStatementState #self.state = self.dataState return True @@ -348,11 +342,11 @@ def jinjaVariableEndState(self): data = self.stream.char() if data == "}": - self.tokenQueue.append({ - "type": tokenTypes["JinjaVariableEndTag"], - "name": u"jinjavariabletag", "data": [], - "selfClosing": False - }) + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaVariableEndTag"], + #"name": "}}", "data": [], + #"selfClosing": False + #}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -360,11 +354,7 @@ def jinjaVariableEndState(self): "datavars": {"data": data}}) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-jinja-variable-closing-tag-but-got-char", - "datavars": {"data": data}}) - self.stream.unget(data) - self.state = self.bogusCommentState + self.state = self.jinjaStatementState #self.state = self.dataState return True @@ -376,12 +366,12 @@ def jinjaStatementState(self): self.state = self.jinjaStatementEndState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-jinja-statement"}) + "missing-jinja-closing-brace"}) self.state = self.dataState else: chars = self.stream.charsUntil(("%", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": - data + chars}) + #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + #data + chars}) return True @@ -390,46 +380,25 @@ def jinjaVariableState(self): if data == "}": self.state = self.jinjaVariableEndState - #elif data == "(": - #self.state = self.jinjaArgState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-jinja-variable"}) + "missing-jinja-closing-brace"}) self.state = self.dataState elif data in spaceCharacters: # Skip spaces pass elif data == "|": - self.currentToken = {"type": tokenTypes["JinjaPipe"], - "name": "jinjapipe", "selfClosing": True, "data": { - "value": data, - "position": self.stream.position(), - }} - self.tokenQueue.append(self.currentToken) + pass # If this is the first token after the variable start tag - elif self.currentToken['type'] == tokenTypes["JinjaVariableStartTag"]: - #log.debug(u"Got start tag {}".format(("|", "}", "\u0000") | spaceCharacters)) - - chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) - self.currentToken = {"type": tokenTypes["JinjaVariable"], - "name": "jinjavariable", "selfClosing": True, "data": { - "value": data + chars, - "position": self.stream.position(), - }} - self.tokenQueue.append(self.currentToken) - elif self.currentToken['type'] == tokenTypes["JinjaPipe"]: - chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) - self.currentToken = {"type": tokenTypes["JinjaFilter"], - "name": "jinjafilter", "selfClosing": True, "data": { - "value": data + chars, - "position": self.stream.position(), - }} - self.tokenQueue.append(self.currentToken) else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-jinja-pipe-got-character"}) - self.stream.unget(data) - self.state = self.bogusCommentState + chars = self.stream.charsUntil(frozenset(("}", "\u0000")) | spaceCharacters) + #self.currentToken = {"type": tokenTypes["JinjaFilterTag"], "data": + #data + chars} + #self.tokenQueue.append(self.currentToken) + #else: + #chars = self.stream.charsUntil(("}", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + #data + chars}) return True diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 03d51275..5d68fcd8 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -63,7 +63,6 @@ def _getAttributes(self): return self._element.attrib def _setAttributes(self, attributes): - log.debug(u"Attributes {}".format(attributes)) # Delete existing attributes first # XXX - there may be a better way to do this... for key in list(self._element.attrib.keys()): From 913aa1f2f544bb497010c1fbfc6394f275b94ce5 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 28 Dec 2014 19:51:41 -0600 Subject: [PATCH 7/8] WIP --- html5lib/html5parser.py | 1 + html5lib/tests/test_jinja.py | 57 ++++++++++++++++++++++++++++++++++++ html5lib/tokenizer.py | 48 ++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 html5lib/tests/test_jinja.py diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 300b2737..91a5ae7b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -168,6 +168,7 @@ def mainLoop(self): ParseErrorToken = tokenTypes["ParseError"] for token in self.normalizedTokens(): + #log.debug(u"Token {}".format(token)) new_token = token while new_token is not None: currentNode = self.tree.openElements[-1] if self.tree.openElements else None diff --git a/html5lib/tests/test_jinja.py b/html5lib/tests/test_jinja.py new file mode 100644 index 00000000..63dad144 --- /dev/null +++ b/html5lib/tests/test_jinja.py @@ -0,0 +1,57 @@ +import html5lib +import unittest +import logging + +log = logging.getLogger(__name__) + + +def dump(tree, tabs=0): + log.debug(u"{}Tag '{}' - {} children - Value = {} - Text = {}".format( + "".join(["\t" for i in range(tabs)]), tree.tag, len(tree), tree.attrib['value'] if 'value' in tree.attrib else None, tree.text)) + + for child in tree: + dump(child, tabs + 1) + + +class JinjaTestCase(unittest.TestCase): + def setUp(self): + self.parser = html5lib.HTMLParser(strict=True, namespaceHTMLElements=False, tree=html5lib.treebuilders.getTreeBuilder("etree", fullTree=True)) + + def assertTree(self, root, spec): + self.assertEqual(len(root), len(spec)) + + for child, spec_child in zip(root, spec): + self.assertEqual(child.tag, spec_child['tag']) + + if 'text' in spec_child: + self.assertEqual(child.text, spec_child['text']) + + if 'value' in spec_child: + self.assertEqual(child.attrib['value'], spec_child['value']) + + if 'children' in spec_child: + self.assertTree(child, spec_child['children']) + else: + self.assertEqual(len(child), 0) + + if 'attrs' in spec_child: + for k, v in spec_child['attrs'].iteritems(): + self.assertIn(k, child.attrib) + self.assertEqual(v, child.attrib[k]) + + def test_open_block(self): + html_string = """ + + """ + tree = self.parser.parseFragment(html_string) + dump(tree) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 425c4d92..35a5f718 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -311,6 +311,14 @@ def jinjaOpenState(self): #}) self.state = self.jinjaStatementState + elif data == "#": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementStartTag"], + #"name": "{%", "data": {}, + #"selfClosing": False + #}) + + self.state = self.jinjaCommentState #self.state = self.dataState return True @@ -359,6 +367,28 @@ def jinjaVariableEndState(self): #self.state = self.dataState return True + def jinjaCommentEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaVariableEndTag"], + #"name": "}}", "data": [], + #"selfClosing": False + #}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-comment-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + def jinjaStatementState(self): data = self.stream.char() @@ -375,6 +405,24 @@ def jinjaStatementState(self): return True + def jinjaCommentState(self): + data = self.stream.char() + + log.debug(u"Jinja comment state '{}'".format(data)) + + if data == "#": + self.state = self.jinjaCommentEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "missing-jinja-comment-closing-brace"}) + self.state = self.dataState + else: + chars = self.stream.charsUntil(("#", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + #data + chars}) + + return True + def jinjaVariableState(self): data = self.stream.char() From 92134ee97e3d1776a7f08f88063c6f6ce1e3a7d4 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 28 Dec 2014 20:04:12 -0600 Subject: [PATCH 8/8] WIP --- html5lib/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 35a5f718..b1267cd5 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -408,7 +408,7 @@ def jinjaStatementState(self): def jinjaCommentState(self): data = self.stream.char() - log.debug(u"Jinja comment state '{}'".format(data)) + #log.debug(u"Jinja comment state '{}'".format(data)) if data == "#": self.state = self.jinjaCommentEndState