From 6f4a282afff0307b0f2e51f15c4b45f4a7cce45a Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 12 Jan 2016 21:22:50 +0100 Subject: [PATCH 1/3] Remove the mockParser because I have no idea why we have it. --- html5lib/tests/mockParser.py | 41 ------------------------------------ 1 file changed, 41 deletions(-) delete mode 100644 html5lib/tests/mockParser.py diff --git a/html5lib/tests/mockParser.py b/html5lib/tests/mockParser.py deleted file mode 100644 index ef31527e..00000000 --- a/html5lib/tests/mockParser.py +++ /dev/null @@ -1,41 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals - -import sys -import os - -if __name__ == '__main__': - # Allow us to import from the src directory - os.chdir(os.path.split(os.path.abspath(__file__))[0]) - sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src"))) - -from html5lib.tokenizer import HTMLTokenizer - - -class HTMLParser(object): - """ Fake parser to test tokenizer output """ - def parse(self, stream, output=True): - tokenizer = HTMLTokenizer(stream) - for token in tokenizer: - if output: - print(token) - -if __name__ == "__main__": - x = HTMLParser() - if len(sys.argv) > 1: - if len(sys.argv) > 2: - import hotshot - import hotshot.stats - prof = hotshot.Profile('stats.prof') - prof.runcall(x.parse, sys.argv[1], False) - prof.close() - stats = hotshot.stats.load('stats.prof') - stats.strip_dirs() - stats.sort_stats('time') - stats.print_stats() - else: - x.parse(sys.argv[1]) - else: - print("""Usage: python mockParser.py filename [stats] - If stats is specified the hotshots profiler will run and output the - stats instead. - """) From f28c5acb9901d22bed7587aa8d58d76e94965aec Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 12 Jan 2016 21:23:50 +0100 Subject: [PATCH 2/3] We don't need Python performance tests. --- html5lib/tests/performance/concatenation.py | 36 --------------------- 1 file changed, 36 deletions(-) delete mode 100644 html5lib/tests/performance/concatenation.py diff --git a/html5lib/tests/performance/concatenation.py b/html5lib/tests/performance/concatenation.py deleted file mode 100644 index a1465036..00000000 --- a/html5lib/tests/performance/concatenation.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals - - -def f1(): - x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - x += y + z - - -def f2(): - x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - x = x + y + z - - -def f3(): - x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - x = "".join((x, y, z)) - - -def f4(): - x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ" - x = "%s%s%s" % (x, y, z) - -import timeit -for x in range(4): - statement = "f%s" % (x + 1) - t = timeit.Timer(statement, "from __main__ import " + statement) - r = t.repeat(3, 1000000) - print(r, min(r)) From 5e90af858c175133c34ee548271bddb3ca5ef245 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 12 Jan 2016 21:29:23 +0100 Subject: [PATCH 3/3] Make pep8 1.7 happy. --- html5lib/html5parser.py | 147 +++++++++++++------------- html5lib/serializer/htmlserializer.py | 4 +- html5lib/tests/test_serializer.py | 3 +- html5lib/tests/test_tokenizer.py | 4 +- html5lib/treebuilders/_base.py | 4 +- html5lib/treebuilders/dom.py | 4 +- html5lib/treewalkers/__init__.py | 4 +- html5lib/treewalkers/_base.py | 6 +- html5lib/treewalkers/genshistream.py | 4 +- 9 files changed, 89 insertions(+), 91 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index c2c30783..ae980c55 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -204,8 +204,8 @@ def mainLoop(self): elif type == DoctypeToken: new_token = phase.processDoctype(new_token) - if (type == StartTagToken and token["selfClosing"] - and not token["selfClosingAcknowledged"]): + if (type == StartTagToken and token["selfClosing"] and + not token["selfClosingAcknowledged"]): self.parseError("non-void-element-with-trailing-solidus", {"name": token["name"]}) @@ -517,77 +517,76 @@ def processDoctype(self, token): if publicId != "": publicId = publicId.translate(asciiUpper2Lower) - if (not correct or token["name"] != "html" - or publicId.startswith( - ("+//silmaril//dtd html pro v0r11 19970101//", - "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", - "-//as//dtd html 3.0 aswedit + extensions//", - "-//ietf//dtd html 2.0 level 1//", - "-//ietf//dtd html 2.0 level 2//", - "-//ietf//dtd html 2.0 strict level 1//", - "-//ietf//dtd html 2.0 strict level 2//", - "-//ietf//dtd html 2.0 strict//", - "-//ietf//dtd html 2.0//", - "-//ietf//dtd html 2.1e//", - "-//ietf//dtd html 3.0//", - "-//ietf//dtd html 3.2 final//", - "-//ietf//dtd html 3.2//", - "-//ietf//dtd html 3//", - "-//ietf//dtd html level 0//", - "-//ietf//dtd html level 1//", - "-//ietf//dtd html level 2//", - "-//ietf//dtd html level 3//", - "-//ietf//dtd html strict level 0//", - "-//ietf//dtd html strict level 1//", - "-//ietf//dtd html strict level 2//", - "-//ietf//dtd html strict level 3//", - "-//ietf//dtd html strict//", - "-//ietf//dtd html//", - "-//metrius//dtd metrius presentational//", - "-//microsoft//dtd internet explorer 2.0 html strict//", - "-//microsoft//dtd internet explorer 2.0 html//", - "-//microsoft//dtd internet explorer 2.0 tables//", - "-//microsoft//dtd internet explorer 3.0 html strict//", - "-//microsoft//dtd internet explorer 3.0 html//", - "-//microsoft//dtd internet explorer 3.0 tables//", - "-//netscape comm. corp.//dtd html//", - "-//netscape comm. corp.//dtd strict html//", - "-//o'reilly and associates//dtd html 2.0//", - "-//o'reilly and associates//dtd html extended 1.0//", - "-//o'reilly and associates//dtd html extended relaxed 1.0//", - "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", - "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", - "-//spyglass//dtd html 2.0 extended//", - "-//sq//dtd html 2.0 hotmetal + extensions//", - "-//sun microsystems corp.//dtd hotjava html//", - "-//sun microsystems corp.//dtd hotjava strict html//", - "-//w3c//dtd html 3 1995-03-24//", - "-//w3c//dtd html 3.2 draft//", - "-//w3c//dtd html 3.2 final//", - "-//w3c//dtd html 3.2//", - "-//w3c//dtd html 3.2s draft//", - "-//w3c//dtd html 4.0 frameset//", - "-//w3c//dtd html 4.0 transitional//", - "-//w3c//dtd html experimental 19960712//", - "-//w3c//dtd html experimental 970421//", - "-//w3c//dtd w3 html//", - "-//w3o//dtd w3 html 3.0//", - "-//webtechs//dtd mozilla html 2.0//", - "-//webtechs//dtd mozilla html//")) - or publicId in - ("-//w3o//dtd w3 html strict 3.0//en//", - "-/w3c/dtd html 4.0 transitional/en", - "html") - or publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is None - or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + if (not correct or token["name"] != "html" or + publicId.startswith( + ("+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//")) or + publicId in ("-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html") or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is None or + systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" elif (publicId.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", - "-//w3c//dtd xhtml 1.0 transitional//")) - or publicId.startswith( + "-//w3c//dtd xhtml 1.0 transitional//")) or + publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and systemId is not None): @@ -988,8 +987,8 @@ def processSpaceCharactersDropNewline(self, token): data = token["data"] self.processSpaceCharacters = self.processSpaceCharactersNonPre if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "listing", "textarea") - and not self.tree.openElements[-1].hasContent()): + self.tree.openElements[-1].name in ("pre", "listing", "textarea") and + not self.tree.openElements[-1].hasContent()): data = data[1:] if data: self.tree.reconstructActiveFormattingElements() @@ -1016,8 +1015,8 @@ def startTagProcessInHead(self, token): def startTagBody(self, token): self.parser.parseError("unexpected-start-tag", {"name": "body"}) - if (len(self.tree.openElements) == 1 - or self.tree.openElements[1].name != "body"): + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: self.parser.framesetOK = False diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index be4d6344..b87d9a75 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -237,8 +237,8 @@ def serialize(self, treewalker, encoding=None): yield self.encodeStrict(k) if not self.minimize_boolean_attributes or \ - (k not in booleanAttributes.get(name, tuple()) - and k not in booleanAttributes.get("", tuple())): + (k not in booleanAttributes.get(name, tuple()) and + k not in booleanAttributes.get("", tuple())): yield self.encodeStrict("=") if self.quote_attr_values or not v: quote_attr = True diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index 3c37feff..af76075e 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -91,8 +91,7 @@ def runSerializerTest(input, expected, options): encoding = options.get("encoding", None) if encoding: - encode = lambda x: x.encode(encoding) - expected = list(map(encode, expected)) + expected = list(map(lambda x: x.encode(encoding), expected)) result = serialize_html(input, options) if len(expected) == 1: diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 823c6ea6..87e098f3 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -98,8 +98,8 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, """ checkSelfClosing = False for token in expectedTokens: - if (token[0] == "StartTag" and len(token) == 4 - or token[0] == "EndTag" and len(token) == 3): + if (token[0] == "StartTag" and len(token) == 4 or + token[0] == "EndTag" and len(token) == 3): checkSelfClosing = True break diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py index 8b97cc11..8196f591 100644 --- a/html5lib/treebuilders/_base.py +++ b/html5lib/treebuilders/_base.py @@ -353,8 +353,8 @@ def getTableMisnestedNodePosition(self): def generateImpliedEndTags(self, exclude=None): name = self.openElements[-1].name # XXX td, th and tr are not actually needed - if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) - and name != exclude): + if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and + name != exclude): self.openElements.pop() # XXX This is not entirely what the specification says. We should # investigate it more closely. diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 234233b7..8656244f 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -47,8 +47,8 @@ def __init__(self, element): _base.Node.__init__(self, element.nodeName) self.element = element - namespace = property(lambda self: hasattr(self.element, "namespaceURI") - and self.element.namespaceURI or None) + namespace = property(lambda self: hasattr(self.element, "namespaceURI") and + self.element.namespaceURI or None) def appendChild(self, node): node.parent = self diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py index 21f46b01..00ae2804 100644 --- a/html5lib/treewalkers/__init__.py +++ b/html5lib/treewalkers/__init__.py @@ -10,11 +10,11 @@ from __future__ import absolute_import, division, unicode_literals -__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"] - from .. import constants from ..utils import default_etree +__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"] + treeWalkerCache = {} diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index 4e11cd02..e79a4357 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -1,11 +1,12 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type, string_types +from xml.dom import Node +from ..constants import voidElements, spaceCharacters + __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", "TreeWalker", "NonRecursiveTreeWalker"] -from xml.dom import Node - DOCUMENT = Node.DOCUMENT_NODE DOCTYPE = Node.DOCUMENT_TYPE_NODE TEXT = Node.TEXT_NODE @@ -14,7 +15,6 @@ ENTITY = Node.ENTITY_NODE UNKNOWN = "<#UNKNOWN#>" -from ..constants import voidElements, spaceCharacters spaceCharacters = "".join(spaceCharacters) diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index f559c45d..24d33282 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -39,8 +39,8 @@ def tokens(self, event, next): if namespace == namespaces["html"] and name in voidElements: for token in self.emptyTag(namespace, name, converted_attribs, - not next or next[0] != END - or next[1] != tag): + not next or next[0] != END or + next[1] != tag): yield token else: yield self.startTag(namespace, name, converted_attribs)