From 4a87368b71090f1432df6302f178c4babfcec93f Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 10 Jan 2024 15:13:40 +0100 Subject: [PATCH 1/3] GitHub Actions: python: ["3.8", "3.9", "3.10", "3.11", "3.12"] (#575) --- .appveyor.yml | 30 +++++++++++++++++------------ .github/workflows/python-tox.yml | 19 ++++++++---------- html5lib/tests/tokenizer.py | 4 +++- html5lib/tests/tree_construction.py | 4 +++- requirements-test.txt | 3 ++- 5 files changed, 34 insertions(+), 26 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 7661aa63..e6f7bf48 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,23 +1,29 @@ -image: Visual Studio 2019 +# appveyor.yml - https://www.appveyor.com/docs/lang/python +# https://www.appveyor.com/docs/windows-images-software/#visual-studio-2022 +--- +image: Visual Studio 2022 environment: - global: - PATH: "C:\\Python27\\Scripts\\;%PATH%" matrix: - - TOXENV: py27-base - - TOXENV: py27-optional - - TOXENV: py35-base - - TOXENV: py35-optional - - TOXENV: py36-base - - TOXENV: py36-optional + - PY_PYTHON: 2.7 + TOXENV: py27-base + - PY_PYTHON: 2.7 + TOXENV: py27-optional + - PY_PYTHON: 3.7 + TOXENV: py37-base + - PY_PYTHON: 3.7 + TOXENV: py37-optional install: - git submodule update --init --recursive - - python -m pip install tox + - py --list + - py -VV + - py -m pip install --upgrade pip + - py -m pip install tox build: off test_script: - - tox + - py -m tox after_test: - - python debug-info.py + - py debug-info.py diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml index cfcc42e6..5ed83175 100644 --- a/.github/workflows/python-tox.yml +++ b/.github/workflows/python-tox.yml @@ -7,30 +7,27 @@ jobs: strategy: fail-fast: false matrix: - # 2.7, 3.5, and 3.6 run on Windows via AppVeyor - python: ["3.7", "3.8", "3.9", "3.10", "3.11"] + # 2.7 and 3.7 run on Windows via AppVeyor + python: ["3.8", "3.9", "3.10", "3.11", "3.12"] os: [ubuntu-latest, windows-latest] deps: [base, optional] include: - python: "pypy-2.7" os: ubuntu-latest deps: base - - python: "pypy-3.8" + - python: "pypy-3.10" os: ubuntu-latest deps: base - - python: "2.7" - os: ubuntu-latest - deps: oldest - - python: "3.7" + - python: "3.8" os: ubuntu-latest deps: oldest runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - if: ${{ matrix.deps == 'base' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: pip @@ -38,7 +35,7 @@ jobs: requirements.txt requirements-test.txt - if: ${{ matrix.deps == 'optional' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: pip @@ -47,7 +44,7 @@ jobs: requirements-optional.txt requirements-test.txt - if: ${{ matrix.deps == 'oldest' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: pip diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index cc9897a4..b49d2e6e 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -246,7 +246,9 @@ def runtest(self): def repr_failure(self, excinfo): traceback = excinfo.traceback ntraceback = traceback.cut(path=__file__) - excinfo.traceback = ntraceback.filter() + pytest_ver = getattr(pytest, "version_tuple", ()) + filter_args = (excinfo,) if pytest_ver >= (7, 4, 0) else () + excinfo.traceback = ntraceback.filter(*filter_args) return excinfo.getrepr(funcargs=True, showlocals=False, diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py index fb0657bf..363b48c2 100644 --- a/html5lib/tests/tree_construction.py +++ b/html5lib/tests/tree_construction.py @@ -135,7 +135,9 @@ def runtest(self): def repr_failure(self, excinfo): traceback = excinfo.traceback ntraceback = traceback.cut(path=__file__) - excinfo.traceback = ntraceback.filter() + pytest_ver = getattr(pytest, "version_tuple", ()) + filter_args = (excinfo,) if pytest_ver >= (7, 4, 0) else () + excinfo.traceback = ntraceback.filter(*filter_args) return excinfo.getrepr(funcargs=True, showlocals=False, diff --git a/requirements-test.txt b/requirements-test.txt index 27866e59..39913ee4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -3,8 +3,9 @@ flake8==3.9.2 ; python_version < '3.6' flake8>=5.0.4; python_version >= '3.6' pytest>=4.6.10,<5 ; python_version < '3' -pytest>=5.4.2,<7 ; python_version >= '3' +pytest>=5.4.2,<8 ; python_version >= '3' coverage>=5.1,<6 pytest-expect>=1.1.0,<2 mock>=3.0.5,<4 ; python_version < '3.6' mock>=4.0.2,<5 ; python_version >= '3.6' +setuptools; python_version >= '3.12' From 82c2599585a6119e5afd26e58e754972c79f6734 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Wed, 10 Jan 2024 13:31:09 -0500 Subject: [PATCH 2/3] tests: drop dependency on external mock module for newer python (#574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #541 Co-authored-by: Ɓukasz Langa --- html5lib/tests/test_meta.py | 5 ++++- requirements-test.txt | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/html5lib/tests/test_meta.py b/html5lib/tests/test_meta.py index dd02dd7f..e02268aa 100644 --- a/html5lib/tests/test_meta.py +++ b/html5lib/tests/test_meta.py @@ -1,7 +1,10 @@ from __future__ import absolute_import, division, unicode_literals import six -from mock import Mock +try: + from unittest.mock import Mock +except ImportError: + from mock import Mock from . import support diff --git a/requirements-test.txt b/requirements-test.txt index 39913ee4..aca31f5e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,6 +6,5 @@ pytest>=4.6.10,<5 ; python_version < '3' pytest>=5.4.2,<8 ; python_version >= '3' coverage>=5.1,<6 pytest-expect>=1.1.0,<2 -mock>=3.0.5,<4 ; python_version < '3.6' -mock>=4.0.2,<5 ; python_version >= '3.6' +mock>=3.0.5,<4 ; python_version < '3.3' setuptools; python_version >= '3.12' From fd4f032bc090d44fb11a84b352dad7cbee0a4745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Wed, 21 Feb 2024 16:31:38 +0100 Subject: [PATCH 3/3] Constant phases (#567) * Get rid of getPhases This added a fair bit of complexity, and notable made the Phase classes dynamically generated. However, by doing this, we no longer include "process the token using the rules for" phases in the debug log. Co-authored-by: Sam Sneddon --- html5lib/_utils.py | 12 - html5lib/html5parser.py | 4349 ++++++++++++++++---------------- html5lib/tests/test_parser2.py | 1 - 3 files changed, 2172 insertions(+), 2190 deletions(-) diff --git a/html5lib/_utils.py b/html5lib/_utils.py index 9ea57942..7e23ee57 100644 --- a/html5lib/_utils.py +++ b/html5lib/_utils.py @@ -145,15 +145,3 @@ def moduleFactory(baseModule, *args, **kwargs): return mod return moduleFactory - - -def memoize(func): - cache = {} - - def wrapped(*args, **kwargs): - key = (tuple(args), tuple(kwargs.items())) - if key not in cache: - cache[key] = func(*args, **kwargs) - return cache[key] - - return wrapped diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 4c2d4c75..b3c206d1 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,7 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass, viewkeys - -import types +from six import viewkeys from . import _inputstream from . import _tokenizer @@ -13,7 +11,7 @@ from .constants import ( spaceCharacters, asciiUpper2Lower, specialElements, headingElements, cdataElements, rcdataElements, - tokenTypes, tagTokenTypes, + tokenTypes, namespaces, htmlIntegrationPointElements, mathmlTextIntegrationPointElements, adjustForeignAttributes as adjustForeignAttributesMap, @@ -71,18 +69,6 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen return p.parseFragment(doc, container=container, **kwargs) -def method_decorator_metaclass(function): - class Decorated(type): - def __new__(meta, classname, bases, classDict): - for attributeName, attribute in classDict.items(): - if isinstance(attribute, types.FunctionType): - attribute = function(attribute) - - classDict[attributeName] = attribute - return type.__new__(meta, classname, bases, classDict) - return Decorated - - class HTMLParser(object): """HTML parser @@ -112,6 +98,7 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa # Raise an exception on the first error encountered self.strict = strict + self.debug = debug if tree is None: tree = treebuilders.getTreeBuilder("etree") @@ -122,7 +109,7 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa self.errors = [] self.phases = {name: cls(self, self.tree) for name, cls in - getPhases(debug).items()} + _phases.items()} def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): @@ -204,6 +191,9 @@ def mainLoop(self): DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] + type_names = {value: key for key, value in tokenTypes.items()} + debug = self.debug + for token in self.tokenizer: prev_token = None new_token = token @@ -235,6 +225,17 @@ def mainLoop(self): else: phase = self.phases["inForeignContent"] + if debug: + info = {"type": type_names[type]} + if type in (StartTagToken, EndTagToken): + info["name"] = new_token['name'] + + self.log.append((self.tokenizer.state.__name__, + self.phase.__class__.__name__, + phase.__class__.__name__, + "process" + info["type"], + info)) + if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: @@ -396,2386 +397,2380 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] -@_utils.memoize -def getPhases(debug): - def log(function): - """Logger that records which phase processes each token""" - type_names = {value: key for key, value in tokenTypes.items()} - - def wrapped(self, *args, **kwargs): - if function.__name__.startswith("process") and len(args) > 0: - token = args[0] - info = {"type": type_names[token['type']]} - if token['type'] in tagTokenTypes: - info["name"] = token['name'] - - self.parser.log.append((self.parser.tokenizer.state.__name__, - self.parser.phase.__class__.__name__, - self.__class__.__name__, - function.__name__, - info)) - return function(self, *args, **kwargs) - else: - return function(self, *args, **kwargs) - return wrapped - - def getMetaclass(use_metaclass, metaclass_func): - if use_metaclass: - return method_decorator_metaclass(metaclass_func) +class Phase(object): + """Base class for helper object that implements each phase of processing + """ + __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") + + def __init__(self, parser, tree): + self.parser = parser + self.tree = tree + self.__startTagCache = {} + self.__endTagCache = {} + + def processEOF(self): + raise NotImplementedError + + def processComment(self, token): + # For most phases the following is correct. Where it's not it will be + # overridden. + self.tree.insertComment(token, self.tree.openElements[-1]) + + def processDoctype(self, token): + self.parser.parseError("unexpected-doctype") + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processSpaceCharacters(self, token): + self.tree.insertText(token["data"]) + + def processStartTag(self, token): + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__startTagCache: + func = self.__startTagCache[name] else: - return type - - # pylint:disable=unused-argument - class Phase(with_metaclass(getMetaclass(debug, log))): - """Base class for helper object that implements each phase of processing - """ - __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") - - def __init__(self, parser, tree): - self.parser = parser - self.tree = tree - self.__startTagCache = {} - self.__endTagCache = {} - - def processEOF(self): - raise NotImplementedError - - def processComment(self, token): - # For most phases the following is correct. Where it's not it will be - # overridden. - self.tree.insertComment(token, self.tree.openElements[-1]) - - def processDoctype(self, token): - self.parser.parseError("unexpected-doctype") - - def processCharacters(self, token): - self.tree.insertText(token["data"]) - - def processSpaceCharacters(self, token): - self.tree.insertText(token["data"]) - - def processStartTag(self, token): - # Note the caching is done here rather than BoundMethodDispatcher as doing it there - # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] - # In Py2, using `in` is quicker in general than try/except KeyError - # In Py3, `in` is quicker when there are few cache hits (typically short inputs) - if name in self.__startTagCache: - func = self.__startTagCache[name] - else: - func = self.__startTagCache[name] = self.startTagHandler[name] - # bound the cache size in case we get loads of unknown tags - while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: - # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 - self.__startTagCache.pop(next(iter(self.__startTagCache))) - return func(token) - - def startTagHtml(self, token): - if not self.parser.firstStartTag and token["name"] == "html": - self.parser.parseError("non-html-root") - # XXX Need a check here to see if the first start tag token emitted is - # this token... If it's not, invoke self.parser.parseError(). - for attr, value in token["data"].items(): - if attr not in self.tree.openElements[0].attributes: - self.tree.openElements[0].attributes[attr] = value - self.parser.firstStartTag = False - - def processEndTag(self, token): - # Note the caching is done here rather than BoundMethodDispatcher as doing it there - # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] - # In Py2, using `in` is quicker in general than try/except KeyError - # In Py3, `in` is quicker when there are few cache hits (typically short inputs) - if name in self.__endTagCache: - func = self.__endTagCache[name] - else: - func = self.__endTagCache[name] = self.endTagHandler[name] - # bound the cache size in case we get loads of unknown tags - while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: - # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 - self.__endTagCache.pop(next(iter(self.__endTagCache))) - return func(token) - - class InitialPhase(Phase): - __slots__ = tuple() - - def processSpaceCharacters(self, token): - pass - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - correct = token["correct"] - - if (name != "html" or publicId is not None or - systemId is not None and systemId != "about:legacy-compat"): - self.parser.parseError("unknown-doctype") - - if publicId is None: - publicId = "" - - self.tree.insertDoctype(token) - - if publicId != "": - publicId = publicId.translate(asciiUpper2Lower) - - if (not correct or token["name"] != "html" or - publicId.startswith( - ("+//silmaril//dtd html pro v0r11 19970101//", - "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", - "-//as//dtd html 3.0 aswedit + extensions//", - "-//ietf//dtd html 2.0 level 1//", - "-//ietf//dtd html 2.0 level 2//", - "-//ietf//dtd html 2.0 strict level 1//", - "-//ietf//dtd html 2.0 strict level 2//", - "-//ietf//dtd html 2.0 strict//", - "-//ietf//dtd html 2.0//", - "-//ietf//dtd html 2.1e//", - "-//ietf//dtd html 3.0//", - "-//ietf//dtd html 3.2 final//", - "-//ietf//dtd html 3.2//", - "-//ietf//dtd html 3//", - "-//ietf//dtd html level 0//", - "-//ietf//dtd html level 1//", - "-//ietf//dtd html level 2//", - "-//ietf//dtd html level 3//", - "-//ietf//dtd html strict level 0//", - "-//ietf//dtd html strict level 1//", - "-//ietf//dtd html strict level 2//", - "-//ietf//dtd html strict level 3//", - "-//ietf//dtd html strict//", - "-//ietf//dtd html//", - "-//metrius//dtd metrius presentational//", - "-//microsoft//dtd internet explorer 2.0 html strict//", - "-//microsoft//dtd internet explorer 2.0 html//", - "-//microsoft//dtd internet explorer 2.0 tables//", - "-//microsoft//dtd internet explorer 3.0 html strict//", - "-//microsoft//dtd internet explorer 3.0 html//", - "-//microsoft//dtd internet explorer 3.0 tables//", - "-//netscape comm. corp.//dtd html//", - "-//netscape comm. corp.//dtd strict html//", - "-//o'reilly and associates//dtd html 2.0//", - "-//o'reilly and associates//dtd html extended 1.0//", - "-//o'reilly and associates//dtd html extended relaxed 1.0//", - "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", - "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", - "-//spyglass//dtd html 2.0 extended//", - "-//sq//dtd html 2.0 hotmetal + extensions//", - "-//sun microsystems corp.//dtd hotjava html//", - "-//sun microsystems corp.//dtd hotjava strict html//", - "-//w3c//dtd html 3 1995-03-24//", - "-//w3c//dtd html 3.2 draft//", - "-//w3c//dtd html 3.2 final//", - "-//w3c//dtd html 3.2//", - "-//w3c//dtd html 3.2s draft//", - "-//w3c//dtd html 4.0 frameset//", - "-//w3c//dtd html 4.0 transitional//", - "-//w3c//dtd html experimental 19960712//", - "-//w3c//dtd html experimental 970421//", - "-//w3c//dtd w3 html//", - "-//w3o//dtd w3 html 3.0//", - "-//webtechs//dtd mozilla html 2.0//", - "-//webtechs//dtd mozilla html//")) or - publicId in ("-//w3o//dtd w3 html strict 3.0//en//", - "-/w3c/dtd html 4.0 transitional/en", - "html") or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is None or - systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): - self.parser.compatMode = "quirks" - elif (publicId.startswith( - ("-//w3c//dtd xhtml 1.0 frameset//", - "-//w3c//dtd xhtml 1.0 transitional//")) or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is not None): - self.parser.compatMode = "limited quirks" - - self.parser.phase = self.parser.phases["beforeHtml"] - - def anythingElse(self): + func = self.__startTagCache[name] = self.startTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__startTagCache.pop(next(iter(self.__startTagCache))) + return func(token) + + def startTagHtml(self, token): + if not self.parser.firstStartTag and token["name"] == "html": + self.parser.parseError("non-html-root") + # XXX Need a check here to see if the first start tag token emitted is + # this token... If it's not, invoke self.parser.parseError(). + for attr, value in token["data"].items(): + if attr not in self.tree.openElements[0].attributes: + self.tree.openElements[0].attributes[attr] = value + self.parser.firstStartTag = False + + def processEndTag(self, token): + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__endTagCache: + func = self.__endTagCache[name] + else: + func = self.__endTagCache[name] = self.endTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__endTagCache.pop(next(iter(self.__endTagCache))) + return func(token) + + +class InitialPhase(Phase): + __slots__ = tuple() + + def processSpaceCharacters(self, token): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + correct = token["correct"] + + if (name != "html" or publicId is not None or + systemId is not None and systemId != "about:legacy-compat"): + self.parser.parseError("unknown-doctype") + + if publicId is None: + publicId = "" + + self.tree.insertDoctype(token) + + if publicId != "": + publicId = publicId.translate(asciiUpper2Lower) + + if (not correct or token["name"] != "html" or + publicId.startswith( + ("+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//")) or + publicId in ("-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html") or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is None or + systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" - self.parser.phase = self.parser.phases["beforeHtml"] - - def processCharacters(self, token): - self.parser.parseError("expected-doctype-but-got-chars") - self.anythingElse() - return token - - def processStartTag(self, token): - self.parser.parseError("expected-doctype-but-got-start-tag", + elif (publicId.startswith( + ("-//w3c//dtd xhtml 1.0 frameset//", + "-//w3c//dtd xhtml 1.0 transitional//")) or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is not None): + self.parser.compatMode = "limited quirks" + + self.parser.phase = self.parser.phases["beforeHtml"] + + def anythingElse(self): + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + + def processCharacters(self, token): + self.parser.parseError("expected-doctype-but-got-chars") + self.anythingElse() + return token + + def processStartTag(self, token): + self.parser.parseError("expected-doctype-but-got-start-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEndTag(self, token): + self.parser.parseError("expected-doctype-but-got-end-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEOF(self): + self.parser.parseError("expected-doctype-but-got-eof") + self.anythingElse() + return True + + +class BeforeHtmlPhase(Phase): + __slots__ = tuple() + + # helper methods + def insertHtmlElement(self): + self.tree.insertRoot(impliedTagToken("html", "StartTag")) + self.parser.phase = self.parser.phases["beforeHead"] + + # other + def processEOF(self): + self.insertHtmlElement() + return True + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.insertHtmlElement() + return token + + def processStartTag(self, token): + if token["name"] == "html": + self.parser.firstStartTag = True + self.insertHtmlElement() + return token + + def processEndTag(self, token): + if token["name"] not in ("head", "body", "html", "br"): + self.parser.parseError("unexpected-end-tag-before-html", {"name": token["name"]}) - self.anythingElse() + else: + self.insertHtmlElement() return token - def processEndTag(self, token): - self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) - self.anythingElse() - return token - def processEOF(self): - self.parser.parseError("expected-doctype-but-got-eof") - self.anythingElse() - return True +class BeforeHeadPhase(Phase): + __slots__ = tuple() - class BeforeHtmlPhase(Phase): - __slots__ = tuple() + def processEOF(self): + self.startTagHead(impliedTagToken("head", "StartTag")) + return True - # helper methods - def insertHtmlElement(self): - self.tree.insertRoot(impliedTagToken("html", "StartTag")) - self.parser.phase = self.parser.phases["beforeHead"] + def processSpaceCharacters(self, token): + pass - # other - def processEOF(self): - self.insertHtmlElement() - return True + def processCharacters(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def processSpaceCharacters(self, token): - pass + def startTagHead(self, token): + self.tree.insertElement(token) + self.tree.headPointer = self.tree.openElements[-1] + self.parser.phase = self.parser.phases["inHead"] - def processCharacters(self, token): - self.insertHtmlElement() - return token + def startTagOther(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processStartTag(self, token): - if token["name"] == "html": - self.parser.firstStartTag = True - self.insertHtmlElement() - return token + def endTagImplyHead(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processEndTag(self, token): - if token["name"] not in ("head", "body", "html", "br"): - self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) - else: - self.insertHtmlElement() - return token + def endTagOther(self, token): + self.parser.parseError("end-tag-after-implied-root", + {"name": token["name"]}) - class BeforeHeadPhase(Phase): - __slots__ = tuple() + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther - def processEOF(self): - self.startTagHead(impliedTagToken("head", "StartTag")) - return True + endTagHandler = _utils.MethodDispatcher([ + (("head", "body", "html", "br"), endTagImplyHead) + ]) + endTagHandler.default = endTagOther - def processSpaceCharacters(self, token): - pass - def processCharacters(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token +class InHeadPhase(Phase): + __slots__ = tuple() - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + # the real thing + def processEOF(self): + self.anythingElse() + return True - def startTagHead(self, token): - self.tree.insertElement(token) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] + def processCharacters(self, token): + self.anythingElse() + return token - def startTagOther(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def endTagImplyHead(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token + def startTagHead(self, token): + self.parser.parseError("two-heads-are-not-better-than-one") - def endTagOther(self, token): - self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) + def startTagBaseLinkCommand(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - ("head", startTagHead) - ]) - startTagHandler.default = startTagOther + def startTagMeta(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + attributes = token["data"] + if self.parser.tokenizer.stream.charEncoding[1] == "tentative": + if "charset" in attributes: + self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) + elif ("content" in attributes and + "http-equiv" in attributes and + attributes["http-equiv"].lower() == "content-type"): + # Encoding it as UTF-8 here is a hack, as really we should pass + # the abstract Unicode string, and just use the + # ContentAttrParser on that, but using UTF-8 allows all chars + # to be encoded and as a ASCII-superset works. + data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = _inputstream.ContentAttrParser(data) + codec = parser.parse() + self.parser.tokenizer.stream.changeEncoding(codec) + + def startTagTitle(self, token): + self.parser.parseRCDataRawtext(token, "RCDATA") + + def startTagNoFramesStyle(self, token): + # Need to decide whether to implement the scripting-disabled case + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagNoscript(self, token): + if self.parser.scripting: + self.parser.parseRCDataRawtext(token, "RAWTEXT") + else: + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inHeadNoscript"] - endTagHandler = _utils.MethodDispatcher([ - (("head", "body", "html", "br"), endTagImplyHead) - ]) - endTagHandler.default = endTagOther + def startTagScript(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState + self.parser.originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["text"] + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHead(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "head", "Expected head got %s" % node.name + self.parser.phase = self.parser.phases["afterHead"] + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.endTagHead(impliedTagToken("head")) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("title", startTagTitle), + (("noframes", "style"), startTagNoFramesStyle), + ("noscript", startTagNoscript), + ("script", startTagScript), + (("base", "basefont", "bgsound", "command", "link"), + startTagBaseLinkCommand), + ("meta", startTagMeta), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("head", endTagHead), + (("br", "html", "body"), endTagHtmlBodyBr) + ]) + endTagHandler.default = endTagOther + + +class InHeadNoscriptPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + self.parser.parseError("eof-in-head-noscript") + self.anythingElse() + return True + + def processComment(self, token): + return self.parser.phases["inHead"].processComment(token) + + def processCharacters(self, token): + self.parser.parseError("char-in-head-noscript") + self.anythingElse() + return token + + def processSpaceCharacters(self, token): + return self.parser.phases["inHead"].processSpaceCharacters(token) + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBaseLinkCommand(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagHeadNoscript(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagNoscript(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "noscript", "Expected noscript got %s" % node.name + self.parser.phase = self.parser.phases["inHead"] + + def endTagBr(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + # Caller must raise parse error first! + self.endTagNoscript(impliedTagToken("noscript")) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), + (("head", "noscript"), startTagHeadNoscript), + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("noscript", endTagNoscript), + ("br", endTagBr), + ]) + endTagHandler.default = endTagOther + + +class AfterHeadPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + self.anythingElse() + return True + + def processCharacters(self, token): + self.anythingElse() + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBody(self, token): + self.parser.framesetOK = False + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inBody"] - class InHeadPhase(Phase): - __slots__ = tuple() + def startTagFrameset(self, token): + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] - # the real thing - def processEOF(self): - self.anythingElse() - return True + def startTagFromHead(self, token): + self.parser.parseError("unexpected-start-tag-out-of-my-head", + {"name": token["name"]}) + self.tree.openElements.append(self.tree.headPointer) + self.parser.phases["inHead"].processStartTag(token) + for node in self.tree.openElements[::-1]: + if node.name == "head": + self.tree.openElements.remove(node) + break - def processCharacters(self, token): - self.anythingElse() - return token + def startTagHead(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.tree.insertElement(impliedTagToken("body", "StartTag")) + self.parser.phase = self.parser.phases["inBody"] + self.parser.framesetOK = True + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("body", startTagBody), + ("frameset", startTagFrameset), + (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", + "style", "title"), + startTagFromHead), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + endTagHtmlBodyBr)]) + endTagHandler.default = endTagOther + + +class InBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody + # the really-really-really-very crazy mode + __slots__ = ("processSpaceCharacters",) + + def __init__(self, *args, **kwargs): + super(InBodyPhase, self).__init__(*args, **kwargs) + # Set this to the default handler + self.processSpaceCharacters = self.processSpaceCharactersNonPre + + def isMatchingFormattingElement(self, node1, node2): + return (node1.name == node2.name and + node1.namespace == node2.namespace and + node1.attributes == node2.attributes) + + # helper + def addFormattingElement(self, token): + self.tree.insertElement(token) + element = self.tree.openElements[-1] - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + matchingElements = [] + for node in self.tree.activeFormattingElements[::-1]: + if node is Marker: + break + elif self.isMatchingFormattingElement(node, element): + matchingElements.append(node) + + assert len(matchingElements) <= 3 + if len(matchingElements) == 3: + self.tree.activeFormattingElements.remove(matchingElements[-1]) + self.tree.activeFormattingElements.append(element) + + # the real deal + def processEOF(self): + allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + # Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of
, , and