From 1be9532f79fd7744be0945c4ab42d2f5b41e4e73 Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:23:01 -0500 Subject: [PATCH 001/219] Added iframe seamless boolean attribute --- html5lib/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/html5lib/constants.py b/html5lib/constants.py index e7089846..659f2b5e 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -535,6 +535,7 @@ "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), "output": frozenset(("disabled", "readonly")), + "iframe": frozenset(("seamless")), } # entitiesWindows1252 has to be _ordered_ and needs to have an index. It From 4dfe3cd9f97ce51c53463d633308f4a3fe6ad9e6 Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:25:04 -0500 Subject: [PATCH 002/219] Update CHANGES.rst --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 1431b3c9..89e48f94 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,7 +6,7 @@ Change Log Released on XXX, 2014 -* XXX +* Fix #XXX: added the seamless attribute for iframes. 
0.999 From 7fd79e31e083ab75305b3e837ea9aa8c9b4675ff Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:25:28 -0500 Subject: [PATCH 003/219] Update AUTHORS.rst --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..787c3b94 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,4 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Ritwik Gupta From 3065630b2c43eeaf8d5d12fea930847e3e508350 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 16:19:02 +0100 Subject: [PATCH 004/219] Get rid of last remenents of our usage of the stdlib unittest --- html5lib/tests/test_encoding.py | 6 - html5lib/tests/test_parser2.py | 71 +++-- html5lib/tests/test_stream.py | 344 +++++++++++------------ html5lib/tests/test_whitespace_filter.py | 246 ++++++++-------- 4 files changed, 315 insertions(+), 352 deletions(-) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 41b888c4..09504654 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -1,12 +1,6 @@ from __future__ import absolute_import, division, unicode_literals import os -import unittest - -try: - unittest.TestCase.assertEqual -except AttributeError: - unittest.TestCase.assertEqual = unittest.TestCase.assertEquals from .support import get_data_files, test_dir, errorMessage, TestData as _TestData from html5lib import HTMLParser, inputstream diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 01f16eea..26eff241 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -2,63 +2,56 @@ import io +import pytest + from . 
import support # flake8: noqa from html5lib import html5parser from html5lib.constants import namespaces from html5lib import treebuilders -import unittest # tests that aren't autogenerated from text files +@pytest.fixture +def dom_tree(): + return treebuilders.getTreeBuilder("dom") -class MoreParserTests(unittest.TestCase): - - def setUp(self): - self.dom_tree = treebuilders.getTreeBuilder("dom") +def test_assertDoctypeCloneable(dom_tree): + parser = html5parser.HTMLParser(tree=dom_tree) + doc = parser.parse('') + assert doc.cloneNode(True) is not None - def test_assertDoctypeCloneable(self): - parser = html5parser.HTMLParser(tree=self.dom_tree) - doc = parser.parse('') - self.assertTrue(doc.cloneNode(True)) - def test_line_counter(self): - # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0 - parser = html5parser.HTMLParser(tree=self.dom_tree) - parser.parse("
\nx\n>\n
") +def test_line_counter(dom_tree): + # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0 + parser = html5parser.HTMLParser(tree=dom_tree) + parser.parse("
\nx\n>\n
") - def test_namespace_html_elements_0_dom(self): - parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=True) - doc = parser.parse("") - self.assertTrue(doc.childNodes[0].namespaceURI == namespaces["html"]) - def test_namespace_html_elements_1_dom(self): - parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=False) - doc = parser.parse("") - self.assertTrue(doc.childNodes[0].namespaceURI is None) +def test_namespace_html_elements_0_dom(dom_tree): + parser = html5parser.HTMLParser(tree=dom_tree, namespaceHTMLElements=True) + doc = parser.parse("") + assert doc.childNodes[0].namespaceURI == namespaces["html"] - def test_namespace_html_elements_0_etree(self): - parser = html5parser.HTMLParser(namespaceHTMLElements=True) - doc = parser.parse("") - self.assertTrue(doc.tag == "{%s}html" % (namespaces["html"],)) - def test_namespace_html_elements_1_etree(self): - parser = html5parser.HTMLParser(namespaceHTMLElements=False) - doc = parser.parse("") - self.assertTrue(doc.tag == "html") +def test_namespace_html_elements_1_dom(dom_tree): + parser = html5parser.HTMLParser(tree=dom_tree, namespaceHTMLElements=False) + doc = parser.parse("") + assert doc.childNodes[0].namespaceURI is None - def test_unicode_file(self): - parser = html5parser.HTMLParser() - parser.parse(io.StringIO("a")) +def test_namespace_html_elements_0_etree(): + parser = html5parser.HTMLParser(namespaceHTMLElements=True) + doc = parser.parse("") + assert doc.tag == "{%s}html" % (namespaces["html"],) -def buildTestSuite(): - return unittest.defaultTestLoader.loadTestsFromName(__name__) +def test_namespace_html_elements_1_etree(): + parser = html5parser.HTMLParser(namespaceHTMLElements=False) + doc = parser.parse("") + assert doc.tag == "html" -def main(): - buildTestSuite() - unittest.main() -if __name__ == '__main__': - main() +def test_unicode_file(): + parser = html5parser.HTMLParser() + parser.parse(io.StringIO("a")) diff --git a/html5lib/tests/test_stream.py 
b/html5lib/tests/test_stream.py index ed203766..3b659fbb 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,7 +1,6 @@ from __future__ import absolute_import, division, unicode_literals from . import support # flake8: noqa -import unittest import codecs from io import BytesIO import socket @@ -12,66 +11,65 @@ from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) -class BufferedStreamTest(unittest.TestCase): - def test_basic(self): - s = b"abc" - fp = BufferedStream(BytesIO(s)) - read = fp.read(10) - assert read == s - - def test_read_length(self): - fp = BufferedStream(BytesIO(b"abcdef")) - read1 = fp.read(1) - assert read1 == b"a" - read2 = fp.read(2) - assert read2 == b"bc" - read3 = fp.read(3) - assert read3 == b"def" - read4 = fp.read(4) - assert read4 == b"" - - def test_tell(self): - fp = BufferedStream(BytesIO(b"abcdef")) - read1 = fp.read(1) - assert fp.tell() == 1 - read2 = fp.read(2) - assert fp.tell() == 3 - read3 = fp.read(3) - assert fp.tell() == 6 - read4 = fp.read(4) - assert fp.tell() == 6 - - def test_seek(self): - fp = BufferedStream(BytesIO(b"abcdef")) - read1 = fp.read(1) - assert read1 == b"a" - fp.seek(0) - read2 = fp.read(1) - assert read2 == b"a" - read3 = fp.read(2) - assert read3 == b"bc" - fp.seek(2) - read4 = fp.read(2) - assert read4 == b"cd" - fp.seek(4) - read5 = fp.read(2) - assert read5 == b"ef" - - def test_seek_tell(self): - fp = BufferedStream(BytesIO(b"abcdef")) - read1 = fp.read(1) - assert fp.tell() == 1 - fp.seek(0) - read2 = fp.read(1) - assert fp.tell() == 1 - read3 = fp.read(2) - assert fp.tell() == 3 - fp.seek(2) - read4 = fp.read(2) - assert fp.tell() == 4 - fp.seek(4) - read5 = fp.read(2) - assert fp.tell() == 6 +def test_basic(): + s = b"abc" + fp = BufferedStream(BytesIO(s)) + read = fp.read(10) + assert read == s + +def test_read_length(): + fp = BufferedStream(BytesIO(b"abcdef")) + read1 = fp.read(1) + assert read1 == b"a" + 
read2 = fp.read(2) + assert read2 == b"bc" + read3 = fp.read(3) + assert read3 == b"def" + read4 = fp.read(4) + assert read4 == b"" + +def test_tell(): + fp = BufferedStream(BytesIO(b"abcdef")) + read1 = fp.read(1) + assert fp.tell() == 1 + read2 = fp.read(2) + assert fp.tell() == 3 + read3 = fp.read(3) + assert fp.tell() == 6 + read4 = fp.read(4) + assert fp.tell() == 6 + +def test_seek(): + fp = BufferedStream(BytesIO(b"abcdef")) + read1 = fp.read(1) + assert read1 == b"a" + fp.seek(0) + read2 = fp.read(1) + assert read2 == b"a" + read3 = fp.read(2) + assert read3 == b"bc" + fp.seek(2) + read4 = fp.read(2) + assert read4 == b"cd" + fp.seek(4) + read5 = fp.read(2) + assert read5 == b"ef" + +def test_seek_tell(): + fp = BufferedStream(BytesIO(b"abcdef")) + read1 = fp.read(1) + assert fp.tell() == 1 + fp.seek(0) + read2 = fp.read(1) + assert fp.tell() == 1 + read3 = fp.read(2) + assert fp.tell() == 3 + fp.seek(2) + read4 = fp.read(2) + assert fp.tell() == 4 + fp.seek(4) + read5 = fp.read(2) + assert fp.tell() == 6 class HTMLUnicodeInputStreamShortChunk(HTMLUnicodeInputStream): @@ -82,122 +80,108 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream): _defaultChunkSize = 2 -class HTMLInputStreamTest(unittest.TestCase): - - def test_char_ascii(self): - stream = HTMLInputStream(b"'", encoding='ascii') - self.assertEqual(stream.charEncoding[0].name, 'windows-1252') - self.assertEqual(stream.char(), "'") - - def test_char_utf8(self): - stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') - self.assertEqual(stream.charEncoding[0].name, 'utf-8') - self.assertEqual(stream.char(), '\u2018') - - def test_char_win1252(self): - stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252')) - self.assertEqual(stream.charEncoding[0].name, 'windows-1252') - self.assertEqual(stream.char(), "\xa9") - self.assertEqual(stream.char(), "\xf1") - self.assertEqual(stream.char(), "\u2019") - - def test_bom(self): - stream = HTMLInputStream(codecs.BOM_UTF8 + 
b"'") - self.assertEqual(stream.charEncoding[0].name, 'utf-8') - self.assertEqual(stream.char(), "'") - - def test_utf_16(self): - stream = HTMLInputStream((' ' * 1025).encode('utf-16')) - self.assertTrue(stream.charEncoding[0].name in ['utf-16le', 'utf-16be'], stream.charEncoding) - self.assertEqual(len(stream.charsUntil(' ', True)), 1025) - - def test_newlines(self): - stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe") - self.assertEqual(stream.position(), (1, 0)) - self.assertEqual(stream.charsUntil('c'), "a\nbb\n") - self.assertEqual(stream.position(), (3, 0)) - self.assertEqual(stream.charsUntil('x'), "ccc\ndddd") - self.assertEqual(stream.position(), (4, 4)) - self.assertEqual(stream.charsUntil('e'), "x") - self.assertEqual(stream.position(), (4, 5)) - - def test_newlines2(self): - size = HTMLUnicodeInputStream._defaultChunkSize - stream = HTMLInputStream("\r" * size + "\n") - self.assertEqual(stream.charsUntil('x'), "\n" * size) - - def test_position(self): - stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh") - self.assertEqual(stream.position(), (1, 0)) - self.assertEqual(stream.charsUntil('c'), "a\nbb\n") - self.assertEqual(stream.position(), (3, 0)) - stream.unget("\n") - self.assertEqual(stream.position(), (2, 2)) - self.assertEqual(stream.charsUntil('c'), "\n") - self.assertEqual(stream.position(), (3, 0)) - stream.unget("\n") - self.assertEqual(stream.position(), (2, 2)) - self.assertEqual(stream.char(), "\n") - self.assertEqual(stream.position(), (3, 0)) - self.assertEqual(stream.charsUntil('e'), "ccc\nddd") - self.assertEqual(stream.position(), (4, 3)) - self.assertEqual(stream.charsUntil('h'), "e\nf\ng") - self.assertEqual(stream.position(), (6, 1)) - - def test_position2(self): - stream = HTMLUnicodeInputStreamShortChunk("abc\nd") - self.assertEqual(stream.position(), (1, 0)) - self.assertEqual(stream.char(), "a") - self.assertEqual(stream.position(), (1, 1)) - 
self.assertEqual(stream.char(), "b") - self.assertEqual(stream.position(), (1, 2)) - self.assertEqual(stream.char(), "c") - self.assertEqual(stream.position(), (1, 3)) - self.assertEqual(stream.char(), "\n") - self.assertEqual(stream.position(), (2, 0)) - self.assertEqual(stream.char(), "d") - self.assertEqual(stream.position(), (2, 1)) - - def test_python_issue_20007(self): - """ - Make sure we have a work-around for Python bug #20007 - http://bugs.python.org/issue20007 - """ - class FakeSocket(object): - def makefile(self, _mode, _bufsize=None): - return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") - - source = http_client.HTTPResponse(FakeSocket()) - source.begin() - stream = HTMLInputStream(source) - self.assertEqual(stream.charsUntil(" "), "Text") - - def test_python_issue_20007_b(self): - """ - Make sure we have a work-around for Python bug #20007 - http://bugs.python.org/issue20007 - """ - if six.PY2: - return - - class FakeSocket(object): - def makefile(self, _mode, _bufsize=None): - return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") - - source = http_client.HTTPResponse(FakeSocket()) - source.begin() - wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com") - stream = HTMLInputStream(wrapped) - self.assertEqual(stream.charsUntil(" "), "Text") - - -def buildTestSuite(): - return unittest.defaultTestLoader.loadTestsFromName(__name__) - - -def main(): - buildTestSuite() - unittest.main() - -if __name__ == '__main__': - main() +def test_char_ascii(): + stream = HTMLInputStream(b"'", encoding='ascii') + assert stream.charEncoding[0].name == 'windows-1252' + assert stream.char() == "'" + +def test_char_utf8(): + stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') + assert stream.charEncoding[0].name == 'utf-8' + assert stream.char() == '\u2018' + +def test_char_win1252(): + stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252')) + assert stream.charEncoding[0].name == 'windows-1252' + assert stream.char() == "\xa9" + 
assert stream.char() == "\xf1" + assert stream.char() == "\u2019" + +def test_bom(): + stream = HTMLInputStream(codecs.BOM_UTF8 + b"'") + assert stream.charEncoding[0].name == 'utf-8' + assert stream.char() == "'" + +def test_utf_16(): + stream = HTMLInputStream((' ' * 1025).encode('utf-16')) + assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be'] + assert len(stream.charsUntil(' ', True)) == 1025 + +def test_newlines(): + stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe") + assert stream.position() == (1, 0) + assert stream.charsUntil('c') == "a\nbb\n" + assert stream.position() == (3, 0) + assert stream.charsUntil('x') == "ccc\ndddd" + assert stream.position() == (4, 4) + assert stream.charsUntil('e') == "x" + assert stream.position() == (4, 5) + +def test_newlines2(): + size = HTMLUnicodeInputStream._defaultChunkSize + stream = HTMLInputStream("\r" * size + "\n") + assert stream.charsUntil('x') == "\n" * size + +def test_position(): + stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh") + assert stream.position() == (1, 0) + assert stream.charsUntil('c') == "a\nbb\n" + assert stream.position() == (3, 0) + stream.unget("\n") + assert stream.position() == (2, 2) + assert stream.charsUntil('c') == "\n" + assert stream.position() == (3, 0) + stream.unget("\n") + assert stream.position() == (2, 2) + assert stream.char() == "\n" + assert stream.position() == (3, 0) + assert stream.charsUntil('e') == "ccc\nddd" + assert stream.position() == (4, 3) + assert stream.charsUntil('h') == "e\nf\ng" + assert stream.position() == (6, 1) + +def test_position2(): + stream = HTMLUnicodeInputStreamShortChunk("abc\nd") + assert stream.position() == (1, 0) + assert stream.char() == "a" + assert stream.position() == (1, 1) + assert stream.char() == "b" + assert stream.position() == (1, 2) + assert stream.char() == "c" + assert stream.position() == (1, 3) + assert stream.char() == "\n" + assert 
stream.position() == (2, 0) + assert stream.char() == "d" + assert stream.position() == (2, 1) + +def test_python_issue_20007(): + """ + Make sure we have a work-around for Python bug #20007 + http://bugs.python.org/issue20007 + """ + class FakeSocket(object): + def makefile(self, _mode, _bufsize=None): + return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") + + source = http_client.HTTPResponse(FakeSocket()) + source.begin() + stream = HTMLInputStream(source) + assert stream.charsUntil(" ") == "Text" + +def test_python_issue_20007_b(): + """ + Make sure we have a work-around for Python bug #20007 + http://bugs.python.org/issue20007 + """ + if six.PY2: + return + + class FakeSocket(object): + def makefile(self, _mode, _bufsize=None): + return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") + + source = http_client.HTTPResponse(FakeSocket()) + source.begin() + wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com") + stream = HTMLInputStream(wrapped) + assert stream.charsUntil(" ") == "Text" diff --git a/html5lib/tests/test_whitespace_filter.py b/html5lib/tests/test_whitespace_filter.py index 9ed27fd6..e9da6140 100644 --- a/html5lib/tests/test_whitespace_filter.py +++ b/html5lib/tests/test_whitespace_filter.py @@ -1,133 +1,125 @@ from __future__ import absolute_import, division, unicode_literals -import unittest - from html5lib.filters.whitespace import Filter from html5lib.constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -try: - unittest.TestCase.assertEqual -except AttributeError: - unittest.TestCase.assertEqual = unittest.TestCase.assertEquals - - -class TestCase(unittest.TestCase): - def runTest(self, input, expected): - output = list(Filter(input)) - errorMsg = "\n".join(["\n\nInput:", str(input), - "\nExpected:", str(expected), - "\nReceived:", str(output)]) - self.assertEqual(output, expected, errorMsg) - - def runTestUnmodifiedOutput(self, input): - self.runTest(input, input) - - def testPhrasingElements(self): - 
self.runTestUnmodifiedOutput( - [{"type": "Characters", "data": "This is a "}, - {"type": "StartTag", "name": "span", "data": []}, - {"type": "Characters", "data": "phrase"}, - {"type": "EndTag", "name": "span", "data": []}, - {"type": "SpaceCharacters", "data": " "}, - {"type": "Characters", "data": "with"}, - {"type": "SpaceCharacters", "data": " "}, - {"type": "StartTag", "name": "em", "data": []}, - {"type": "Characters", "data": "emphasised text"}, - {"type": "EndTag", "name": "em", "data": []}, - {"type": "Characters", "data": " and an "}, - {"type": "StartTag", "name": "img", "data": [["alt", "image"]]}, - {"type": "Characters", "data": "."}]) - - def testLeadingWhitespace(self): - self.runTest( - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "SpaceCharacters", "data": spaceCharacters}, - {"type": "Characters", "data": "foo"}, - {"type": "EndTag", "name": "p", "data": []}], - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "SpaceCharacters", "data": " "}, - {"type": "Characters", "data": "foo"}, - {"type": "EndTag", "name": "p", "data": []}]) - - def testLeadingWhitespaceAsCharacters(self): - self.runTest( - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": spaceCharacters + "foo"}, - {"type": "EndTag", "name": "p", "data": []}], - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": " foo"}, - {"type": "EndTag", "name": "p", "data": []}]) - - def testTrailingWhitespace(self): - self.runTest( - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": "foo"}, - {"type": "SpaceCharacters", "data": spaceCharacters}, - {"type": "EndTag", "name": "p", "data": []}], - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": "foo"}, - {"type": "SpaceCharacters", "data": " "}, - {"type": "EndTag", "name": "p", "data": []}]) - - def testTrailingWhitespaceAsCharacters(self): - self.runTest( - [{"type": "StartTag", "name": "p", "data": []}, 
- {"type": "Characters", "data": "foo" + spaceCharacters}, - {"type": "EndTag", "name": "p", "data": []}], - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": "foo "}, - {"type": "EndTag", "name": "p", "data": []}]) - - def testWhitespace(self): - self.runTest( - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": "foo" + spaceCharacters + "bar"}, - {"type": "EndTag", "name": "p", "data": []}], - [{"type": "StartTag", "name": "p", "data": []}, - {"type": "Characters", "data": "foo bar"}, - {"type": "EndTag", "name": "p", "data": []}]) - - def testLeadingWhitespaceInPre(self): - self.runTestUnmodifiedOutput( - [{"type": "StartTag", "name": "pre", "data": []}, - {"type": "SpaceCharacters", "data": spaceCharacters}, - {"type": "Characters", "data": "foo"}, - {"type": "EndTag", "name": "pre", "data": []}]) - - def testLeadingWhitespaceAsCharactersInPre(self): - self.runTestUnmodifiedOutput( - [{"type": "StartTag", "name": "pre", "data": []}, - {"type": "Characters", "data": spaceCharacters + "foo"}, - {"type": "EndTag", "name": "pre", "data": []}]) - - def testTrailingWhitespaceInPre(self): - self.runTestUnmodifiedOutput( - [{"type": "StartTag", "name": "pre", "data": []}, - {"type": "Characters", "data": "foo"}, - {"type": "SpaceCharacters", "data": spaceCharacters}, - {"type": "EndTag", "name": "pre", "data": []}]) - - def testTrailingWhitespaceAsCharactersInPre(self): - self.runTestUnmodifiedOutput( - [{"type": "StartTag", "name": "pre", "data": []}, - {"type": "Characters", "data": "foo" + spaceCharacters}, - {"type": "EndTag", "name": "pre", "data": []}]) - - def testWhitespaceInPre(self): - self.runTestUnmodifiedOutput( - [{"type": "StartTag", "name": "pre", "data": []}, - {"type": "Characters", "data": "foo" + spaceCharacters + "bar"}, - {"type": "EndTag", "name": "pre", "data": []}]) - - -def buildTestSuite(): - return unittest.defaultTestLoader.loadTestsFromName(__name__) - - -def main(): - 
buildTestSuite() - unittest.main() - -if __name__ == "__main__": - main() + +def runTest(input, expected): + output = list(Filter(input)) + errorMsg = "\n".join(["\n\nInput:", str(input), + "\nExpected:", str(expected), + "\nReceived:", str(output)]) + assert expected == output, errorMsg + + +def runTestUnmodifiedOutput(input): + runTest(input, input) + + +def testPhrasingElements(): + runTestUnmodifiedOutput( + [{"type": "Characters", "data": "This is a "}, + {"type": "StartTag", "name": "span", "data": []}, + {"type": "Characters", "data": "phrase"}, + {"type": "EndTag", "name": "span", "data": []}, + {"type": "SpaceCharacters", "data": " "}, + {"type": "Characters", "data": "with"}, + {"type": "SpaceCharacters", "data": " "}, + {"type": "StartTag", "name": "em", "data": []}, + {"type": "Characters", "data": "emphasised text"}, + {"type": "EndTag", "name": "em", "data": []}, + {"type": "Characters", "data": " and an "}, + {"type": "StartTag", "name": "img", "data": [["alt", "image"]]}, + {"type": "Characters", "data": "."}]) + + +def testLeadingWhitespace(): + runTest( + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "SpaceCharacters", "data": spaceCharacters}, + {"type": "Characters", "data": "foo"}, + {"type": "EndTag", "name": "p", "data": []}], + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "SpaceCharacters", "data": " "}, + {"type": "Characters", "data": "foo"}, + {"type": "EndTag", "name": "p", "data": []}]) + + +def testLeadingWhitespaceAsCharacters(): + runTest( + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": spaceCharacters + "foo"}, + {"type": "EndTag", "name": "p", "data": []}], + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": " foo"}, + {"type": "EndTag", "name": "p", "data": []}]) + + +def testTrailingWhitespace(): + runTest( + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": "foo"}, + {"type": "SpaceCharacters", "data": 
spaceCharacters}, + {"type": "EndTag", "name": "p", "data": []}], + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": "foo"}, + {"type": "SpaceCharacters", "data": " "}, + {"type": "EndTag", "name": "p", "data": []}]) + + +def testTrailingWhitespaceAsCharacters(): + runTest( + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": "foo" + spaceCharacters}, + {"type": "EndTag", "name": "p", "data": []}], + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": "foo "}, + {"type": "EndTag", "name": "p", "data": []}]) + + +def testWhitespace(): + runTest( + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": "foo" + spaceCharacters + "bar"}, + {"type": "EndTag", "name": "p", "data": []}], + [{"type": "StartTag", "name": "p", "data": []}, + {"type": "Characters", "data": "foo bar"}, + {"type": "EndTag", "name": "p", "data": []}]) + + +def testLeadingWhitespaceInPre(): + runTestUnmodifiedOutput( + [{"type": "StartTag", "name": "pre", "data": []}, + {"type": "SpaceCharacters", "data": spaceCharacters}, + {"type": "Characters", "data": "foo"}, + {"type": "EndTag", "name": "pre", "data": []}]) + + +def testLeadingWhitespaceAsCharactersInPre(): + runTestUnmodifiedOutput( + [{"type": "StartTag", "name": "pre", "data": []}, + {"type": "Characters", "data": spaceCharacters + "foo"}, + {"type": "EndTag", "name": "pre", "data": []}]) + + +def testTrailingWhitespaceInPre(): + runTestUnmodifiedOutput( + [{"type": "StartTag", "name": "pre", "data": []}, + {"type": "Characters", "data": "foo"}, + {"type": "SpaceCharacters", "data": spaceCharacters}, + {"type": "EndTag", "name": "pre", "data": []}]) + + +def testTrailingWhitespaceAsCharactersInPre(): + runTestUnmodifiedOutput( + [{"type": "StartTag", "name": "pre", "data": []}, + {"type": "Characters", "data": "foo" + spaceCharacters}, + {"type": "EndTag", "name": "pre", "data": []}]) + + +def testWhitespaceInPre(): + 
runTestUnmodifiedOutput( + [{"type": "StartTag", "name": "pre", "data": []}, + {"type": "Characters", "data": "foo" + spaceCharacters + "bar"}, + {"type": "EndTag", "name": "pre", "data": []}]) From 1df7e5f6ec91584f43b5067d18732673f842f587 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 16:24:56 +0100 Subject: [PATCH 005/219] Cleanup test_parser2.py a bit --- html5lib/tests/test_parser2.py | 44 ++++++++++++++++------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 26eff241..2f3ba2c8 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -7,51 +7,47 @@ from . import support # flake8: noqa from html5lib import html5parser from html5lib.constants import namespaces -from html5lib import treebuilders +from html5lib import parse # tests that aren't autogenerated from text files -@pytest.fixture -def dom_tree(): - return treebuilders.getTreeBuilder("dom") - - -def test_assertDoctypeCloneable(dom_tree): - parser = html5parser.HTMLParser(tree=dom_tree) - doc = parser.parse('') +def test_assertDoctypeCloneable(): + doc = parse('', treebuilder="dom") assert doc.cloneNode(True) is not None -def test_line_counter(dom_tree): +def test_line_counter(): # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0 - parser = html5parser.HTMLParser(tree=dom_tree) - parser.parse("
\nx\n>\n
") + assert parse("
\nx\n>\n
") is not None -def test_namespace_html_elements_0_dom(dom_tree): - parser = html5parser.HTMLParser(tree=dom_tree, namespaceHTMLElements=True) - doc = parser.parse("") +def test_namespace_html_elements_0_dom(): + doc = parse("", + treebuilder="dom", + namespaceHTMLElements=True) assert doc.childNodes[0].namespaceURI == namespaces["html"] -def test_namespace_html_elements_1_dom(dom_tree): - parser = html5parser.HTMLParser(tree=dom_tree, namespaceHTMLElements=False) - doc = parser.parse("") +def test_namespace_html_elements_1_dom(): + doc = parse("", + treebuilder="dom", + namespaceHTMLElements=False) assert doc.childNodes[0].namespaceURI is None def test_namespace_html_elements_0_etree(): - parser = html5parser.HTMLParser(namespaceHTMLElements=True) - doc = parser.parse("") + doc = parse("", + treebuilder="etree", + namespaceHTMLElements=True) assert doc.tag == "{%s}html" % (namespaces["html"],) def test_namespace_html_elements_1_etree(): - parser = html5parser.HTMLParser(namespaceHTMLElements=False) - doc = parser.parse("") + doc = parse("", + treebuilder="etree", + namespaceHTMLElements=False) assert doc.tag == "html" def test_unicode_file(): - parser = html5parser.HTMLParser() - parser.parse(io.StringIO("a")) + assert parse(io.StringIO("a")) is not None From c64bfca788e2f858d336105eb7da18399a7894c6 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 3 Dec 2015 18:21:33 +0000 Subject: [PATCH 006/219] Get rid of mutable default arguments --- html5lib/html5parser.py | 4 +++- html5lib/treebuilders/etree_lxml.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index e6808425..43c1dc61 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -259,8 +259,10 @@ def parseFragment(self, stream, container="div", encoding=None, encoding=encoding, scripting=scripting) return self.tree.getFragment() - def parseError(self, errorcode="XXX-undefined-error", datavars={}): + def 
parseError(self, errorcode="XXX-undefined-error", datavars=None): # XXX The idea is to make errorcode mandatory. + if datavars is None: + datavars = {} self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: raise ParseError(E[errorcode] % datavars) diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 138b30bd..79a4d4c5 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -193,7 +193,9 @@ def __init__(self, namespaceHTMLElements, fullTree=False): self.namespaceHTMLElements = namespaceHTMLElements class Attributes(dict): - def __init__(self, element, value={}): + def __init__(self, element, value=None): + if value is None: + value = {} self._element = element dict.__init__(self, value) for key, value in self.items(): From c1c16ceed7ff484b6ce056b0a377404aa06e01f7 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 4 Dec 2015 02:14:15 +0000 Subject: [PATCH 007/219] Avoid noisiness from pylint and the parser's set patterns --- html5lib/html5parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 43c1dc61..aad6a059 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -460,6 +460,7 @@ def getMetaclass(use_metaclass, metaclass_func): else: return type + # pylint:disable=unused-argument class Phase(with_metaclass(getMetaclass(debug, log))): """Base class for helper object that implements each phase of processing """ @@ -2765,6 +2766,7 @@ def startTagOther(self, token): def processEndTag(self, token): self.parser.parseError("expected-eof-but-got-end-tag", {"name": token["name"]}) + # pylint:enable=unused-argument return { "initial": InitialPhase, From 2c3b64b0b9cbd7ffcd67f3ddae93a0a8d75af908 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 17:23:40 +0100 Subject: [PATCH 008/219] add pep8/flake8 config to get something useful happening with them this 
makes run-flake8.sh just call flake8 simply --- flake8-run.sh | 7 ++----- html5lib/filters/sanitizer.py | 2 +- html5lib/ihatexml.py | 4 ++-- html5lib/inputstream.py | 2 +- html5lib/tests/test_sanitizer.py | 3 ++- setup.cfg | 9 +++++++++ 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/flake8-run.sh b/flake8-run.sh index 685ec6ab..b175ec80 100755 --- a/flake8-run.sh +++ b/flake8-run.sh @@ -5,8 +5,5 @@ if [[ ! -x $(which flake8) ]]; then exit 1 fi -find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501 -flake1=$? -flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py -flake2=$? -exit $[$flake1 || $flake2] +flake8 html5lib +exit $? diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index caddd318..fdd4181d 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -862,7 +862,7 @@ def sanitize_css(self, style): 'padding']: for keyword in value.split(): if keyword not in self.allowed_css_keywords and \ - not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): + not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa break else: clean.append(prop + ': ' + value + ';') diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 5da5d938..57fec9d6 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -175,9 +175,9 @@ def escapeRegexp(string): return string # output from the above -nonXmlNameBMPRegexp = 
re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48
\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b2
9\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa -nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u31
04\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u2
12f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa # Simpler things nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 15acba0d..5cfc2cc5 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -32,7 +32,7 @@ class BufferedIOBase(object): spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" +invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa if utils.supports_lone_surrogates: # Use one extra step of indirection and create surrogates with diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 1f8a06f6..9f8ae22c 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -63,7 +63,8 @@ def test_sanitizer(): for ns, tag_name in sanitizer.allowed_elements: if ns != constants.namespaces["html"]: continue - if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'select']: + if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 
'tbody', 'td', + 'tfoot', 'th', 'thead', 'tr', 'select']: continue # TODO if tag_name == 'image': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, diff --git a/setup.cfg b/setup.cfg index 2a9acf13..3152ac54 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,11 @@ [bdist_wheel] universal = 1 + +[pep8] +ignore = N +max-line-length = 139 +exclude = .git,__pycache__,.tox,doc + +[flake8] +ignore = N +max-line-length = 139 From 823864882ee969ebb7c16986a80388d5785cb9ea Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 17:37:55 +0100 Subject: [PATCH 009/219] Fix all the files outside of html5lib to flake8 cleanly --- flake8-run.sh | 2 +- parse.py | 31 +++++++++++++++++------------ setup.py | 12 ++++++------ utils/entities.py | 50 +++++++++++++++++++++++++++++------------------ utils/spider.py | 43 +++++++++++++++++++++------------------- 5 files changed, 80 insertions(+), 58 deletions(-) diff --git a/flake8-run.sh b/flake8-run.sh index b175ec80..d9264946 100755 --- a/flake8-run.sh +++ b/flake8-run.sh @@ -5,5 +5,5 @@ if [[ ! -x $(which flake8) ]]; then exit 1 fi -flake8 html5lib +flake8 `dirname $0` exit $? 
diff --git a/parse.py b/parse.py index cceea84d..2ed8f1c2 100755 --- a/parse.py +++ b/parse.py @@ -5,7 +5,6 @@ """ import sys -import os import traceback from optparse import OptionParser @@ -15,9 +14,10 @@ from html5lib import constants from html5lib import utils + def parse(): optParser = getOptParser() - opts,args = optParser.parse_args() + opts, args = optParser.parse_args() encoding = "utf8" try: @@ -25,7 +25,10 @@ def parse(): # Try opening from the internet if f.startswith('http://'): try: - import urllib.request, urllib.parse, urllib.error, cgi + import urllib.request + import urllib.parse + import urllib.error + import cgi f = urllib.request.urlopen(f) contentType = f.headers.get('content-type') if contentType: @@ -41,7 +44,7 @@ def parse(): try: # Try opening from file system f = open(f, "rb") - except IOError as e: + except IOError as e: sys.stderr.write("Unable to open file: %s\n" % e) sys.exit(1) except IndexError: @@ -82,14 +85,15 @@ def parse(): if document: printOutput(p, document, opts) t2 = time.time() - sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)) + sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1)) else: - sys.stderr.write("\n\nRun took: %fs"%(t1-t0)) + sys.stderr.write("\n\nRun took: %fs" % (t1 - t0)) else: document = run(parseMethod, f, encoding, opts.scripting) if document: printOutput(p, document, opts) + def run(parseMethod, f, encoding, scripting): try: document = parseMethod(f, encoding=encoding, scripting=scripting) @@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting): traceback.print_exc() return document + def printOutput(parser, document, opts): if opts.encoding: print("Encoding:", parser.tokenizer.stream.charEncoding) @@ -116,7 +121,7 @@ def printOutput(parser, document, opts): elif tb == "etree": sys.stdout.write(utils.default_etree.tostring(document)) elif opts.tree: - if not hasattr(document,'__getitem__'): + if not hasattr(document, 
'__getitem__'): document = [document] for fragment in document: print(parser.tree.testSerializer(fragment)) @@ -126,7 +131,7 @@ def printOutput(parser, document, opts): kwargs = {} for opt in serializer.HTMLSerializer.options: try: - kwargs[opt] = getattr(opts,opt) + kwargs[opt] = getattr(opts, opt) except: pass if not kwargs['quote_char']: @@ -142,12 +147,14 @@ def printOutput(parser, document, opts): encoding = "utf-8" for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding): sys.stdout.write(text) - if not text.endswith('\n'): sys.stdout.write('\n') + if not text.endswith('\n'): + sys.stdout.write('\n') if opts.error: - errList=[] + errList = [] for pos, errorcode, datavars in parser.errors: - errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) - sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n") + errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) + sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n") + def getOptParser(): parser = OptionParser(usage=__doc__) diff --git a/setup.py b/setup.py index b6ea24af..b42ba400 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup -classifiers=[ +classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', @@ -20,9 +20,9 @@ 'Programming Language :: Python :: 3.5', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing :: Markup :: HTML' - ] +] -packages = ['html5lib'] + ['html5lib.'+name +packages = ['html5lib'] + ['html5lib.' 
+ name for name in os.listdir(os.path.join('html5lib')) if os.path.isdir(os.path.join('html5lib', name)) and not name.startswith('.') and name != 'tests'] @@ -39,9 +39,9 @@ assignments = filter(lambda x: isinstance(x, ast.Assign), t.body) for a in assignments: if (len(a.targets) == 1 and - isinstance(a.targets[0], ast.Name) and - a.targets[0].id == "__version__" and - isinstance(a.value, ast.Str)): + isinstance(a.targets[0], ast.Name) and + a.targets[0].id == "__version__" and + isinstance(a.value, ast.Str)): version = a.value.s setup(name='html5lib', diff --git a/utils/entities.py b/utils/entities.py index 116a27cb..6dccf5f0 100644 --- a/utils/entities.py +++ b/utils/entities.py @@ -2,50 +2,59 @@ import html5lib + def parse(path="html5ents.xml"): return html5lib.parse(open(path), treebuilder="lxml") + def entity_table(tree): return dict((entity_name("".join(tr[0].xpath(".//text()"))), entity_characters(tr[1].text)) for tr in tree.xpath("//h:tbody/h:tr", - namespaces={"h":"http://www.w3.org/1999/xhtml"})) + namespaces={"h": "http://www.w3.org/1999/xhtml"})) + def entity_name(inp): return inp.strip() + def entity_characters(inp): return "".join(codepoint_to_character(item) - for item in inp.split() - if item) + for item in inp.split() + if item) + def codepoint_to_character(inp): - return ("\U000"+inp[2:]).decode("unicode-escape") + return ("\\U000" + inp[2:]).decode("unicode-escape") + def make_tests_json(entities): test_list = make_test_list(entities) tests_json = {"tests": - [make_test(*item) for item in test_list] + [make_test(*item) for item in test_list] } return tests_json + def make_test(name, characters, good): return { - "description":test_description(name, good), - "input":"&%s"%name, - "output":test_expected(name, characters, good) - } + "description": test_description(name, good), + "input": "&%s" % name, + "output": test_expected(name, characters, good) + } + def test_description(name, good): with_semicolon = name.endswith(";") - semicolon_text = 
{True:"with a semi-colon", - False:"without a semi-colon"}[with_semicolon] + semicolon_text = {True: "with a semi-colon", + False: "without a semi-colon"}[with_semicolon] if good: - text = "Named entity: %s %s"%(name, semicolon_text) + text = "Named entity: %s %s" % (name, semicolon_text) else: - text = "Bad named entity: %s %s"%(name, semicolon_text) + text = "Bad named entity: %s %s" % (name, semicolon_text) return text + def test_expected(name, characters, good): rv = [] if not good or not name.endswith(";"): @@ -53,6 +62,7 @@ def test_expected(name, characters, good): rv.append(["Character", characters]) return rv + def make_test_list(entities): tests = [] for entity_name, characters in entities.items(): @@ -61,20 +71,23 @@ def make_test_list(entities): tests.append((entity_name, characters, True)) return sorted(tests) + def subentity_exists(entity_name, entities): for i in range(1, len(entity_name)): if entity_name[:-i] in entities: return True return False + def make_entities_code(entities): - entities_text = "\n".join(" \"%s\": u\"%s\","%( - name, entities[name].encode( - "unicode-escape").replace("\"", "\\\"")) - for name in sorted(entities.keys())) + entities_text = "\n".join(" \"%s\": u\"%s\"," % ( + name, entities[name].encode( + "unicode-escape").replace("\"", "\\\"")) + for name in sorted(entities.keys())) return """entities = { %s -}"""%entities_text +}""" % entities_text + def main(): entities = entity_table(parse()) @@ -85,4 +98,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/utils/spider.py b/utils/spider.py index ac5f9fbe..3a325888 100644 --- a/utils/spider.py +++ b/utils/spider.py @@ -7,7 +7,9 @@ s.spider("http://www.google.com", maxURLs=100) """ -import urllib.request, urllib.error, urllib.parse +import urllib.request +import urllib.error +import urllib.parse import urllib.robotparser import md5 @@ -16,11 +18,13 @@ import html5lib from html5lib.treebuilders import etree + class Spider(object): + def __init__(self): 
self.unvisitedURLs = set() self.visitedURLs = set() - self.buggyURLs=set() + self.buggyURLs = set() self.robotParser = urllib.robotparser.RobotFileParser() self.contentDigest = {} self.http = httplib2.Http(".cache") @@ -70,18 +74,18 @@ def updateURLs(self, tree): update the list of visited and unvisited URLs according to whether we have seen them before or not""" urls = set() - #Remove all links we have already visited + # Remove all links we have already visited for link in tree.findall(".//a"): - try: - url = urllib.parse.urldefrag(link.attrib['href'])[0] - if (url and url not in self.unvisitedURLs and url + try: + url = urllib.parse.urldefrag(link.attrib['href'])[0] + if (url and url not in self.unvisitedURLs and url not in self.visitedURLs): - urls.add(url) - except KeyError: - pass + urls.add(url) + except KeyError: + pass - #Remove all non-http URLs and add a suitable base URL where that is - #missing + # Remove all non-http URLs and add a suitable base URL where that is + # missing newUrls = set() for url in urls: splitURL = list(urllib.parse.urlsplit(url)) @@ -93,23 +97,22 @@ def updateURLs(self, tree): urls = newUrls responseHeaders = {} - #Now we want to find the content types of the links we haven't visited + # Now we want to find the content types of the links we haven't visited for url in urls: try: resp, content = self.http.request(url, "HEAD") responseHeaders[url] = resp - except AttributeError as KeyError: - #Don't know why this happens + except AttributeError: + # Don't know why this happens pass - - #Remove links not of content-type html or pages not found - #XXX - need to deal with other status codes? + # Remove links not of content-type html or pages not found + # XXX - need to deal with other status codes? 
toVisit = set([url for url in urls if url in responseHeaders and - "html" in responseHeaders[url]['content-type'] and - responseHeaders[url]['status'] == "200"]) + "html" in responseHeaders[url]['content-type'] and + responseHeaders[url]['status'] == "200"]) - #Now check we are allowed to spider the page + # Now check we are allowed to spider the page for url in toVisit: robotURL = list(urllib.parse.urlsplit(url)[:2]) robotURL.extend(["robots.txt", "", ""]) From de6bcf22e8171e06b0e07558b699075f1b970dd0 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 18:12:08 +0100 Subject: [PATCH 010/219] Fix incorrectly hidden flake8 errors --- html5lib/tests/support.py | 10 +++++----- html5lib/tests/test_encoding.py | 6 +++--- html5lib/tests/test_parser2.py | 20 +++++++++----------- html5lib/tests/test_stream.py | 28 ++++++++++++++++++++++++++-- html5lib/tests/test_treeadapters.py | 4 ++-- html5lib/tokenizer.py | 6 +++--- html5lib/treeadapters/__init__.py | 2 +- 7 files changed, 49 insertions(+), 27 deletions(-) diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index 6e6a916b..5f3cc619 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -13,7 +13,7 @@ os.path.pardir, os.path.pardir))) -from html5lib import treebuilders, treewalkers, treeadapters +from html5lib import treebuilders, treewalkers, treeadapters # noqa del base_path # Build a dict of available trees @@ -26,14 +26,14 @@ } # ElementTree impls -import xml.etree.ElementTree as ElementTree +import xml.etree.ElementTree as ElementTree # noqa treeTypes['ElementTree'] = { "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True), "walker": treewalkers.getTreeWalker("etree", ElementTree) } try: - import xml.etree.cElementTree as cElementTree + import xml.etree.cElementTree as cElementTree # noqa except ImportError: treeTypes['cElementTree'] = None else: @@ -47,7 +47,7 @@ } try: - import lxml.etree as lxml # flake8: noqa + import lxml.etree as lxml # 
noqa except ImportError: treeTypes['lxml'] = None else: @@ -58,7 +58,7 @@ # Genshi impls try: - import genshi # flake8: noqa + import genshi # noqa except ImportError: pass else: diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 09504654..16dd1189 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -57,13 +57,13 @@ def test_encoding(): try: try: - import charade # flake8: noqa + import charade # noqa except ImportError: - import chardet # flake8: noqa + import chardet # noqa except ImportError: print("charade/chardet not found, skipping chardet tests") else: def test_chardet(): - with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp: + with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: encoding = inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 2f3ba2c8..f8e1ac43 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -2,10 +2,8 @@ import io -import pytest +from . import support # noqa -from . 
import support # flake8: noqa -from html5lib import html5parser from html5lib.constants import namespaces from html5lib import parse @@ -23,29 +21,29 @@ def test_line_counter(): def test_namespace_html_elements_0_dom(): doc = parse("", - treebuilder="dom", - namespaceHTMLElements=True) + treebuilder="dom", + namespaceHTMLElements=True) assert doc.childNodes[0].namespaceURI == namespaces["html"] def test_namespace_html_elements_1_dom(): doc = parse("", - treebuilder="dom", - namespaceHTMLElements=False) + treebuilder="dom", + namespaceHTMLElements=False) assert doc.childNodes[0].namespaceURI is None def test_namespace_html_elements_0_etree(): doc = parse("", - treebuilder="etree", - namespaceHTMLElements=True) + treebuilder="etree", + namespaceHTMLElements=True) assert doc.tag == "{%s}html" % (namespaces["html"],) def test_namespace_html_elements_1_etree(): doc = parse("", - treebuilder="etree", - namespaceHTMLElements=False) + treebuilder="etree", + namespaceHTMLElements=False) assert doc.tag == "html" diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 3b659fbb..a92ee0a3 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,9 +1,9 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # flake8: noqa +from . 
import support # noqa + import codecs from io import BytesIO -import socket import six from six.moves import http_client, urllib @@ -11,12 +11,14 @@ from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) + def test_basic(): s = b"abc" fp = BufferedStream(BytesIO(s)) read = fp.read(10) assert read == s + def test_read_length(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) @@ -28,17 +30,23 @@ def test_read_length(): read4 = fp.read(4) assert read4 == b"" + def test_tell(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) + assert read1 == b"a" assert fp.tell() == 1 read2 = fp.read(2) + assert read2 == b"bc" assert fp.tell() == 3 read3 = fp.read(3) + assert read3 == b"def" assert fp.tell() == 6 read4 = fp.read(4) + assert read4 == b"" assert fp.tell() == 6 + def test_seek(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) @@ -55,20 +63,26 @@ def test_seek(): read5 = fp.read(2) assert read5 == b"ef" + def test_seek_tell(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) + assert read1 == b"a" assert fp.tell() == 1 fp.seek(0) read2 = fp.read(1) + assert read2 == b"a" assert fp.tell() == 1 read3 = fp.read(2) + assert read3 == b"bc" assert fp.tell() == 3 fp.seek(2) read4 = fp.read(2) + assert read4 == b"cd" assert fp.tell() == 4 fp.seek(4) read5 = fp.read(2) + assert read5 == b"ef" assert fp.tell() == 6 @@ -85,11 +99,13 @@ def test_char_ascii(): assert stream.charEncoding[0].name == 'windows-1252' assert stream.char() == "'" + def test_char_utf8(): stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == '\u2018' + def test_char_win1252(): stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252')) assert stream.charEncoding[0].name == 'windows-1252' @@ -97,16 +113,19 @@ def test_char_win1252(): assert stream.char() == "\xf1" assert stream.char() == "\u2019" + def test_bom(): stream 
= HTMLInputStream(codecs.BOM_UTF8 + b"'") assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == "'" + def test_utf_16(): stream = HTMLInputStream((' ' * 1025).encode('utf-16')) assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be'] assert len(stream.charsUntil(' ', True)) == 1025 + def test_newlines(): stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe") assert stream.position() == (1, 0) @@ -117,11 +136,13 @@ def test_newlines(): assert stream.charsUntil('e') == "x" assert stream.position() == (4, 5) + def test_newlines2(): size = HTMLUnicodeInputStream._defaultChunkSize stream = HTMLInputStream("\r" * size + "\n") assert stream.charsUntil('x') == "\n" * size + def test_position(): stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh") assert stream.position() == (1, 0) @@ -140,6 +161,7 @@ def test_position(): assert stream.charsUntil('h') == "e\nf\ng" assert stream.position() == (6, 1) + def test_position2(): stream = HTMLUnicodeInputStreamShortChunk("abc\nd") assert stream.position() == (1, 0) @@ -154,6 +176,7 @@ def test_position2(): assert stream.char() == "d" assert stream.position() == (2, 1) + def test_python_issue_20007(): """ Make sure we have a work-around for Python bug #20007 @@ -168,6 +191,7 @@ def makefile(self, _mode, _bufsize=None): stream = HTMLInputStream(source) assert stream.charsUntil(" ") == "Text" + def test_python_issue_20007_b(): """ Make sure we have a work-around for Python bug #20007 diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py index 5f38b6c3..95e56c00 100644 --- a/html5lib/tests/test_treeadapters.py +++ b/html5lib/tests/test_treeadapters.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # flake8: noqa +from . 
import support # noqa import html5lib from html5lib.treeadapters import sax @@ -25,7 +25,7 @@ def test_to_sax(): ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'), ('characters', '\n '), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'), - ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}), diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 79774578..3d21c32d 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, unicode_literals try: - chr = unichr # flake8: noqa + chr = unichr # noqa except NameError: pass @@ -147,8 +147,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): output = "&" charStack = [self.stream.char()] - if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") - or (allowedChar is not None and allowedChar == charStack[0])): + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or + (allowedChar is not None and allowedChar == charStack[0])): self.stream.unget(charStack[0]) elif charStack[0] == "#": diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py index 57d71304..4f978466 100644 --- a/html5lib/treeadapters/__init__.py +++ b/html5lib/treeadapters/__init__.py @@ -5,7 +5,7 @@ __all__ = ["sax"] try: - from . import genshi # flake8: noqa + from . 
import genshi # noqa except ImportError: pass else: From 0bd31c4251889a4216b8ac3a59e5833534643e48 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 18:58:42 +0100 Subject: [PATCH 011/219] Get rid of type()-based type-check --- html5lib/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/utils.py b/html5lib/utils.py index c70de172..c83a089f 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -52,7 +52,7 @@ def __init__(self, items=()): # anything here. _dictEntries = [] for name, value in items: - if type(name) in (list, tuple, frozenset, set): + if isinstance(name, (list, tuple, frozenset, set)): for item in name: _dictEntries.append((item, value)) else: From d440a830fb75beafed838327c21e9a8a773c9743 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 19:16:57 +0100 Subject: [PATCH 012/219] Silence pytest unused-variable warnings --- html5lib/ihatexml.py | 2 +- html5lib/inputstream.py | 4 ++-- html5lib/serializer/htmlserializer.py | 2 +- html5lib/tests/test_encoding.py | 2 +- html5lib/tests/test_serializer.py | 2 +- html5lib/tests/test_treewalkers.py | 2 +- html5lib/tokenizer.py | 4 ++-- html5lib/treebuilders/dom.py | 2 +- html5lib/treebuilders/etree.py | 4 ++-- html5lib/treewalkers/etree.py | 4 ++-- html5lib/treewalkers/genshistream.py | 2 +- 11 files changed, 15 insertions(+), 15 deletions(-) diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 57fec9d6..738d2457 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -232,7 +232,7 @@ def coerceComment(self, data): def coerceCharacters(self, data): if self.replaceFormFeedCharacters: - for i in range(data.count("\x0C")): + for _ in range(data.count("\x0C")): warnings.warn("Text cannot contain U+000C", DataLossWarning) data = data.replace("\x0C", " ") # Other non-xml characters diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 5cfc2cc5..4231ae19 100644 --- a/html5lib/inputstream.py +++ 
b/html5lib/inputstream.py @@ -296,7 +296,7 @@ def readChunk(self, chunkSize=None): return True def characterErrorsUCS4(self, data): - for i in range(len(invalid_unicode_re.findall(data))): + for _ in range(len(invalid_unicode_re.findall(data))): self.errors.append("invalid-codepoint") def characterErrorsUCS2(self, data): @@ -681,7 +681,7 @@ def getEncoding(self): (b" 0: - for i in range(nullCount): + for _ in range(nullCount): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) data = data.replace("\u0000", "\uFFFD") diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 8656244f..27432c7a 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -109,7 +109,7 @@ def getNameTuple(self): nameTuple = property(getNameTuple) - class TreeBuilder(_base.TreeBuilder): + class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable def documentClass(self): self.dom = Dom.getDOMImplementation().createDocument(None, None, None) return weakref.proxy(self) diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 2c8ed19f..b607948b 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -253,7 +253,7 @@ def serializeElement(element, indent=0): return "\n".join(rv) - def tostring(element): + def tostring(element): # pylint:disable=unused-variable """Serialize an element and its child nodes to a string""" rv = [] filter = ihatexml.InfosetFilter() @@ -307,7 +307,7 @@ def serializeElement(element): return "".join(rv) - class TreeBuilder(_base.TreeBuilder): + class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable documentClass = Document doctypeClass = DocumentType elementClass = Element diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 73c8e26a..d3b0c50e 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -22,7 +22,7 @@ def 
getETreeBuilder(ElementTreeImplementation): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class TreeWalker(_base.NonRecursiveTreeWalker): + class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable """Given the particular ElementTree representation, this implementation, to avoid using recursion, returns "nodes" as tuples with the following content: @@ -38,7 +38,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): """ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element - elt, key, parents, flag = node + elt, _, _, flag = node if flag in ("text", "tail"): return _base.TEXT, getattr(elt, flag) else: diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 83cd1654..61cbfede 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -25,7 +25,7 @@ def __iter__(self): yield token def tokens(self, event, next): - kind, data, pos = event + kind, data, _ = event if kind == START: tag, attribs = data name = tag.localname From 5c1d8e2743383b3875ef840cc0ab842dbc1e7618 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 19:21:57 +0100 Subject: [PATCH 013/219] Remove duplicate entry from constants.replacementCharacters --- html5lib/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index 2244933c..df1f061e 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -2819,7 +2819,6 @@ 0x0d: "\u000D", 0x80: "\u20AC", 0x81: "\u0081", - 0x81: "\u0081", 0x82: "\u201A", 0x83: "\u0192", 0x84: "\u201E", From 1b86ccbeec08069d1a40cd22d0dcc8492bdd789a Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 19:23:44 +0100 Subject: [PATCH 014/219] Remove gratuitious argument in sanitizer --- html5lib/filters/sanitizer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index fdd4181d..7f81c0d1 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -765,15 +765,15 @@ def sanitize_token(self, token): if ((namespace, name) in self.allowed_elements or (namespace is None and (namespaces["html"], name) in self.allowed_elements)): - return self.allowed_token(token, token_type) + return self.allowed_token(token) else: - return self.disallowed_token(token, token_type) + return self.disallowed_token(token) elif token_type == "Comment": pass else: return token - def allowed_token(self, token, token_type): + def allowed_token(self, token): if "data" in token: attrs = token["data"] attr_names = set(attrs.keys()) @@ -823,7 +823,8 @@ def allowed_token(self, token, token_type): token["data"] = attrs return token - def disallowed_token(self, token, token_type): + def disallowed_token(self, token): + token_type = token["type"] if token_type == "EndTag": token["data"] = "" % token["name"] elif token["data"]: From 82d623bc8287d00db13ca98bf9e6d7a1921c6a56 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 19:39:29 +0100 Subject: [PATCH 015/219] Silence redefined-variable-type --- html5lib/html5parser.py | 2 +- html5lib/tokenizer.py | 2 +- html5lib/treewalkers/lxmletree.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index aad6a059..86b3e609 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -121,7 +121,7 @@ def reset(self): self.phase.insertHtmlElement() self.resetInsertionMode() else: - self.innerHTML = False + self.innerHTML = False # pylint:disable=redefined-variable-type self.phase = self.phases["initial"] self.lastPhase = None diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index ef7a7b1f..50e505a9 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -1716,7 +1716,7 @@ def cdataSectionState(self): else: data.append(char) - 
data = "".join(data) + data = "".join(data) # pylint:disable=redefined-variable-type # Deal with null here rather than in the parser nullCount = data.count("\u0000") if nullCount > 0: diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 36850086..7d99adc2 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -117,6 +117,7 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): + # pylint:disable=redefined-variable-type if hasattr(tree, "getroot"): self.fragmentChildren = set() tree = Root(tree) From a017b8881f42b2ab21a2f47af993ba6d58b25ca2 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 19:59:08 +0100 Subject: [PATCH 016/219] Silence unused-argument --- html5lib/html5parser.py | 2 ++ html5lib/ihatexml.py | 4 ++-- html5lib/inputstream.py | 1 + html5lib/serializer/htmlserializer.py | 2 +- html5lib/tests/test_sanitizer.py | 2 +- html5lib/tests/test_stream.py | 2 ++ html5lib/tests/tokenizer.py | 1 + html5lib/treebuilders/etree_lxml.py | 2 ++ 8 files changed, 12 insertions(+), 4 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 86b3e609..66ad7430 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -241,6 +241,7 @@ def parse(self, stream, encoding=None, parseMeta=True, def parseFragment(self, stream, container="div", encoding=None, parseMeta=False, useChardet=True, scripting=False): + # pylint:disable=unused-argument """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property @@ -363,6 +364,7 @@ def adjustForeignAttributes(self, token): del token["data"][originalName] def reparseTokenNormal(self, token): + # pylint:disable=unused-argument self.parser.phase() def resetInsertionMode(self): diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 738d2457..d6d1d6fb 100644 --- a/html5lib/ihatexml.py +++ 
b/html5lib/ihatexml.py @@ -186,7 +186,7 @@ def escapeRegexp(string): class InfosetFilter(object): replacementRegexp = re.compile(r"U[\dA-F]{5,5}") - def __init__(self, replaceChars=None, + def __init__(self, dropXmlnsLocalName=False, dropXmlnsAttrNs=False, preventDoubleDashComments=False, @@ -217,7 +217,7 @@ def coerceAttribute(self, name, namespace=None): else: return self.toXmlName(name) - def coerceElement(self, name, namespace=None): + def coerceElement(self, name): return self.toXmlName(name) def coerceComment(self, data): diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 4231ae19..a9aa2a15 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -571,6 +571,7 @@ def __new__(self, value): return bytes.__new__(self, value.lower()) def __init__(self, value): + # pylint:disable=unused-argument self._position = -1 def __iter__(self): diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index d66ff36c..641d8c1c 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -328,6 +328,6 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): raise SerializeError -def SerializeError(Exception): +class SerializeError(Exception): """Error in serialized tree""" pass diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 9f8ae22c..e19deea8 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -4,7 +4,7 @@ from html5lib.filters import sanitizer -def runSanitizerTest(name, expected, input): +def runSanitizerTest(_, expected, input): parsed = parseFragment(expected) expected = serialize(parsed, omit_optional_tags=False, diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index a92ee0a3..835e32e5 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -184,6 +184,7 @@ def test_python_issue_20007(): """ class FakeSocket(object): def makefile(self, 
_mode, _bufsize=None): + # pylint:disable=unused-argument return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") source = http_client.HTTPResponse(FakeSocket()) @@ -202,6 +203,7 @@ def test_python_issue_20007_b(): class FakeSocket(object): def makefile(self, _mode, _bufsize=None): + # pylint:disable=unused-argument return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") source = http_client.HTTPResponse(FakeSocket()) diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index c6163a1f..255c1859 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -19,6 +19,7 @@ def __init__(self, initialState, lastStartTag=None): self._lastStartTag = lastStartTag def parse(self, stream, encoding=None, innerHTML=False): + # pylint:disable=unused-argument tokenizer = self.tokenizer(stream, encoding) self.outputTokens = [] diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 79a4d4c5..a92b3aa9 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -305,6 +305,8 @@ def insertDoctype(self, token): self.doctype = doctype def insertCommentInitial(self, data, parent=None): + assert parent is None or parent is self.document + assert self.document._elementTree is None self.initial_comments.append(data) def insertCommentMain(self, data, parent=None): From e5d395c28c7357ace9352fd162f8efe1d8ac8143 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 20 May 2016 20:25:29 +0100 Subject: [PATCH 017/219] Silence wrong-import-position --- html5lib/serializer/htmlserializer.py | 103 ++++++++++++-------------- html5lib/tests/support.py | 4 + html5lib/tests/test_encoding.py | 2 + html5lib/tests/test_serializer.py | 2 + html5lib/tokenizer.py | 5 +- html5lib/trie/__init__.py | 2 + 6 files changed, 60 insertions(+), 58 deletions(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 641d8c1c..be2718d3 100644 --- a/html5lib/serializer/htmlserializer.py 
+++ b/html5lib/serializer/htmlserializer.py @@ -3,6 +3,8 @@ import re +from codecs import register_error, xmlcharrefreplace_errors + from ..constants import voidElements, booleanAttributes, spaceCharacters from ..constants import rcdataElements, entities, xmlEntities from .. import utils @@ -21,61 +23,54 @@ "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" "\u3000]") -try: - from codecs import register_error, xmlcharrefreplace_errors -except ImportError: - unicode_encode_errors = "strict" -else: - unicode_encode_errors = "htmlentityreplace" - - encode_entity_map = {} - is_ucs4 = len("\U0010FFFF") == 1 - for k, v in list(entities.items()): - # skip multi-character entities - if ((is_ucs4 and len(v) > 1) or - (not is_ucs4 and len(v) > 2)): - continue - if v != "&": - if len(v) == 2: - v = utils.surrogatePairToCodepoint(v) - else: - v = ord(v) - if v not in encode_entity_map or k.islower(): - # prefer < over < and similarly for &, >, etc. - encode_entity_map[v] = k - - def htmlentityreplace_errors(exc): - if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): - res = [] - codepoints = [] - skip = False - for i, c in enumerate(exc.object[exc.start:exc.end]): - if skip: - skip = False - continue - index = i + exc.start - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) - skip = True - else: - codepoint = ord(c) - codepoints.append(codepoint) - for cp in codepoints: - e = encode_entity_map.get(cp) - if e: - res.append("&") - res.append(e) - if not e.endswith(";"): - res.append(";") - else: - res.append("&#x%s;" % (hex(cp)[2:])) - return ("".join(res), exc.end) - else: - return xmlcharrefreplace_errors(exc) - register_error(unicode_encode_errors, htmlentityreplace_errors) +encode_entity_map = {} +is_ucs4 = len("\U0010FFFF") == 1 +for k, v in list(entities.items()): + # skip multi-character entities + if ((is_ucs4 and len(v) > 1) or + (not is_ucs4 and len(v) > 2)): + 
continue + if v != "&": + if len(v) == 2: + v = utils.surrogatePairToCodepoint(v) + else: + v = ord(v) + if v not in encode_entity_map or k.islower(): + # prefer < over < and similarly for &, >, etc. + encode_entity_map[v] = k + + +def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + codepoints = [] + skip = False + for i, c in enumerate(exc.object[exc.start:exc.end]): + if skip: + skip = False + continue + index = i + exc.start + if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + skip = True + else: + codepoint = ord(c) + codepoints.append(codepoint) + for cp in codepoints: + e = encode_entity_map.get(cp) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append("&#x%s;" % (hex(cp)[2:])) + return ("".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) - del register_error +register_error("htmlentityreplace", htmlentityreplace_errors) class HTMLSerializer(object): @@ -168,7 +163,7 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) if self.encoding: - return string.encode(self.encoding, unicode_encode_errors) + return string.encode(self.encoding, "htmlentityreplace") else: return string diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index 5f3cc619..6ae09dbe 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -1,5 +1,7 @@ from __future__ import absolute_import, division, unicode_literals +# pylint:disable=wrong-import-position + import os import sys import codecs @@ -68,6 +70,8 @@ "walker": treewalkers.getTreeWalker("genshi") } +# pylint:enable=wrong-import-position + def get_data_files(subdirectory, files='*.dat', search_dir=test_dir): return sorted(glob.glob(os.path.join(search_dir, subdirectory, files))) diff --git a/html5lib/tests/test_encoding.py 
b/html5lib/tests/test_encoding.py index c380957c..c5d2af12 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -55,6 +55,7 @@ def test_encoding(): yield (runParserEncodingTest, test[b'data'], test[b'encoding']) yield (runPreScanEncodingTest, test[b'data'], test[b'encoding']) +# pylint:disable=wrong-import-position try: try: import charade # noqa @@ -67,3 +68,4 @@ def test_chardet(): with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: encoding = inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" +# pylint:enable=wrong-import-position diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index d2e3a48a..b3cda7d7 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -12,6 +12,7 @@ from html5lib.serializer import HTMLSerializer, serialize from html5lib.treewalkers._base import TreeWalker +# pylint:disable=wrong-import-position optionals_loaded = [] try: @@ -19,6 +20,7 @@ optionals_loaded.append("lxml") except ImportError: pass +# pylint:enable=wrong-import-position default_namespace = constants.namespaces["html"] diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 50e505a9..dd6ea75f 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -1,9 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -try: - chr = unichr # noqa -except NameError: - pass +from six import unichr as chr from collections import deque diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py index a8cca8a9..a5ba4bf1 100644 --- a/html5lib/trie/__init__.py +++ b/html5lib/trie/__init__.py @@ -4,9 +4,11 @@ Trie = PyTrie +# pylint:disable=wrong-import-position try: from .datrie import Trie as DATrie except ImportError: pass else: Trie = DATrie +# pylint:enable=wrong-import-position From b64df28cfb9e721ec3450e514ef8866001314eec Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: 
Fri, 20 May 2016 20:29:15 +0100 Subject: [PATCH 018/219] Change which way around we overwrite this for clarity's sake --- html5lib/html5parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 66ad7430..331b8fd7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -953,8 +953,8 @@ class InBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - # Keep a ref to this for special handling of whitespace in
-            self.processSpaceCharactersNonPre = self.processSpaceCharacters
+            # Set this to the default handler
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
 
             self.startTagHandler = utils.MethodDispatcher([
                 ("html", self.startTagHtml),
@@ -1087,7 +1087,7 @@ def processCharacters(self, token):
                      for char in token["data"]])):
                 self.parser.framesetOK = False
 
-        def processSpaceCharacters(self, token):
+        def processSpaceCharactersNonPre(self, token):
             self.tree.reconstructActiveFormattingElements()
             self.tree.insertText(token["data"])
 

From df0b2ba4ddb78384e0b35be9f31a3848f21a2464 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 20:33:43 +0100
Subject: [PATCH 019/219] Remove unused import

---
 html5lib/inputstream.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index a9aa2a15..b43c2bda 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -19,12 +19,6 @@
 except ImportError:
     BytesIO = StringIO
 
-try:
-    from io import BufferedIOBase
-except ImportError:
-    class BufferedIOBase(object):
-        pass
-
 # Non-unicode versions of constants for use in the pre-parser
 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])

From 742715d5948456adc6bff21fce88e6b0858364d6 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:18:52 +0100
Subject: [PATCH 020/219] Fix invalid_unicode_re on platforms supporting lone
 surrogates

---
 html5lib/inputstream.py       |   6 +-
 html5lib/tests/test_stream.py | 112 +++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index b43c2bda..e63e1215 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -33,8 +33,10 @@
     # unichr. Not using this indirection would introduce an illegal
     # unicode literal on platforms not supporting such lone
     # surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
-                                    eval('"\\uD800-\\uDFFF"'))
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +
+                                    "]")
 else:
     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
 
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 835e32e5..4e8453df 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -3,13 +3,17 @@
 from . import support  # noqa
 
 import codecs
-from io import BytesIO
+import sys
+from io import BytesIO, StringIO
+
+import pytest
 
 import six
 from six.moves import http_client, urllib
 
 from html5lib.inputstream import (BufferedStream, HTMLInputStream,
                                   HTMLUnicodeInputStream, HTMLBinaryInputStream)
+from html5lib.utils import supports_lone_surrogates
 
 
 def test_basic():
@@ -211,3 +215,109 @@ def makefile(self, _mode, _bufsize=None):
     wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
     stream = HTMLInputStream(wrapped)
     assert stream.charsUntil(" ") == "Text"
+
+
+@pytest.mark.parametrize("inp,num",
+                         [("\u0000", 0),
+                          ("\u0001", 1),
+                          ("\u0008", 1),
+                          ("\u0009", 0),
+                          ("\u000A", 0),
+                          ("\u000B", 1),
+                          ("\u000C", 0),
+                          ("\u000D", 0),
+                          ("\u000E", 1),
+                          ("\u001F", 1),
+                          ("\u0020", 0),
+                          ("\u007E", 0),
+                          ("\u007F", 1),
+                          ("\u009F", 1),
+                          ("\u00A0", 0),
+                          ("\uFDCF", 0),
+                          ("\uFDD0", 1),
+                          ("\uFDEF", 1),
+                          ("\uFDF0", 0),
+                          ("\uFFFD", 0),
+                          ("\uFFFE", 1),
+                          ("\uFFFF", 1),
+                          ("\U0001FFFD", 0),
+                          ("\U0001FFFE", 1),
+                          ("\U0001FFFF", 1),
+                          ("\U0002FFFD", 0),
+                          ("\U0002FFFE", 1),
+                          ("\U0002FFFF", 1),
+                          ("\U0003FFFD", 0),
+                          ("\U0003FFFE", 1),
+                          ("\U0003FFFF", 1),
+                          ("\U0004FFFD", 0),
+                          ("\U0004FFFE", 1),
+                          ("\U0004FFFF", 1),
+                          ("\U0005FFFD", 0),
+                          ("\U0005FFFE", 1),
+                          ("\U0005FFFF", 1),
+                          ("\U0006FFFD", 0),
+                          ("\U0006FFFE", 1),
+                          ("\U0006FFFF", 1),
+                          ("\U0007FFFD", 0),
+                          ("\U0007FFFE", 1),
+                          ("\U0007FFFF", 1),
+                          ("\U0008FFFD", 0),
+                          ("\U0008FFFE", 1),
+                          ("\U0008FFFF", 1),
+                          ("\U0009FFFD", 0),
+                          ("\U0009FFFE", 1),
+                          ("\U0009FFFF", 1),
+                          ("\U000AFFFD", 0),
+                          ("\U000AFFFE", 1),
+                          ("\U000AFFFF", 1),
+                          ("\U000BFFFD", 0),
+                          ("\U000BFFFE", 1),
+                          ("\U000BFFFF", 1),
+                          ("\U000CFFFD", 0),
+                          ("\U000CFFFE", 1),
+                          ("\U000CFFFF", 1),
+                          ("\U000DFFFD", 0),
+                          ("\U000DFFFE", 1),
+                          ("\U000DFFFF", 1),
+                          ("\U000EFFFD", 0),
+                          ("\U000EFFFE", 1),
+                          ("\U000EFFFF", 1),
+                          ("\U000FFFFD", 0),
+                          ("\U000FFFFE", 1),
+                          ("\U000FFFFF", 1),
+                          ("\U0010FFFD", 0),
+                          ("\U0010FFFE", 1),
+                          ("\U0010FFFF", 1),
+                          ("\x01\x01\x01", 3),
+                          ("a\x01a\x01a\x01a", 3)])
+def test_invalid_codepoints(inp, num):
+    stream = HTMLUnicodeInputStream(StringIO(inp))
+    for _i in range(len(inp)):
+        stream.char()
+    assert len(stream.errors) == num
+
+
+@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
+@pytest.mark.parametrize("inp,num",
+                         [("'\\uD7FF'", 0),
+                          ("'\\uD800'", 1),
+                          ("'\\uDBFF'", 1),
+                          ("'\\uDC00'", 1),
+                          ("'\\uDFFF'", 1),
+                          ("'\\uE000'", 0),
+                          ("'\\uD800\\uD800\\uD800'", 3),
+                          ("'a\\uD800a\\uD800a\\uD800a'", 3),
+                          ("'\\uDFFF\\uDBFF'", 2),
+                          pytest.mark.skipif(sys.maxunicode == 0xFFFF,
+                                             ("'\\uDBFF\\uDFFF'", 2),
+                                             reason="narrow Python")])
+def test_invalid_codepoints_surrogates(inp, num):
+    inp = eval(inp)
+    fp = StringIO(inp)
+    if ord(max(fp.read())) > 0xFFFF:
+        pytest.skip("StringIO altered string")
+    fp.seek(0)
+    stream = HTMLUnicodeInputStream(fp)
+    for _i in range(len(inp)):
+        stream.char()
+    assert len(stream.errors) == num

From cd74ec7a49943ab858fc120c19642e6181b58667 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:19:31 +0100
Subject: [PATCH 021/219] Fix comment

---
 html5lib/inputstream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index e63e1215..bb240015 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -30,7 +30,7 @@
 
 if utils.supports_lone_surrogates:
     # Use one extra step of indirection and create surrogates with
-    # unichr. Not using this indirection would introduce an illegal
+    # eval. Not using this indirection would introduce an illegal
     # unicode literal on platforms not supporting such lone
     # surrogates.
     assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1

From 15e126fcba9948779f662d5382e5665f6355e629 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:27:49 +0100
Subject: [PATCH 022/219] Silence eval-used

---
 html5lib/inputstream.py       | 2 +-
 html5lib/tests/test_stream.py | 2 +-
 html5lib/utils.py             | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index bb240015..1ed277ca 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -35,7 +35,7 @@
     # surrogates.
     assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
-                                    eval('"\\uD800-\\uDFFF"') +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
                                     "]")
 else:
     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 4e8453df..77e411d5 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -312,7 +312,7 @@ def test_invalid_codepoints(inp, num):
                                              ("'\\uDBFF\\uDFFF'", 2),
                                              reason="narrow Python")])
 def test_invalid_codepoints_surrogates(inp, num):
-    inp = eval(inp)
+    inp = eval(inp)  # pylint:disable=eval-used
     fp = StringIO(inp)
     if ord(max(fp.read())) > 0xFFFF:
         pytest.skip("StringIO altered string")
diff --git a/html5lib/utils.py b/html5lib/utils.py
index c83a089f..f27ca73a 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -22,10 +22,10 @@
 # surrogates, and there is no mechanism to further escape such
 # escapes.
 try:
-    _x = eval('"\\uD800"')
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
     if not isinstance(_x, text_type):
         # We need this with u"" because of http://bugs.jython.org/issue2039
-        _x = eval('u"\\uD800"')
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
         assert isinstance(_x, text_type)
 except:
     supports_lone_surrogates = False

From bfc278ae93cbe56e619d3fc3e0a82f9346584104 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:45:00 +0100
Subject: [PATCH 023/219] Silence bare-except

---
 html5lib/inputstream.py | 2 +-
 html5lib/utils.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 1ed277ca..58d626c9 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -449,7 +449,7 @@ def openStream(self, source):
 
         try:
             stream.seek(stream.tell())
-        except:
+        except:  # pylint:disable=bare-except
             stream = BufferedStream(stream)
 
         return stream
diff --git a/html5lib/utils.py b/html5lib/utils.py
index f27ca73a..5fe237a0 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -27,7 +27,7 @@
         # We need this with u"" because of http://bugs.jython.org/issue2039
         _x = eval('u"\\uD800"')  # pylint:disable=eval-used
         assert isinstance(_x, text_type)
-except:
+except:  # pylint:disable=bare-except
     supports_lone_surrogates = False
 else:
     supports_lone_surrogates = True

From b46fcdf6faf27bdfc99c47b3c2b9129606c02728 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:48:21 +0100
Subject: [PATCH 024/219] Silence too-many-nested-blocks

---
 html5lib/serializer/htmlserializer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
index be2718d3..8a9439df 100644
--- a/html5lib/serializer/htmlserializer.py
+++ b/html5lib/serializer/htmlserializer.py
@@ -175,6 +175,7 @@ def encodeStrict(self, string):
             return string
 
     def serialize(self, treewalker, encoding=None):
+        # pylint:disable=too-many-nested-blocks
         self.encoding = encoding
         in_cdata = False
         self.errors = []

From 6945bc480d1813f4cfccf135d7f38aadaaad8161 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:48:31 +0100
Subject: [PATCH 025/219] Silence not-callable

---
 html5lib/treebuilders/_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py
index 8196f591..900a724c 100644
--- a/html5lib/treebuilders/_base.py
+++ b/html5lib/treebuilders/_base.py
@@ -126,6 +126,7 @@ class TreeBuilder(object):
     commentClass - the class to use for comments
     doctypeClass - the class to use for doctypes
     """
+    # pylint:disable=not-callable
 
     # Document class
     documentClass = None

From 0c290e06f8eb34786b1c0b6acd14ed1f555ae27f Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:50:45 +0100
Subject: [PATCH 026/219] Kill long-dead finalText code

---
 html5lib/treebuilders/etree_lxml.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index a92b3aa9..71285b68 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -53,7 +53,6 @@ def _getChildNodes(self):
 
 def testSerializer(element):
     rv = []
-    finalText = None
     infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
 
     def serializeElement(element, indent=0):
@@ -128,16 +127,12 @@ def serializeElement(element, indent=0):
                 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
     serializeElement(element, 0)
 
-    if finalText is not None:
-        rv.append("|%s\"%s\"" % (' ' * 2, finalText))
-
     return "\n".join(rv)
 
 
 def tostring(element):
     """Serialize an element and its child nodes to a string"""
     rv = []
-    finalText = None
 
     def serializeElement(element):
         if not hasattr(element, "tag"):
@@ -173,9 +168,6 @@ def serializeElement(element):
 
     serializeElement(element)
 
-    if finalText is not None:
-        rv.append("%s\"" % (' ' * 2, finalText))
-
     return "".join(rv)
 
 

From da099dce1bb72428336e643f54ff1a8934f9804d Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 22:59:19 +0100
Subject: [PATCH 027/219] Silence a buggily-emitted non-parent-init-called warning

---
 html5lib/treebuilders/etree_lxml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index 71285b68..09d85039 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -189,7 +189,7 @@ def __init__(self, element, value=None):
                 if value is None:
                     value = {}
                 self._element = element
-                dict.__init__(self, value)
+                dict.__init__(self, value)  # pylint:disable=non-parent-init-called
                 for key, value in self.items():
                     if isinstance(key, tuple):
                         name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))

From 97427de90dd2a9ebf12cc1e36858eea931deab60 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 23:00:36 +0100
Subject: [PATCH 028/219] Fix indentation

---
 html5lib/treebuilders/etree_lxml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index 09d85039..abcd4b1d 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -304,7 +304,7 @@ def insertCommentInitial(self, data, parent=None):
     def insertCommentMain(self, data, parent=None):
         if (parent == self.document and
                 self.document._elementTree.getroot()[-1].tag == comment_type):
-                warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
         super(TreeBuilder, self).insertComment(data, parent)
 
     def insertRoot(self, token):

From 2afe09bcbcc728e98ec8da39b68ea65f4c270fdb Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 23:05:48 +0100
Subject: [PATCH 029/219] Make this in-practice unreachable code work on Py2

---
 html5lib/trie/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py
index 724486b1..be6cb6e3 100644
--- a/html5lib/trie/_base.py
+++ b/html5lib/trie/_base.py
@@ -7,7 +7,7 @@ class Trie(Mapping):
     """Abstract base class for tries"""
 
     def keys(self, prefix=None):
-        keys = super().keys()
+        keys = super(Trie, self).keys()
 
         if prefix is None:
             return set(keys)

From c0df867ebdeda6adc6dca9ff796eccf64e3ebda0 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 23:07:38 +0100
Subject: [PATCH 030/219] Silence arguments-differ

---
 html5lib/trie/_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py
index be6cb6e3..25eece46 100644
--- a/html5lib/trie/_base.py
+++ b/html5lib/trie/_base.py
@@ -7,6 +7,7 @@ class Trie(Mapping):
     """Abstract base class for tries"""
 
     def keys(self, prefix=None):
+        # pylint:disable=arguments-differ
         keys = super(Trie, self).keys()
 
         if prefix is None:

From 5dce4f27289090ed4662aee8881782a2efbcd20c Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 20 May 2016 23:19:55 +0100
Subject: [PATCH 031/219] Silence protected-access

---
 html5lib/treebuilders/dom.py        | 1 +
 html5lib/treebuilders/etree.py      | 2 ++
 html5lib/treebuilders/etree_lxml.py | 1 +
 3 files changed, 4 insertions(+)

diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index 27432c7a..b7df74b2 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -158,6 +158,7 @@ def insertText(self, data, parent=None):
             else:
                 # HACK: allow text nodes as children of the document node
                 if hasattr(self.dom, '_child_node_types'):
+                    # pylint:disable=protected-access
                     if Node.TEXT_NODE not in self.dom._child_node_types:
                         self.dom._child_node_types = list(self.dom._child_node_types)
                         self.dom._child_node_types.append(Node.TEXT_NODE)
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
index b607948b..d394148d 100644
--- a/html5lib/treebuilders/etree.py
+++ b/html5lib/treebuilders/etree.py
@@ -1,4 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
 from six import text_type
 
 import re
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index abcd4b1d..2a69769b 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -10,6 +10,7 @@
 """
 
 from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
 
 import warnings
 import re

From a2b8c110cd0c5c7d60573f2a86d951cabefc516b Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Fri, 4 Dec 2015 02:13:53 +0000
Subject: [PATCH 032/219] Add prospector/pylint config for the sake of
 Landscape.

---
 .prospector.yaml | 21 +++++++++++++++++++++
 .pylintrc        | 10 ++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 .prospector.yaml
 create mode 100644 .pylintrc

diff --git a/.prospector.yaml b/.prospector.yaml
new file mode 100644
index 00000000..7e8efe1a
--- /dev/null
+++ b/.prospector.yaml
@@ -0,0 +1,21 @@
+strictness: veryhigh
+doc-warnings: false
+test-warnings: false
+
+max-line-length: 139
+
+requirements:
+  - requirements.txt
+  - requirements-test.txt
+  - requirements-optional.txt
+
+ignore-paths:
+  - parse.py
+  - utils/
+
+python-targets:
+  - 2
+  - 3
+
+mccabe:
+  run: false
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 00000000..ea74d5db
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,10 @@
+[MASTER]
+ignore=tests
+
+[MESSAGES CONTROL]
+# messages up to fixme should probably be fixed somehow
+disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda
+
+[FORMAT]
+max-line-length=139
+single-line-if-stmt=no

From 3d0eaea85bc00a739e3910ef03316d4312ad84de Mon Sep 17 00:00:00 2001
From: Gabi Davar 
Date: Sat, 12 Dec 2015 11:23:36 +0200
Subject: [PATCH 033/219] Drop usage of charade now that chardet is maintained again

---
 CHANGES.rst                     | 2 ++
 README.rst                      | 5 ++---
 debug-info.py                   | 2 +-
 html5lib/inputstream.py         | 5 +----
 html5lib/tests/test_encoding.py | 7 ++-----
 requirements-optional.txt       | 4 ++--
 setup.py                        | 4 ++--
 7 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 51a254c6..1f87d9ab 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -44,6 +44,8 @@ Released on XXX
   (instead of the tokenizer); as such, this will require amending all
   callers of it to use it via the treewalker API.**
 
+* **Drop support of charade, now that chardet is supported once more.**
+
 
 0.9999999/1.0b8
 ~~~~~~~~~~~~~~~
diff --git a/README.rst b/README.rst
index 879dabad..e73b1639 100644
--- a/README.rst
+++ b/README.rst
@@ -113,9 +113,8 @@ functionality:
 
 - ``genshi`` has a treewalker (but not builder); and
 
-- ``charade`` can be used as a fallback when character encoding cannot
-  be determined; ``chardet``, from which it was forked, can also be used
-  on Python 2.
+- ``chardet`` can be used as a fallback when character encoding cannot
+  be determined.
 
 - ``ordereddict`` can be used under Python 2.6
   (``collections.OrderedDict`` is used instead on later versions) to
diff --git a/debug-info.py b/debug-info.py
index b5d2bb6a..f93fbdbe 100644
--- a/debug-info.py
+++ b/debug-info.py
@@ -12,7 +12,7 @@
     "maxsize": sys.maxsize
 }
 
-search_modules = ["charade", "chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
+search_modules = ["chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
 found_modules = []
 
 for m in search_modules:
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 58d626c9..cfabdd86 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -468,10 +468,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
         if encoding is None and chardet:
             confidence = "tentative"
             try:
-                try:
-                    from charade.universaldetector import UniversalDetector
-                except ImportError:
-                    from chardet.universaldetector import UniversalDetector
+                from chardet.universaldetector import UniversalDetector
                 buffers = []
                 detector = UniversalDetector()
                 while not detector.done:
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index c5d2af12..a66a2178 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -57,12 +57,9 @@ def test_encoding():
 
 # pylint:disable=wrong-import-position
 try:
-    try:
-        import charade  # noqa
-    except ImportError:
-        import chardet  # noqa
+    import chardet  # noqa
 except ImportError:
-    print("charade/chardet not found, skipping chardet tests")
+    print("chardet not found, skipping chardet tests")
 else:
     def test_chardet():
         with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
diff --git a/requirements-optional.txt b/requirements-optional.txt
index ac6539cb..781ab8c2 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -4,9 +4,9 @@
 # streams.
 genshi
 
-# charade can be used as a fallback in case we are unable to determine
+# chardet can be used as a fallback in case we are unable to determine
 # the encoding of a document.
-charade
+chardet>=2.2
 
 # lxml is supported with its own treebuilder ("lxml") and otherwise
 # uses the standard ElementTree support
diff --git a/setup.py b/setup.py
index b42ba400..ccb38680 100644
--- a/setup.py
+++ b/setup.py
@@ -70,13 +70,13 @@
 
           # Standard extras, will be installed when the extra is requested.
           "genshi": ["genshi"],
-          "charade": ["charade"],
+          "chardet": ["chardet>=2.2"],
 
           # The all extra combines a standard extra which will be used anytime
           # the all extra is requested, and it extends it with a conditional
           # extra that will be installed whenever the condition matches and the
           # all extra is requested.
-          "all": ["genshi", "charade"],
+          "all": ["genshi", "chardet>=2.2"],
           "all:platform.python_implementation == 'CPython'": ["datrie", "lxml"],
       },
       )

From a8a10d6306277a941c10eee76579c1fc162b6f8c Mon Sep 17 00:00:00 2001
From: Gabi Davar 
Date: Sat, 12 Dec 2015 11:23:36 +0200
Subject: [PATCH 034/219] use find_packages for simplicity in setup.py

---
 setup.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index ccb38680..d5a5d2a1 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 import os
 import codecs
 
-from setuptools import setup
+from setuptools import setup, find_packages
 
 
 classifiers = [
@@ -22,11 +22,6 @@
     'Topic :: Text Processing :: Markup :: HTML'
 ]
 
-packages = ['html5lib'] + ['html5lib.' + name
-                           for name in os.listdir(os.path.join('html5lib'))
-                           if os.path.isdir(os.path.join('html5lib', name)) and
-                           not name.startswith('.') and name != 'tests']
-
 current_dir = os.path.dirname(__file__)
 with codecs.open(os.path.join(current_dir, 'README.rst'), 'r', 'utf8') as readme_file:
     with codecs.open(os.path.join(current_dir, 'CHANGES.rst'), 'r', 'utf8') as changes_file:
@@ -53,7 +48,7 @@
       classifiers=classifiers,
       maintainer='James Graham',
       maintainer_email='james@hoppipolla.co.uk',
-      packages=packages,
+      packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
       install_requires=[
           'six',
           'webencodings',

From 5a62f05bff5081b9d9985fe81d8b2dd18ad63445 Mon Sep 17 00:00:00 2001
From: Gabi Davar 
Date: Sat, 12 Dec 2015 11:23:36 +0200
Subject: [PATCH 035/219] Import things from os.path to make setup.py more
 readable

---
 setup.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index d5a5d2a1..4d5f1523 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 import ast
-import os
 import codecs
 
+from os.path import join, dirname
 from setuptools import setup, find_packages
 
 
@@ -22,13 +22,13 @@
     'Topic :: Text Processing :: Markup :: HTML'
 ]
 
-current_dir = os.path.dirname(__file__)
-with codecs.open(os.path.join(current_dir, 'README.rst'), 'r', 'utf8') as readme_file:
-    with codecs.open(os.path.join(current_dir, 'CHANGES.rst'), 'r', 'utf8') as changes_file:
+here = dirname(__file__)
+with codecs.open(join(here, 'README.rst'), 'r', 'utf8') as readme_file:
+    with codecs.open(join(here, 'CHANGES.rst'), 'r', 'utf8') as changes_file:
         long_description = readme_file.read() + '\n' + changes_file.read()
 
 version = None
-with open(os.path.join("html5lib", "__init__.py"), "rb") as init_file:
+with open(join("html5lib", "__init__.py"), "rb") as init_file:
     t = ast.parse(init_file.read(), filename="__init__.py", mode="exec")
     assert isinstance(t, ast.Module)
     assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)

From 2812e44c3cb44bdb2cf24ad26b11ead425fb5c76 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Sat, 7 May 2016 22:58:28 +0100
Subject: [PATCH 036/219] Make DOM treebuilder's AttrList return a
 MutableMapping

---
 html5lib/treebuilders/dom.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index b7df74b2..9d7f4824 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
 
+from collections import MutableMapping
 from xml.dom import minidom, Node
 import weakref
 
@@ -13,34 +14,41 @@
 def getDomBuilder(DomImplementation):
     Dom = DomImplementation
 
-    class AttrList(object):
+    class AttrList(MutableMapping):
         def __init__(self, element):
             self.element = element
 
         def __iter__(self):
-            return list(self.element.attributes.items()).__iter__()
+            return iter(self.element.attributes.keys())
 
         def __setitem__(self, name, value):
-            self.element.setAttribute(name, value)
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            else:
+                attr = self.element.ownerDocument.createAttribute(name)
+                attr.value = value
+                self.element.attributes[name] = attr
 
         def __len__(self):
-            return len(list(self.element.attributes.items()))
+            return len(self.element.attributes)
 
         def items(self):
-            return [(item[0], item[1]) for item in
-                    list(self.element.attributes.items())]
+            return list(self.element.attributes.items())
 
-        def keys(self):
-            return list(self.element.attributes.keys())
+        def values(self):
+            return list(self.element.attributes.values())
 
         def __getitem__(self, name):
-            return self.element.getAttribute(name)
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            else:
+                return self.element.attributes[name].value
 
-        def __contains__(self, name):
+        def __delitem__(self, name):
             if isinstance(name, tuple):
                 raise NotImplementedError
             else:
-                return self.element.hasAttribute(name)
+                del self.element.attributes[name]
 
     class NodeBuilder(_base.Node):
         def __init__(self, element):

From 29f0512e4cf4a4fb63b5e7c90ca3e53150a59743 Mon Sep 17 00:00:00 2001
From: Kovid Goyal 
Date: Sat, 26 Oct 2013 13:17:09 +0530
Subject: [PATCH 037/219] Speed up unnecessarily slow and obtuse dict
 comparison

---
 html5lib/html5parser.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 331b8fd7..ff886e89 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -1022,17 +1022,9 @@ def __init__(self, parser, tree):
             self.endTagHandler.default = self.endTagOther
 
         def isMatchingFormattingElement(self, node1, node2):
-            if node1.name != node2.name or node1.namespace != node2.namespace:
-                return False
-            elif len(node1.attributes) != len(node2.attributes):
-                return False
-            else:
-                attributes1 = sorted(node1.attributes.items())
-                attributes2 = sorted(node2.attributes.items())
-                for attr1, attr2 in zip(attributes1, attributes2):
-                    if attr1 != attr2:
-                        return False
-            return True
+            return (node1.name == node2.name and
+                    node1.namespace == node2.namespace and
+                    node1.attributes == node2.attributes)
 
         # helper
         def addFormattingElement(self, token):

From 0a885c655192dca52484c83ea607f94720525da1 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon 
Date: Sat, 7 May 2016 21:07:47 +0100
Subject: [PATCH 038/219] Clean up the constants imports in html5parser

---
 html5lib/html5parser.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index ff886e89..5c281b8e 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -10,15 +10,16 @@
 from .treebuilders._base import Marker
 
 from . import utils
-from . import constants
-from .constants import spaceCharacters, asciiUpper2Lower
-from .constants import specialElements
-from .constants import headingElements
-from .constants import cdataElements, rcdataElements
-from .constants import tokenTypes, ReparseException, namespaces
-from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
-from .constants import adjustForeignAttributes as adjustForeignAttributesMap
-from .constants import E
+from .constants import (
+    spaceCharacters, asciiUpper2Lower,
+    specialElements, headingElements, cdataElements, rcdataElements,
+    tokenTypes, tagTokenTypes,
+    namespaces,
+    htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
+    adjustForeignAttributes as adjustForeignAttributesMap,
+    E,
+    ReparseException
+)
 
 
 def parse(doc, treebuilder="etree", encoding=None,
@@ -434,7 +435,7 @@ def getPhases(debug):
     def log(function):
         """Logger that records which phase processes each token"""
         type_names = dict((value, key) for key, value in
-                          constants.tokenTypes.items())
+                          tokenTypes.items())
 
         def wrapped(self, *args, **kwargs):
             if function.__name__.startswith("process") and len(args) > 0:
@@ -443,7 +444,7 @@ def wrapped(self, *args, **kwargs):
                     info = {"type": type_names[token['type']]}
                 except:
                     raise
-                if token['type'] in constants.tagTokenTypes:
+                if token['type'] in tagTokenTypes:
                     info["name"] = token['name']
 
                 self.parser.log.append((self.parser.tokenizer.state.__name__,

From a137d14bfd8c6aca007d9dd3118b18e91ccf7fa5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal 
Date: Sat, 26 Oct 2013 14:57:42 +0530
Subject: [PATCH 039/219] Preserve attribute order when parsing

---
 README.rst                     |   4 --
 html5lib/constants.py          |  67 +++++++++++++++++++++
 html5lib/html5parser.py        | 104 +++++++--------------------------
 html5lib/tests/test_parser2.py |   7 +++
 html5lib/utils.py              |   5 +-
 requirements-optional.txt      |   4 --
 requirements.txt               |   1 +
 7 files changed, 100 insertions(+), 92 deletions(-)

diff --git a/README.rst b/README.rst
index e73b1639..47eb90d3 100644
--- a/README.rst
+++ b/README.rst
@@ -116,10 +116,6 @@ functionality:
 - ``chardet`` can be used as a fallback when character encoding cannot
   be determined.
 
-- ``ordereddict`` can be used under Python 2.6
-  (``collections.OrderedDict`` is used instead on later versions) to
-  serialize attributes in alphabetical order.
-
 
 Bugs
 ----
diff --git a/html5lib/constants.py b/html5lib/constants.py
index df1f061e..9e7541d3 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -437,6 +437,73 @@
     (namespaces["mathml"], "mtext")
 ])
 
+adjustSVGAttributes = {
+    "attributename": "attributeName",
+    "attributetype": "attributeType",
+    "basefrequency": "baseFrequency",
+    "baseprofile": "baseProfile",
+    "calcmode": "calcMode",
+    "clippathunits": "clipPathUnits",
+    "contentscripttype": "contentScriptType",
+    "contentstyletype": "contentStyleType",
+    "diffuseconstant": "diffuseConstant",
+    "edgemode": "edgeMode",
+    "externalresourcesrequired": "externalResourcesRequired",
+    "filterres": "filterRes",
+    "filterunits": "filterUnits",
+    "glyphref": "glyphRef",
+    "gradienttransform": "gradientTransform",
+    "gradientunits": "gradientUnits",
+    "kernelmatrix": "kernelMatrix",
+    "kernelunitlength": "kernelUnitLength",
+    "keypoints": "keyPoints",
+    "keysplines": "keySplines",
+    "keytimes": "keyTimes",
+    "lengthadjust": "lengthAdjust",
+    "limitingconeangle": "limitingConeAngle",
+    "markerheight": "markerHeight",
+    "markerunits": "markerUnits",
+    "markerwidth": "markerWidth",
+    "maskcontentunits": "maskContentUnits",
+    "maskunits": "maskUnits",
+    "numoctaves": "numOctaves",
+    "pathlength": "pathLength",
+    "patterncontentunits": "patternContentUnits",
+    "patterntransform": "patternTransform",
+    "patternunits": "patternUnits",
+    "pointsatx": "pointsAtX",
+    "pointsaty": "pointsAtY",
+    "pointsatz": "pointsAtZ",
+    "preservealpha": "preserveAlpha",
+    "preserveaspectratio": "preserveAspectRatio",
+    "primitiveunits": "primitiveUnits",
+    "refx": "refX",
+    "refy": "refY",
+    "repeatcount": "repeatCount",
+    "repeatdur": "repeatDur",
+    "requiredextensions": "requiredExtensions",
+    "requiredfeatures": "requiredFeatures",
+    "specularconstant": "specularConstant",
+    "specularexponent": "specularExponent",
+    "spreadmethod": "spreadMethod",
+    "startoffset": "startOffset",
+    "stddeviation": "stdDeviation",
+    "stitchtiles": "stitchTiles",
+    "surfacescale": "surfaceScale",
+    "systemlanguage": "systemLanguage",
+    "tablevalues": "tableValues",
+    "targetx": "targetX",
+    "targety": "targetY",
+    "textlength": "textLength",
+    "viewbox": "viewBox",
+    "viewtarget": "viewTarget",
+    "xchannelselector": "xChannelSelector",
+    "ychannelselector": "yChannelSelector",
+    "zoomandpan": "zoomAndPan"
+}
+
+adjustMathMLAttributes = {"definitionurl": "definitionURL"}
+
 adjustForeignAttributes = {
     "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
     "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 5c281b8e..df2a6cf7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -1,8 +1,13 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import with_metaclass
+from six import with_metaclass, viewkeys, PY3
 
 import types
 
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
 from . import inputstream
 from . import tokenizer
 
@@ -17,6 +22,7 @@
     namespaces,
     htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
     adjustForeignAttributes as adjustForeignAttributesMap,
+    adjustMathMLAttributes, adjustSVGAttributes,
     E,
     ReparseException
 )
@@ -273,96 +279,18 @@ def normalizeToken(self, token):
         """ HTML5 specific normalizations to the token stream """
 
         if token["type"] == tokenTypes["StartTag"]:
-            token["data"] = dict(token["data"][::-1])
+            token["data"] = OrderedDict(token['data'][::-1])
 
         return token
 
     def adjustMathMLAttributes(self, token):
-        replacements = {"definitionurl": "definitionURL"}
-        for k, v in replacements.items():
-            if k in token["data"]:
-                token["data"][v] = token["data"][k]
-                del token["data"][k]
+        adjust_attributes(token, adjustMathMLAttributes)
 
     def adjustSVGAttributes(self, token):
-        replacements = {
-            "attributename": "attributeName",
-            "attributetype": "attributeType",
-            "basefrequency": "baseFrequency",
-            "baseprofile": "baseProfile",
-            "calcmode": "calcMode",
-            "clippathunits": "clipPathUnits",
-            "contentscripttype": "contentScriptType",
-            "contentstyletype": "contentStyleType",
-            "diffuseconstant": "diffuseConstant",
-            "edgemode": "edgeMode",
-            "externalresourcesrequired": "externalResourcesRequired",
-            "filterres": "filterRes",
-            "filterunits": "filterUnits",
-            "glyphref": "glyphRef",
-            "gradienttransform": "gradientTransform",
-            "gradientunits": "gradientUnits",
-            "kernelmatrix": "kernelMatrix",
-            "kernelunitlength": "kernelUnitLength",
-            "keypoints": "keyPoints",
-            "keysplines": "keySplines",
-            "keytimes": "keyTimes",
-            "lengthadjust": "lengthAdjust",
-            "limitingconeangle": "limitingConeAngle",
-            "markerheight": "markerHeight",
-            "markerunits": "markerUnits",
-            "markerwidth": "markerWidth",
-            "maskcontentunits": "maskContentUnits",
-            "maskunits": "maskUnits",
-            "numoctaves": "numOctaves",
-            "pathlength": "pathLength",
-            "patterncontentunits": "patternContentUnits",
-            "patterntransform": "patternTransform",
-            "patternunits": "patternUnits",
-            "pointsatx": "pointsAtX",
-            "pointsaty": "pointsAtY",
-            "pointsatz": "pointsAtZ",
-            "preservealpha": "preserveAlpha",
-            "preserveaspectratio": "preserveAspectRatio",
-            "primitiveunits": "primitiveUnits",
-            "refx": "refX",
-            "refy": "refY",
-            "repeatcount": "repeatCount",
-            "repeatdur": "repeatDur",
-            "requiredextensions": "requiredExtensions",
-            "requiredfeatures": "requiredFeatures",
-            "specularconstant": "specularConstant",
-            "specularexponent": "specularExponent",
-            "spreadmethod": "spreadMethod",
-            "startoffset": "startOffset",
-            "stddeviation": "stdDeviation",
-            "stitchtiles": "stitchTiles",
-            "surfacescale": "surfaceScale",
-            "systemlanguage": "systemLanguage",
-            "tablevalues": "tableValues",
-            "targetx": "targetX",
-            "targety": "targetY",
-            "textlength": "textLength",
-            "viewbox": "viewBox",
-            "viewtarget": "viewTarget",
-            "xchannelselector": "xChannelSelector",
-            "ychannelselector": "yChannelSelector",
-            "zoomandpan": "zoomAndPan"
-        }
-        for originalName in list(token["data"].keys()):
-            if originalName in replacements:
-                svgName = replacements[originalName]
-                token["data"][svgName] = token["data"][originalName]
-                del token["data"][originalName]
+        adjust_attributes(token, adjustSVGAttributes)
 
     def adjustForeignAttributes(self, token):
-        replacements = adjustForeignAttributesMap
-
-        for originalName in token["data"].keys():
-            if originalName in replacements:
-                foreignName = replacements[originalName]
-                token["data"][foreignName] = token["data"][originalName]
-                del token["data"][originalName]
+        adjust_attributes(token, adjustForeignAttributesMap)
 
     def reparseTokenNormal(self, token):
         # pylint:disable=unused-argument
@@ -2791,6 +2719,16 @@ def processEndTag(self, token):
     }
 
 
+def adjust_attributes(token, replacements):
+    if PY3 or utils.PY27:
+        needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
+    else:
+        needs_adjustment = frozenset(token['data']) & frozenset(replacements)
+    if needs_adjustment:
+        token['data'] = OrderedDict((replacements.get(k, k), v)
+                                    for k, v in token['data'].items())
+
+
 def impliedTagToken(name, type="EndTag", attributes=None,
                     selfClosing=False):
     if attributes is None:
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
index f8e1ac43..47738405 100644
--- a/html5lib/tests/test_parser2.py
+++ b/html5lib/tests/test_parser2.py
@@ -49,3 +49,10 @@ def test_namespace_html_elements_1_etree():
 
 def test_unicode_file():
     assert parse(io.StringIO("a")) is not None
+
+
+def test_duplicate_attribute():
+    # This is here because we impl it in parser and not tokenizer
+    doc = parse('

') + el = doc[1][0] + assert el.get("class") == "a" diff --git a/html5lib/utils.py b/html5lib/utils.py index 5fe237a0..ea65ab6b 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +import sys from types import ModuleType from six import text_type @@ -12,9 +13,11 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", "surrogatePairToCodepoint", "moduleFactoryFactory", - "supports_lone_surrogates"] + "supports_lone_surrogates", "PY27"] +PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7 + # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be # caught by the below test. In general this would be any platform # using UTF-16 as its encoding of unicode strings, such as diff --git a/requirements-optional.txt b/requirements-optional.txt index 781ab8c2..c00fd242 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -15,7 +15,3 @@ lxml ; platform_python_implementation == 'CPython' # DATrie can be used in place of our Python trie implementation for # slightly better parsing performance. datrie ; platform_python_implementation == 'CPython' - -# Can be used to force attributes to be serialized in alphabetical -# order. 
-ordereddict ; python_version < '2.7' diff --git a/requirements.txt b/requirements.txt index 15cae9dc..745993b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ six webencodings +ordereddict ; python_version < '2.7' From 761f3ab2511038c36416ad7c50d9cb9694005b8e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 22 May 2016 02:37:05 +0100 Subject: [PATCH 040/219] Add a test for HTMLParser(debug=True) --- html5lib/tests/test_parser2.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 47738405..0ec5b049 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -1,11 +1,13 @@ from __future__ import absolute_import, division, unicode_literals +from six import PY2, text_type + import io from . import support # noqa from html5lib.constants import namespaces -from html5lib import parse +from html5lib import parse, HTMLParser # tests that aren't autogenerated from text files @@ -56,3 +58,33 @@ def test_duplicate_attribute(): doc = parse('

') el = doc[1][0] assert el.get("class") == "a" + + +def test_debug_log(): + parser = HTMLParser(debug=True) + parser.parse("a

bd

e") + + expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}), + ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}), + ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}), + ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}), + ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}), + ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), + ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}), + ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}), + ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})] + + if PY2: + for i, log in enumerate(expected): + log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log] + expected[i] = tuple(log) + + assert parser.log == expected From 66a2f7763f2dda24d3d2681c22bf799c94ee049c Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: 
Wed, 25 May 2016 14:34:46 +0100 Subject: [PATCH 041/219] Check that no tag in the method dispatcher is duplicated (#255) Check that no tag in the method dispatcher is duplicated; r=nobody! --- html5lib/html5parser.py | 2 +- html5lib/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index df2a6cf7..daee854c 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -893,7 +893,7 @@ def __init__(self, parser, tree): ("body", self.startTagBody), ("frameset", self.startTagFrameset), (("address", "article", "aside", "blockquote", "center", "details", - "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", + "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul"), self.startTagCloseP), diff --git a/html5lib/utils.py b/html5lib/utils.py index ea65ab6b..03f0dab7 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -61,6 +61,7 @@ def __init__(self, items=()): else: _dictEntries.append((name, value)) dict.__init__(self, _dictEntries) + assert len(self) == len(_dictEntries) self.default = None def __getitem__(self, key): From 2d376737a6246ebb38a79600a7fe75abd923cf3e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 25 May 2016 17:55:28 +0100 Subject: [PATCH 042/219] Fix #228: make sure the lxml treewalker works with trees from lxml (#256) r=nobody! 
--- html5lib/tests/test_treewalkers.py | 21 +++++++++++++++++++++ html5lib/treewalkers/lxmletree.py | 29 ++++++++++++++++++----------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 332027ac..81ed2778 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -2,6 +2,11 @@ import pytest +try: + import lxml.etree +except ImportError: + pass + from .support import treeTypes from html5lib import html5parser, treewalkers @@ -93,3 +98,19 @@ def test_treewalker_six_mix(): for tree in sorted(treeTypes.items()): for intext, attrs, expected in sm_tests: yield runTreewalkerEditTest, intext, expected, attrs, tree + + +@pytest.mark.skipif(treeTypes["lxml"] is None, reason="lxml not importable") +def test_lxml_xml(): + expected = [ + {'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'}, + {'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'}, + {'name': 'div', 'namespace': None, 'type': 'EndTag'}, + {'name': 'div', 'namespace': None, 'type': 'EndTag'} + ] + + lxmltree = lxml.etree.fromstring('
') + walker = treewalkers.getTreeWalker('lxml') + output = Lint(walker(lxmltree)) + + assert list(output) == expected diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 7d99adc2..ff31a44e 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -22,13 +22,20 @@ class Root(object): def __init__(self, et): self.elementtree = et self.children = [] - if et.docinfo.internalDTD: - self.children.append(Doctype(self, - ensure_str(et.docinfo.root_name), - ensure_str(et.docinfo.public_id), - ensure_str(et.docinfo.system_url))) - root = et.getroot() - node = root + + try: + if et.docinfo.internalDTD: + self.children.append(Doctype(self, + ensure_str(et.docinfo.root_name), + ensure_str(et.docinfo.public_id), + ensure_str(et.docinfo.system_url))) + except AttributeError: + pass + + try: + node = et.getroot() + except AttributeError: + node = et while node.getprevious() is not None: node = node.getprevious() @@ -118,12 +125,12 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): # pylint:disable=redefined-variable-type - if hasattr(tree, "getroot"): - self.fragmentChildren = set() - tree = Root(tree) - elif isinstance(tree, list): + if isinstance(tree, list): self.fragmentChildren = set(tree) tree = FragmentRoot(tree) + else: + self.fragmentChildren = set() + tree = Root(tree) _base.NonRecursiveTreeWalker.__init__(self, tree) self.filter = ihatexml.InfosetFilter() From c35d84c1c01db16bace0c69f0b1992f5123b322e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 29 May 2016 04:38:10 +0100 Subject: [PATCH 043/219] Fix #217: Fully remove element in removeChild in etree treebuilder (#259) This adds a test here because we still fail the upstream one, as our implementation of AAA is outdated. 
--- html5lib/tests/test_parser2.py | 7 ++++++- html5lib/treebuilders/etree.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 0ec5b049..b7a92fd7 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -7,7 +7,7 @@ from . import support # noqa from html5lib.constants import namespaces -from html5lib import parse, HTMLParser +from html5lib import parse, parseFragment, HTMLParser # tests that aren't autogenerated from text files @@ -88,3 +88,8 @@ def test_debug_log(): expected[i] = tuple(log) assert parser.log == expected + + +def test_no_duplicate_clone(): + frag = parseFragment("