Make Py3 code work on Py2 as well.

gsnedders · gsnedders · commit 82377ecbf588 · 2013-02-04T20:40:22.000Z
Everything works except the lxml treewalkers, which really need to be rewritten.
diff --git a/html5lib/__init__.py b/html5lib/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 """ 
 HTML parsing library based on the WHATWG "HTML5"
 specification. The parser is designed to be compatible with existing
diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import string, gettext
 _ = gettext.gettext
 
diff --git a/html5lib/filters/_base.py b/html5lib/filters/_base.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 
 class Filter(object):
     def __init__(self, source):
diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 from . import _base
 
 class Filter(_base.Filter):
diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 from gettext import gettext
 _ = gettext
 
diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 from . import _base
 
 class Filter(_base.Filter):
diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 from . import _base
 from html5lib.sanitizer import HTMLSanitizerMixin
 
diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import re
 
 from . import _base
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -1,3 +1,6 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import with_metaclass
+
 import sys
 import types
 
@@ -444,7 +447,7 @@ def getMetaclass(use_metaclass, metaclass_func):
         else:
             return type
 
-    class Phase(object, metaclass=getMetaclass(debug, log)):
+    class Phase(with_metaclass(getMetaclass(debug, log))):
         """Base class for helper object that implements each phase of processing
         """
 
@@ -2686,7 +2689,7 @@ def impliedTagToken(name, type="EndTag", attributes = None,
                     selfClosing = False):
     if attributes is None:
         attributes = {}
-    return {"type":tokenTypes[type], "name":str(name), "data":attributes,
+    return {"type":tokenTypes[type], "name":name, "data":attributes,
             "selfClosing":selfClosing}
 
 class ParseError(Exception):
diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import re
 import warnings
 
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -1,3 +1,6 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
 import codecs
 import re
 import types
@@ -118,9 +121,9 @@ def _readFromBuffer(self, bytes):
 
 def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
     if hasattr(source, "read"):
-        isUnicode = isinstance(source.read(0), str)
+        isUnicode = isinstance(source.read(0), text_type)
     else:
-        isUnicode = isinstance(source, str)
+        isUnicode = isinstance(source, text_type)
 
     if isUnicode:
         if encoding is not None:
@@ -565,6 +568,10 @@ def __next__(self):
             raise TypeError
         return self[p:p+1]
 
+    def next(self):
+        # Py2 compat
+        return self.__next__()
+
     def previous(self):
         p = self._position
         if p >= len(self):
@@ -641,6 +648,7 @@ def jumpTo(self, bytes):
         else:
             raise StopIteration
 
+
 class EncodingParser(object):
     """Mini parser for detecting character encoding from meta elements"""
 
diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import re
 from xml.sax.saxutils import escape, unescape
 
diff --git a/html5lib/serializer/__init__.py b/html5lib/serializer/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 
 from html5lib import treewalkers
 
diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
@@ -1,3 +1,6 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
 import gettext
 _ = gettext.gettext
 
@@ -158,14 +161,14 @@ def __init__(self, **kwargs):
         self.strict = False
 
     def encode(self, string):
-        assert(isinstance(string, str))
+        assert(isinstance(string, text_type))
         if self.encoding:
             return string.encode(self.encoding, unicode_encode_errors)
         else:
             return string
 
     def encodeStrict(self, string):
-        assert(isinstance(string, str))
+        assert(isinstance(string, text_type))
         if self.encoding:
             return string.encode(self.encoding, "strict")
         else:
diff --git a/html5lib/serializer/xhtmlserializer.py b/html5lib/serializer/xhtmlserializer.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 from .htmlserializer import HTMLSerializer
 
 class XHTMLSerializer(HTMLSerializer):
diff --git a/html5lib/tests/__init__.py b/html5lib/tests/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import sys
 import os
 
diff --git a/html5lib/tests/mockParser.py b/html5lib/tests/mockParser.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import sys
 import os
 
diff --git a/html5lib/tests/performance/concatenation.py b/html5lib/tests/performance/concatenation.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 def f1():
     x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
     y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import os
 import sys
 import codecs
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import re
 import os
 import unittest
diff --git a/html5lib/tests/test_parser.py b/html5lib/tests/test_parser.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import os
 import sys
 import traceback
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import io
 
 from . import support
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import os
 import sys
 import unittest
diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import os
 import unittest
 from .support import get_data_files
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 from . import support
 import unittest, codecs
 
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 
 
 import sys
@@ -157,9 +159,9 @@ def runTokenizerTest(test):
     received = normalizeTokens(tokens)
     errorMsg = "\n".join(["\n\nInitial state:",
                           test['initialState'] ,
-                          "\nInput:", str(test['input']),
-                          "\nExpected:", str(expected),
-                          "\nreceived:", str(tokens)])
+                          "\nInput:", test['input'],
+                          "\nExpected:", repr(expected),
+                          "\nreceived:", repr(tokens)])
     errorMsg = errorMsg
     ignoreErrorOrder = test.get('ignoreErrorOrder', False)
     assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import os
 import sys
 import unittest
diff --git a/html5lib/tests/test_whitespace_filter.py b/html5lib/tests/test_whitespace_filter.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import unittest
 
 from html5lib.filters.whitespace import Filter
diff --git a/html5lib/tests/tokenizertotree.py b/html5lib/tests/tokenizertotree.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import sys
 import os
 import json
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
@@ -1,3 +1,10 @@
+from __future__ import absolute_import, division, unicode_literals
+
+try:
+    chr = unichr
+except NameError:
+    pass
+
 from collections import deque
     
 from .constants import spaceCharacters
@@ -122,7 +129,8 @@ def consumeNumberEntity(self, isHex):
                 # within the BMP.
                 char = chr(charAsInt)
             except ValueError:
-                char = eval("u'\\U%08x'" % charAsInt)
+                v = charAsInt - 0x10000
+                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
 
         # Discard the ; if present. Otherwise, put it back on the queue and
         # invoke parseError on parser.
diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 """A collection of modules for building different kinds of tree from
 HTML documents.
 
diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py
@@ -1,3 +1,6 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
 from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
 
 # The scope markers are inserted when entering object elements,
@@ -281,7 +284,7 @@ def _setInsertFromTable(self, value):
         
     def insertElementNormal(self, token):
         name = token["name"]
-        assert type(name) == str, "Element %s not unicode"%name
+        assert isinstance(name, text_type), "Element %s not unicode"%name
         namespace = token.get("namespace", self.defaultNamespace)
         element = self.elementClass(name, namespace)
         element.attributes = token["data"]
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 
 from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
 import re
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
@@ -1,3 +1,6 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
 import re
 
 from . import _base
@@ -207,7 +210,7 @@ def serializeElement(element, indent=0):
             elif element.tag == ElementTree.Comment:
                 rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
             else:
-                assert type(element.tag) is str, "Expected unicode, got %s, %s"%(type(element.tag), element.tag)
+                assert isinstance(element.tag, text_type), "Expected unicode, got %s, %s"%(type(element.tag), element.tag)
                 nsmatch = tag_regexp.match(element.tag)
 
                 if nsmatch is None:
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 """Module for supporting the lxml.etree library. The idea here is to use as much
 of the native library as possible, without using fragile hacks like custom element
 names that break between releases. The downside of this is that we cannot represent
@@ -297,7 +299,7 @@ def insertCommentMain(self, data, parent=None):
         if (parent == self.document and
             type(self.document._elementTree.getroot()[-1].tag) == type(etree.Comment)):
                 warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
-        super().insertComment(data, parent)
+        super(TreeBuilder, self).insertComment(data, parent)
     
     def insertRoot(self, token):
         """Create the document root"""
diff --git a/html5lib/treebuilders/simpletree.py b/html5lib/treebuilders/simpletree.py
@@ -1,3 +1,6 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
 from . import _base
 from html5lib.constants import voidElements, namespaces, prefixes
 from xml.sax.saxutils import escape
@@ -25,7 +28,7 @@ def toxml(self):
         raise NotImplementedError
 
     def printTree(self, indent=0):
-        tree = '\n|%s%s' % (' '* indent, str(self))
+        tree = '\n|%s%s' % (' '* indent, text_type(self))
         for child in self.childNodes:
             tree += child.printTree(indent + 2)
         return tree
@@ -40,7 +43,7 @@ def appendChild(self, node):
         node.parent = self
 
     def insertText(self, data, insertBefore=None):
-        assert isinstance(data, str), "data %s is of type %s expected unicode"%(repr(data), type(data))
+        assert isinstance(data, text_type), "data %s is of type %s expected unicode"%(repr(data), type(data))
         if insertBefore is None:
             self.appendChild(TextNode(data))
         else:
@@ -102,7 +105,7 @@ def hilite(self, encoding="utf-8"):
         return result.encode(encoding) + "</pre>"
     
     def printTree(self):
-        tree = str(self)
+        tree = text_type(self)
         for child in self.childNodes:
             tree += child.printTree(2)
         return tree
@@ -203,7 +206,7 @@ def hilite(self):
         return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name
 
     def printTree(self, indent):
-        tree = '\n|%s%s' % (' '*indent, str(self))
+        tree = '\n|%s%s' % (' '*indent, text_type(self))
         indent += 2
         if self.attributes:
             for name, value in sorted(self.attributes.items()):
diff --git a/html5lib/treebuilders/soup.py b/html5lib/treebuilders/soup.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 import warnings
 
 warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
 """A collection of modules for iterating through different kinds of
 tree, generating tokens identical to those produced by the tokenizer
 module.
diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
diff --git a/html5lib/treewalkers/dom.py b/html5lib/treewalkers/dom.py
diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
diff --git a/html5lib/treewalkers/pulldom.py b/html5lib/treewalkers/pulldom.py
diff --git a/html5lib/treewalkers/simpletree.py b/html5lib/treewalkers/simpletree.py
diff --git a/html5lib/treewalkers/soup.py b/html5lib/treewalkers/soup.py
diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py
diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py
diff --git a/html5lib/trie/datrie.py b/html5lib/trie/datrie.py
diff --git a/html5lib/trie/py.py b/html5lib/trie/py.py
diff --git a/html5lib/utils.py b/html5lib/utils.py
diff --git a/setup.py b/setup.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`"""`
`2`	`4`	`HTML parsing library based on the WHATWG "HTML5"`
`3`	`5`	`specification. The parser is designed to be compatible with existing`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`import string, gettext`
`2`	`4`	`_ = gettext.gettext`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`
`2`	`4`	`class Filter(object):`
`3`	`5`	`def __init__(self, source):`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`from . import _base`
`2`	`4`
`3`	`5`	`class Filter(_base.Filter):`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`from gettext import gettext`
`2`	`4`	`_ = gettext`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`import re`
`2`	`4`
`3`	`5`	`from . import _base`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`
`2`	`4`	`from html5lib import treewalkers`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`from .htmlserializer import HTMLSerializer`
`2`	`4`
`3`	`5`	`class XHTMLSerializer(HTMLSerializer):`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`import sys`
`2`	`4`	`import os`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import absolute_import, division, unicode_literals`
	`2`	`+`
`1`	`3`	`def f1():`
`2`	`4`	`x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"`
`3`	`5`	`y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"`