Skip to content

Commit 82377ec

Browse files
committed
Make Py3 code work on Py2 as well.
Everything works except the lxml treewalkers, which really need to be rewritten.
1 parent 84b1710 commit 82377ec

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

52 files changed

+157
-41
lines changed

html5lib/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
"""
24
HTML parsing library based on the WHATWG "HTML5"
35
specification. The parser is designed to be compatible with existing

html5lib/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import string, gettext
24
_ = gettext.gettext
35

html5lib/filters/_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13

24
class Filter(object):
35
def __init__(self, source):

html5lib/filters/inject_meta_charset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
from . import _base
24

35
class Filter(_base.Filter):

html5lib/filters/lint.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
from gettext import gettext
24
_ = gettext
35

html5lib/filters/optionaltags.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
from . import _base
24

35
class Filter(_base.Filter):

html5lib/filters/sanitizer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
from . import _base
24
from html5lib.sanitizer import HTMLSanitizerMixin
35

html5lib/filters/whitespace.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import re
24

35
from . import _base

html5lib/html5parser.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
from six import with_metaclass
3+
14
import sys
25
import types
36

@@ -444,7 +447,7 @@ def getMetaclass(use_metaclass, metaclass_func):
444447
else:
445448
return type
446449

447-
class Phase(object, metaclass=getMetaclass(debug, log)):
450+
class Phase(with_metaclass(getMetaclass(debug, log))):
448451
"""Base class for helper object that implements each phase of processing
449452
"""
450453

@@ -2686,7 +2689,7 @@ def impliedTagToken(name, type="EndTag", attributes = None,
26862689
selfClosing = False):
26872690
if attributes is None:
26882691
attributes = {}
2689-
return {"type":tokenTypes[type], "name":str(name), "data":attributes,
2692+
return {"type":tokenTypes[type], "name":name, "data":attributes,
26902693
"selfClosing":selfClosing}
26912694

26922695
class ParseError(Exception):

html5lib/ihatexml.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import re
24
import warnings
35

html5lib/inputstream.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
3+
14
import codecs
25
import re
36
import types
@@ -118,9 +121,9 @@ def _readFromBuffer(self, bytes):
118121

119122
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
120123
if hasattr(source, "read"):
121-
isUnicode = isinstance(source.read(0), str)
124+
isUnicode = isinstance(source.read(0), text_type)
122125
else:
123-
isUnicode = isinstance(source, str)
126+
isUnicode = isinstance(source, text_type)
124127

125128
if isUnicode:
126129
if encoding is not None:
@@ -565,6 +568,10 @@ def __next__(self):
565568
raise TypeError
566569
return self[p:p+1]
567570

571+
def next(self):
572+
# Py2 compat
573+
return self.__next__()
574+
568575
def previous(self):
569576
p = self._position
570577
if p >= len(self):
@@ -641,6 +648,7 @@ def jumpTo(self, bytes):
641648
else:
642649
raise StopIteration
643650

651+
644652
class EncodingParser(object):
645653
"""Mini parser for detecting character encoding from meta elements"""
646654

html5lib/sanitizer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import re
24
from xml.sax.saxutils import escape, unescape
35

html5lib/serializer/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13

24
from html5lib import treewalkers
35

html5lib/serializer/htmlserializer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
3+
14
import gettext
25
_ = gettext.gettext
36

@@ -158,14 +161,14 @@ def __init__(self, **kwargs):
158161
self.strict = False
159162

160163
def encode(self, string):
161-
assert(isinstance(string, str))
164+
assert(isinstance(string, text_type))
162165
if self.encoding:
163166
return string.encode(self.encoding, unicode_encode_errors)
164167
else:
165168
return string
166169

167170
def encodeStrict(self, string):
168-
assert(isinstance(string, str))
171+
assert(isinstance(string, text_type))
169172
if self.encoding:
170173
return string.encode(self.encoding, "strict")
171174
else:

html5lib/serializer/xhtmlserializer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
from .htmlserializer import HTMLSerializer
24

35
class XHTMLSerializer(HTMLSerializer):

html5lib/tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import sys
24
import os
35

html5lib/tests/mockParser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import sys
24
import os
35

html5lib/tests/performance/concatenation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
def f1():
24
x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
35
y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"

html5lib/tests/support.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import os
24
import sys
35
import codecs

html5lib/tests/test_encoding.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import re
24
import os
35
import unittest

html5lib/tests/test_parser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import os
24
import sys
35
import traceback

html5lib/tests/test_parser2.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import io
24

35
from . import support

html5lib/tests/test_sanitizer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import os
24
import sys
35
import unittest

html5lib/tests/test_serializer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import os
24
import unittest
35
from .support import get_data_files

html5lib/tests/test_stream.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
from . import support
24
import unittest, codecs
35

html5lib/tests/test_tokenizer.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13

24

35
import sys
@@ -157,9 +159,9 @@ def runTokenizerTest(test):
157159
received = normalizeTokens(tokens)
158160
errorMsg = "\n".join(["\n\nInitial state:",
159161
test['initialState'] ,
160-
"\nInput:", str(test['input']),
161-
"\nExpected:", str(expected),
162-
"\nreceived:", str(tokens)])
162+
"\nInput:", test['input'],
163+
"\nExpected:", repr(expected),
164+
"\nreceived:", repr(tokens)])
163165
errorMsg = errorMsg
164166
ignoreErrorOrder = test.get('ignoreErrorOrder', False)
165167
assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg

html5lib/tests/test_treewalkers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import os
24
import sys
35
import unittest

html5lib/tests/test_whitespace_filter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import unittest
24

35
from html5lib.filters.whitespace import Filter

html5lib/tests/tokenizertotree.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import sys
24
import os
35
import json

html5lib/tokenizer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
3+
try:
4+
chr = unichr
5+
except NameError:
6+
pass
7+
18
from collections import deque
29

310
from .constants import spaceCharacters
@@ -122,7 +129,8 @@ def consumeNumberEntity(self, isHex):
122129
# within the BMP.
123130
char = chr(charAsInt)
124131
except ValueError:
125-
char = eval("u'\\U%08x'" % charAsInt)
132+
v = charAsInt - 0x10000
133+
char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
126134

127135
# Discard the ; if present. Otherwise, put it back on the queue and
128136
# invoke parseError on parser.

html5lib/treebuilders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
"""A collection of modules for building different kinds of tree from
24
HTML documents.
35

html5lib/treebuilders/_base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
3+
14
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
25

36
# The scope markers are inserted when entering object elements,
@@ -281,7 +284,7 @@ def _setInsertFromTable(self, value):
281284

282285
def insertElementNormal(self, token):
283286
name = token["name"]
284-
assert type(name) == str, "Element %s not unicode"%name
287+
assert isinstance(name, text_type), "Element %s not unicode"%name
285288
namespace = token.get("namespace", self.defaultNamespace)
286289
element = self.elementClass(name, namespace)
287290
element.attributes = token["data"]

html5lib/treebuilders/dom.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13

24
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
35
import re

html5lib/treebuilders/etree.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
3+
14
import re
25

36
from . import _base
@@ -207,7 +210,7 @@ def serializeElement(element, indent=0):
207210
elif element.tag == ElementTree.Comment:
208211
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
209212
else:
210-
assert type(element.tag) is str, "Expected unicode, got %s, %s"%(type(element.tag), element.tag)
213+
assert isinstance(element.tag, text_type), "Expected unicode, got %s, %s"%(type(element.tag), element.tag)
211214
nsmatch = tag_regexp.match(element.tag)
212215

213216
if nsmatch is None:

html5lib/treebuilders/etree_lxml.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
"""Module for supporting the lxml.etree library. The idea here is to use as much
24
of the native library as possible, without using fragile hacks like custom element
35
names that break between releases. The downside of this is that we cannot represent
@@ -297,7 +299,7 @@ def insertCommentMain(self, data, parent=None):
297299
if (parent == self.document and
298300
type(self.document._elementTree.getroot()[-1].tag) == type(etree.Comment)):
299301
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
300-
super().insertComment(data, parent)
302+
super(TreeBuilder, self).insertComment(data, parent)
301303

302304
def insertRoot(self, token):
303305
"""Create the document root"""

html5lib/treebuilders/simpletree.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
3+
14
from . import _base
25
from html5lib.constants import voidElements, namespaces, prefixes
36
from xml.sax.saxutils import escape
@@ -25,7 +28,7 @@ def toxml(self):
2528
raise NotImplementedError
2629

2730
def printTree(self, indent=0):
28-
tree = '\n|%s%s' % (' '* indent, str(self))
31+
tree = '\n|%s%s' % (' '* indent, text_type(self))
2932
for child in self.childNodes:
3033
tree += child.printTree(indent + 2)
3134
return tree
@@ -40,7 +43,7 @@ def appendChild(self, node):
4043
node.parent = self
4144

4245
def insertText(self, data, insertBefore=None):
43-
assert isinstance(data, str), "data %s is of type %s expected unicode"%(repr(data), type(data))
46+
assert isinstance(data, text_type), "data %s is of type %s expected unicode"%(repr(data), type(data))
4447
if insertBefore is None:
4548
self.appendChild(TextNode(data))
4649
else:
@@ -102,7 +105,7 @@ def hilite(self, encoding="utf-8"):
102105
return result.encode(encoding) + "</pre>"
103106

104107
def printTree(self):
105-
tree = str(self)
108+
tree = text_type(self)
106109
for child in self.childNodes:
107110
tree += child.printTree(2)
108111
return tree
@@ -203,7 +206,7 @@ def hilite(self):
203206
return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name
204207

205208
def printTree(self, indent):
206-
tree = '\n|%s%s' % (' '*indent, str(self))
209+
tree = '\n|%s%s' % (' '*indent, text_type(self))
207210
indent += 2
208211
if self.attributes:
209212
for name, value in sorted(self.attributes.items()):

html5lib/treebuilders/soup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
import warnings
24

35
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)

html5lib/treewalkers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
13
"""A collection of modules for iterating through different kinds of
24
tree, generating tokens identical to those produced by the tokenizer
35
module.

0 commit comments

Comments
 (0)