
Commit d81e892: Rebuild Py3
1 parent 315d482

9 files changed (+31, -39 lines)

html5lib/inputstream.py (3 additions, 5 deletions)

@@ -193,9 +193,9 @@ def openStream(self, source):
         else:
             # Otherwise treat source as a string and convert to a file object
             if isinstance(source, str):
-                # This can error (on invalid characters, thus the need for the argument)
-                source = source.encode('utf-32', errors="replace")
-                self.charEncoding = ("utf-32", "certain")
+                # XXX: we should handle lone surrogates here
+                source = source.encode('utf-8', errors="replace")
+                self.charEncoding = ("utf-8", "certain")
             try:
                 from io import BytesIO
             except:
@@ -783,9 +783,7 @@ def codecName(encoding):
     """Return the python codec name corresponding to an encoding or None if the
     string doesn't correspond to a valid encoding."""
     if encoding:
-        print(encoding)
         canonicalName = ascii_punctuation_re.sub("", encoding).lower()
-        print(canonicalName)
        return encodings.get(canonicalName, None)
     else:
         return None
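
A minimal sketch, not part of the commit, of what the new str-handling branch does: the text source is encoded to UTF-8 bytes so the rest of openStream can treat it as a byte stream, and errors="replace" keeps the encode from raising on characters UTF-8 cannot represent, such as the lone surrogates the XXX comment refers to. The sample string is illustrative.

    from io import BytesIO

    source = "caf\u00e9 plus a lone surrogate: \ud800"
    data = source.encode("utf-8", errors="replace")  # lone surrogate becomes b"?"
    stream = BytesIO(data)                           # downstream code reads bytes again
    print(data)                                      # b'caf\xc3\xa9 plus a lone surrogate: ?'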

html5lib/sanitizer.py (3 additions, 3 deletions)

@@ -49,8 +49,8 @@ class HTMLSanitizerMixin(object):
         'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
         'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
         'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
-        'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
-        'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
+        'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
+        'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
         'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
         'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
         'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
@@ -97,7 +97,7 @@ class HTMLSanitizerMixin(object):
         'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
         'y1', 'y2', 'zoomAndPan']
 
-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
+    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
         'xlink:href', 'xml:base']
 
     svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
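
With 'poster' added to attr_val_is_uri, its value should go through the same URI checks as src and href. A hedged usage sketch of how the sanitizer mixin was commonly wired up in html5lib of this vintage; the markup, the disallowed scheme, and the exact parse call are illustrative assumptions rather than something shown in the commit:

    import html5lib
    from html5lib import sanitizer

    parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    # A poster URL with a disallowed scheme should be stripped during sanitization.
    fragment = parser.parseFragment(
        '<video poster="javascript:alert(1)" preload="auto"></video>')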

html5lib/tests/support.py (1 addition, 2 deletions)

@@ -62,9 +62,8 @@
 except ImportError:
     pass
 
-def html5lib_test_files(subdirectory, files='*.dat'):
+def get_data_files(subdirectory, files='*.dat'):
     return glob.glob(os.path.join(test_dir,subdirectory,files))
-html5lib_test_files.__test__ = False
 
 class DefaultDict(dict):
     def __init__(self, default, *args, **kwargs):
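
The old helper had "test" in its name, so it needed the html5lib_test_files.__test__ = False marker to keep nose from collecting it as a test; renaming it to get_data_files presumably makes the marker unnecessary, which is why the commit drops that line. Call sites across the rest of the commit switch accordingly, roughly like this (directory and pattern are illustrative, following the call sites in this commit):

    from html5lib.tests.support import get_data_files

    for path in get_data_files('tree-construction'):   # pattern defaults to '*.dat'
        print(path)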

html5lib/tests/test_encoding.py (2 additions, 2 deletions)

@@ -7,7 +7,7 @@
 except AttributeError:
     unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
 
-from .support import html5lib_test_files, TestData, test_dir
+from .support import get_data_files, TestData, test_dir
 from html5lib import HTMLParser, inputstream
 
 class Html5EncodingTestCase(unittest.TestCase):
@@ -24,7 +24,7 @@ def test_codec_name_d(self):
         self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 def buildTestSuite():
-    for filename in html5lib_test_files("encoding"):
+    for filename in get_data_files("encoding"):
         test_name = os.path.basename(filename).replace('.dat',''). \
             replace('-','')
         tests = TestData(filename, "data")

html5lib/tests/test_parser.py (2 additions, 2 deletions)

@@ -7,7 +7,7 @@
 
 warnings.simplefilter("error")
 
-from .support import html5lib_test_files as data_files
+from .support import get_data_files
 from .support import TestData, convert, convertExpected, treeTypes
 import html5lib
 from html5lib import html5parser, treebuilders, constants
@@ -67,7 +67,7 @@ def runParserTest(innerHTML, input, expected, errors, treeClass,
 
 def test_parser():
     sys.stderr.write('Testing tree builders '+ " ".join(list(treeTypes.keys())) + "\n")
-    files = data_files('tree-construction')
+    files = get_data_files('tree-construction')
 
     for filename in files:
         testName = os.path.basename(filename).replace(".dat","")

html5lib/tests/test_serializer.py (10 additions, 9 deletions)

@@ -1,6 +1,6 @@
 import os
 import unittest
-from .support import html5lib_test_files
+from .support import get_data_files
 
 try:
     import json
@@ -183,11 +183,12 @@ def testEntityNoResolve(self):
         self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
 
 def test_serializer():
-    for filename in html5lib_test_files('serializer', '*.test'):
-        tests = json.load(open(filename))
-        test_name = os.path.basename(filename).replace('.test','')
-        for index, test in enumerate(tests['tests']):
-            xhtml = test.get("xhtml", test["expected"])
-            if test_name == 'optionaltags':
-                xhtml = None
-            yield runSerializerTest, test["input"], test["expected"], xhtml, test.get("options", {})
+    for filename in get_data_files('serializer', '*.test'):
+        with open(filename) as fp:
+            tests = json.load(fp)
+        test_name = os.path.basename(filename).replace('.test','')
+        for index, test in enumerate(tests['tests']):
+            xhtml = test.get("xhtml", test["expected"])
+            if test_name == 'optionaltags':
+                xhtml = None
+            yield runSerializerTest, test["input"], test["expected"], xhtml, test.get("options", {})
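
Besides the helper rename, the rewritten loop loads the JSON inside a with block so the file object is closed deterministically. A small sketch of that pattern (function name illustrative): on Python 3 an unclosed file can emit a ResourceWarning, which is worth avoiding in a suite where other modules in this commit escalate warnings to errors via warnings.simplefilter("error").

    import json

    def load_test_file(filename):
        # The context manager closes the file even if json.load raises.
        with open(filename) as fp:
            return json.load(fp)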

html5lib/tests/test_tokenizer.py (7 additions, 13 deletions)

@@ -2,7 +2,6 @@
 
 import sys
 import os
-import unittest
 import io
 import warnings
 import re
@@ -12,7 +11,7 @@
 except ImportError:
     import simplejson as json
 
-from .support import html5lib_test_files
+from .support import get_data_files
 from html5lib.tokenizer import HTMLTokenizer
 from html5lib import constants
 
@@ -124,7 +123,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
             tokens[tokenType][1].append(token)
     return tokens["expected"] == tokens["received"]
 
-def unescape_test(test):
+def unescape(test):
     def decode(inp):
         return inp.encode("utf-8").decode("unicode-escape")
 
@@ -139,14 +138,12 @@ def decode(inp):
                 del token[2][key]
                 token[2][decode(key)] = decode(value)
     return test
-unescape_test.__test__ = False
-
 
 def runTokenizerTest(test):
     #XXX - move this out into the setup function
     #concatenate all consecutive character tokens into a single token
     if 'doubleEscaped' in test:
-        test = unescape_test(test)
+        test = unescape(test)
 
     expected = concatenateCharacterTokens(test['output'])
     if 'lastStartTag' not in test:
@@ -166,8 +163,7 @@ def runTokenizerTest(test):
                            "\nreceived:", str(tokens)])
     errorMsg = errorMsg
     ignoreErrorOrder = test.get('ignoreErrorOrder', False)
-    assert tokensMatch(expected, received, ignoreErrorOrder), errorMsg
-
+    assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg
 
 def _doCapitalize(match):
     return match.group(1).upper()
@@ -179,19 +175,17 @@ def capitalize(s):
     s = _capitalizeRe(_doCapitalize, s)
     return s
 
-
-def test_tokenizer():
-    for filename in html5lib_test_files('tokenizer', '*.test'):
+def testTokenizer():
+    for filename in get_data_files('tokenizer', '*.test'):
         with open(filename) as fp:
             tests = json.load(fp)
             testName = os.path.basename(filename).replace(".test","")
             if 'tests' in tests:
                 for index,test in enumerate(tests['tests']):
-                    #Skip tests with a self closing flag
+                    #Skip tests with a self closing flag
                     skip = False
                     if 'initialStates' not in test:
                         test["initialStates"] = ["Data state"]
                     for initialState in test["initialStates"]:
                         test["initialState"] = capitalize(initialState)
                         yield runTokenizerTest, test
-

html5lib/tests/test_treewalkers.py (2 additions, 2 deletions)

@@ -10,7 +10,7 @@
 
 warnings.simplefilter("error")
 
-from .support import html5lib_test_files, TestData, convertExpected
+from .support import get_data_files, TestData, convertExpected
 
 from html5lib import html5parser, treewalkers, treebuilders, constants
 from html5lib.filters.lint import Filter as LintFilter, LintError
@@ -298,7 +298,7 @@ def test_treewalker():
     sys.stdout.write('Testing tree walkers '+ " ".join(list(treeTypes.keys())) + "\n")
 
     for treeName, treeCls in treeTypes.items():
-        files = html5lib_test_files('tree-construction')
+        files = get_data_files('tree-construction')
         for filename in files:
            testName = os.path.basename(filename).replace(".dat","")
 

html5lib/tests/tokenizertotree.py (1 addition, 1 deletion)

@@ -17,7 +17,7 @@ def main(out_path):
         sys.stderr.write("Path %s does not exist"%out_path)
         sys.exit(1)
 
-    for filename in support.html5lib_test_files('tokenizer', '*.test'):
+    for filename in support.get_data_files('tokenizer', '*.test'):
         run_file(filename, out_path)
 
 def run_file(filename, out_path):
