Allow for Python implementations that don't support lone surrogates (read: Jython).

gsnedders · gsnedders · commit b51828b20c3a · 2015-04-28T23:30:58.000+01:00
This is based on earlier work by Jim Baker (thanks!).

The two major parts of this are:

 * Avoiding having lone surrogates in any string literals, and
 * Avoiding tests that contain lone surrogates.

As part of this, the decoder for double-escaped tokenizer tests is rewritten
to avoid the unicode_escape codec, which requires ISO-8859-1 input and so
mishandles the arbitrary (non-ASCII) unicode these tests contain.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -32,3 +32,4 @@ Patches and suggestions
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Jim Baker
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,9 +4,10 @@ Change Log
 0.9999
 ~~~~~~
 
-Released on XXX, 2014
+Released on XXX, 2015
 
-* XXX
+* Add support for Python implementations that don't support lone surrogates
+  (read: Jython).
 
 
 0.999
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
+
+if utils.supports_lone_surrogates:
+    # Build the surrogate range with eval so this file never contains a
+    # lone-surrogate literal (illegal on platforms without such support).
+    # The range must go inside the character class, so strip the closing
+    # "]" of invalid_unicode_no_surrogate and re-append it after the range.
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') + "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ def __init__(self, source):
 
         """
 
-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if not utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+            self.replaceCharactersRegexp = None
+        elif len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
+            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+            self.replaceCharactersRegexp = re.compile(
+                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
 
         # List of where new lines occur
         self.newLines = [0]
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        self.reportCharacterErrors(data)
+        if self.reportCharacterErrors:
+            self.reportCharacterErrors(data)
 
-        # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
+            # Replace invalid characters
+            # Note U+0000 is dealt with in the tokenizer
+            data = self.replaceCharactersRegexp.sub("\ufffd", data)
 
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
@@ -4,10 +4,12 @@
 import warnings
 import re
 
+from six import unichr
+
 from .support import get_data_files
 
 from html5lib.tokenizer import HTMLTokenizer
-from html5lib import constants
+from html5lib import constants, utils
 
 
 class TokenizerTestParser(object):
@@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
         return tokens["expected"] == tokens["received"]
 
 
+_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
+
+
 def unescape(test):
     def decode(inp):
-        return inp.encode("utf-8").decode("unicode-escape")
+        """Decode \\uXXXX escapes
+
+        This decodes \\uXXXX escapes, possibly into non-BMP characters when
+        two surrogate character escapes are adjacent to each other.
+        """
+        # This cannot be implemented using the unicode_escape codec
+        # because that requires its input be ISO-8859-1, and we need
+        # arbitrary unicode as input.
+        def repl(m):
+            if m.group(2) is not None:
+                high = int(m.group(1), 16)
+                low = int(m.group(2), 16)
+                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
+                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
+                    return unichr(cp)
+                else:
+                    return unichr(high) + unichr(low)
+            else:
+                return unichr(int(m.group(1), 16))
+        try:
+            return _surrogateRe.sub(repl, inp)
+        except ValueError:
+            # This occurs when unichr throws ValueError, which should
+            # only be for a lone-surrogate.
+            if utils.supports_lone_surrogates:
+                raise
+            return None
 
     test["input"] = decode(test["input"])
     for token in test["output"]:
@@ -183,6 +214,8 @@ def testTokenizer():
                         test["initialStates"] = ["Data state"]
                     if 'doubleEscaped' in test:
                         test = unescape(test)
+                        if test["input"] is None:
+                            continue  # Not valid input for this platform
                     for initialState in test["initialStates"]:
                         test["initialState"] = capitalize(initialState)
                         yield runTokenizerTest, test
diff --git a/html5lib/utils.py b/html5lib/utils.py
@@ -2,14 +2,35 @@
 
 from types import ModuleType
 
+from six import text_type
+
 try:
     import xml.etree.cElementTree as default_etree
 except ImportError:
     import xml.etree.ElementTree as default_etree
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    _x = eval('"\\uD800"')
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')
+        assert isinstance(_x, text_type)
+except Exception:  # not bare: let KeyboardInterrupt/SystemExit propagate
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True
 
 
 class MethodDispatcher(dict):