Rewrite supports_lone_surrogates to feature-sniff.

gsnedders · gsnedders · commit e04fff83e370 · 2015-04-26T02:50:41.000+01:00
diff --git a/html5lib/utils.py b/html5lib/utils.py
@@ -1,8 +1,9 @@
 from __future__ import absolute_import, division, unicode_literals
 
-import platform
 from types import ModuleType
 
+from six import text_type
+
 try:
     import xml.etree.cElementTree as default_etree
 except ImportError:
@@ -15,13 +16,21 @@
 
 
 # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
-# added to the below test. In general this would be any platform using
-# UTF-16 as its encoding of unicode strings, such as Jython. This is
-# because UTF-16 itself is based on the use of such surrogates, and
-# there is no mechanism to further escape such escapes.
-#
-# Otherwise we assume such support.
-supports_lone_surrogates = platform.python_implementation() != "Jython"
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    _x = eval('"\\uD800"')  # eval defers parsing the literal to run time
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')
+        assert isinstance(_x, text_type)
+except Exception:  # not bare: KeyboardInterrupt/SystemExit must propagate
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True
 
 
 class MethodDispatcher(dict):