Removed non-ASCII bytes from JSON files, to avoid simplejson portability issues.

philiptaylor · philiptaylor · commit 5cc0cb644a53 · 2008-06-07T01:23:37.000Z
Fixed BeautifulSoup treewalker doctype/comment portability issues.

--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401167
diff --git a/src/html5lib/treewalkers/soup.py b/src/html5lib/treewalkers/soup.py
@@ -14,8 +14,12 @@ def getNodeDetails(self, node):
             return (_base.DOCUMENT,)
 
         elif isinstance(node, Declaration): # DocumentType
-            #Slice needed to remove markup added during unicode conversion
-            m = self.doctype_regexp.match(unicode(node.string)[2:-1])
+            string = unicode(node.string)
+            #Slice needed to remove the "<!" and ">" markup added during unicode
+            #conversion, but only some versions of BeautifulSoup/Python add it
+            if string.startswith('<!') and string.endswith('>'):
+                string = string[2:-1]
+            m = self.doctype_regexp.match(string)
             #This regexp approach seems wrong and fragile
             #but beautiful soup stores the doctype as a single thing and we want the seperate bits
             #It should work as long as the tree is created by html5lib itself but may be wrong if it's
@@ -31,7 +35,10 @@ def getNodeDetails(self, node):
             return _base.DOCTYPE, name, publicId or "", systemId or ""
 
         elif isinstance(node, Comment):
-            return _base.COMMENT, unicode(node.string)[4:-3]
+            string = unicode(node.string)
+            if string.startswith('<!--') and string.endswith('-->'):
+                string = string[4:-3]
+            return _base.COMMENT, string
 
         elif isinstance(node, unicode): # TextNode
             return _base.TEXT, node