Skip to content

Commit 5cc0cb6

Browse files
committed
Removed non-ASCII bytes from JSON files, to avoid simplejson portability issues.
Fixed BeautifulSoup treewalker doctype/comment portability issues. --HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401167
1 parent 54181f0 commit 5cc0cb6

File tree

1 file changed

+10
-3
lines changed

1 file changed

+10
-3
lines changed

src/html5lib/treewalkers/soup.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,12 @@ def getNodeDetails(self, node):
1414
return (_base.DOCUMENT,)
1515

1616
elif isinstance(node, Declaration): # DocumentType
17-
#Slice needed to remove markup added during unicode conversion
18-
m = self.doctype_regexp.match(unicode(node.string)[2:-1])
17+
string = unicode(node.string)
18+
#Slice needed to remove markup added during unicode conversion,
19+
#but only in some versions of BeautifulSoup/Python
20+
if string.startswith('<!') and string.endswith('>'):
21+
string = string[2:-1]
22+
m = self.doctype_regexp.match(string)
1923
#This regexp approach seems wrong and fragile
2024
#but beautiful soup stores the doctype as a single thing and we want the separate bits
2125
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
@@ -31,7 +35,10 @@ def getNodeDetails(self, node):
3135
return _base.DOCTYPE, name, publicId or "", systemId or ""
3236

3337
elif isinstance(node, Comment):
34-
return _base.COMMENT, unicode(node.string)[4:-3]
38+
string = unicode(node.string)
39+
if string.startswith('<!--') and string.endswith('-->'):
40+
string = string[4:-3]
41+
return _base.COMMENT, string
3542

3643
elif isinstance(node, unicode): # TextNode
3744
return _base.TEXT, node

0 commit comments

Comments
 (0)