@@ -14,8 +14,12 @@ def getNodeDetails(self, node):
14
14
return (_base .DOCUMENT ,)
15
15
16
16
elif isinstance (node , Declaration ): # DocumentType
17
- #Slice needed to remove markup added during unicode conversion
18
- m = self .doctype_regexp .match (unicode (node .string )[2 :- 1 ])
17
+ string = unicode (node .string )
18
+ #Slice needed to remove markup added during unicode conversion,
19
+ #but only in some versions of BeautifulSoup/Python
20
+ if string .startswith ('<!' ) and string .endswith ('>' ):
21
+ string = string [2 :- 1 ]
22
+ m = self .doctype_regexp .match (string )
19
23
#This regexp approach seems wrong and fragile
20
24
#but beautiful soup stores the doctype as a single thing and we want the separate bits
21
25
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
@@ -31,7 +35,10 @@ def getNodeDetails(self, node):
31
35
return _base .DOCTYPE , name , publicId or "" , systemId or ""
32
36
33
37
elif isinstance (node , Comment ):
34
- return _base .COMMENT , unicode (node .string )[4 :- 3 ]
38
+ string = unicode (node .string )
39
+ if string .startswith ('<!--' ) and string .endswith ('-->' ):
40
+ string = string [4 :- 3 ]
41
+ return _base .COMMENT , string
35
42
36
43
elif isinstance (node , unicode ): # TextNode
37
44
return _base .TEXT , node
0 commit comments