From fd101b9747337dc45469459226d3abc30b95fb01 Mon Sep 17 00:00:00 2001 From: Karim Valiev Date: Thu, 2 May 2013 09:02:45 +0400 Subject: [PATCH 1/2] fix http://code.google.com/p/html5lib/issues/detail?id=186 https://github.com/html5lib/html5lib-python/issues/33 --- html5lib/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index c49eee0d..5812c9b6 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -306,7 +306,7 @@ def rcdataState(self): # have already been appended to lastFourChars and will have broken # any sequences else: - chars = self.stream.charsUntil(("&", "<")) + chars = self.stream.charsUntil(("&", "<", u"\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True @@ -1016,7 +1016,7 @@ def attributeValueDoubleQuotedState(self): self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ - self.stream.charsUntil(("\"", "&")) + self.stream.charsUntil(("\"", "&", u"\u0000")) return True def attributeValueSingleQuotedState(self): @@ -1035,7 +1035,7 @@ def attributeValueSingleQuotedState(self): self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ - self.stream.charsUntil(("'", "&")) + self.stream.charsUntil(("'", "&", u"\u0000")) return True def attributeValueUnQuotedState(self): @@ -1060,7 +1060,7 @@ def attributeValueUnQuotedState(self): self.state = self.dataState else: self.currentToken["data"][-1][1] += data + self.stream.charsUntil( - frozenset(("&", ">", '"', "'", "=", "<", "`")) | spaceCharacters) + frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) return True def afterAttributeValueState(self): From ab436183c532fe17612e12815f8ebfad9546ec4f Mon Sep 17 00:00:00 2001 From: Karim Valiev Date: Fri, 3 May 2013 11:57:48 +0400 Subject: [PATCH 2/2] removed unicode prefix --- html5lib/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 5812c9b6..79774578 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -306,7 +306,7 @@ def rcdataState(self): # have already been appended to lastFourChars and will have broken # any sequences else: - chars = self.stream.charsUntil(("&", "<", u"\u0000")) + chars = self.stream.charsUntil(("&", "<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True @@ -1016,7 +1016,7 @@ def attributeValueDoubleQuotedState(self): self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ - self.stream.charsUntil(("\"", "&", u"\u0000")) + self.stream.charsUntil(("\"", "&", "\u0000")) return True def attributeValueSingleQuotedState(self): @@ -1035,7 +1035,7 @@ def attributeValueSingleQuotedState(self): self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ - self.stream.charsUntil(("'", "&", u"\u0000")) + self.stream.charsUntil(("'", "&", "\u0000")) return True def attributeValueUnQuotedState(self):