Skip to content

Commit bd57c61

Browse files
committed
Bring the Python tokenizer up to date with the end-of-September version of the spec.
1 parent 7d29315 commit bd57c61

File tree

1 file changed

+75
-5
lines changed

1 file changed

+75
-5
lines changed

src/html5lib/tokenizer.py

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,7 @@ def afterDoctypeNameState(self):
955955
matched = False
956956
break
957957
if matched:
958-
self.state = self.beforeDoctypePublicIdentifierState
958+
self.state = self.afterDoctypePublicKeywordState
959959
return True
960960
elif data in (u"s", u"S"):
961961
matched = True
@@ -966,7 +966,7 @@ def afterDoctypeNameState(self):
966966
matched = False
967967
break
968968
if matched:
969-
self.state = self.beforeDoctypeSystemIdentifierState
969+
self.state = self.afterDoctypeSystemKeywordState
970970
return True
971971

972972
# All the characters read before the current 'data' will be
@@ -981,6 +981,26 @@ def afterDoctypeNameState(self):
981981
self.state = self.bogusDoctypeState
982982

983983
return True
984+
985+
def afterDoctypePublicKeywordState(self):
    """Consume one character in the "after DOCTYPE public keyword" state.

    Reads a single character from ``self.stream`` and then:

    * whitespace -- switches to ``beforeDoctypePublicIdentifierState``;
    * EOF -- queues an ``eof-in-doctype`` parse error, marks the current
      token as not correct, emits it and switches to ``dataState``;
    * a quote character -- queues an ``unexpected-char-in-doctype`` parse
      error, pushes the character back onto the stream and switches to
      ``beforeDoctypePublicIdentifierState``;
    * anything else -- pushes the character back and switches to
      ``beforeDoctypePublicIdentifierState`` without reporting an error.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True

    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # A quote directly after the keyword is reported as an error; every
    # remaining character is simply handed back so the next state can
    # examine it.
    if char in ("'", '"'):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
    self.stream.unget(char)
    self.state = self.beforeDoctypePublicIdentifierState
    return True
9841004

9851005
def beforeDoctypePublicIdentifierState(self):
9861006
data = self.stream.char()
@@ -1054,17 +1074,47 @@ def doctypePublicIdentifierSingleQuotedState(self):
10541074
def afterDoctypePublicIdentifierState(self):
10551075
data = self.stream.char()
10561076
if data in spaceCharacters:
1057-
pass
1058-
elif data == "\"":
1077+
self.state = self.betweenDoctypePublicAndSystemIdentifiersState
1078+
elif data == ">":
1079+
self.tokenQueue.append(self.currentToken)
1080+
self.state = self.dataState
1081+
elif data == '"':
1082+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1083+
"unexpected-char-in-doctype"})
10591084
self.currentToken["systemId"] = u""
10601085
self.state = self.doctypeSystemIdentifierDoubleQuotedState
10611086
elif data == "'":
1087+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1088+
"unexpected-char-in-doctype"})
10621089
self.currentToken["systemId"] = u""
10631090
self.state = self.doctypeSystemIdentifierSingleQuotedState
1091+
elif data is EOF:
1092+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1093+
"eof-in-doctype"})
1094+
self.currentToken["correct"] = False
1095+
self.tokenQueue.append(self.currentToken)
1096+
self.state = self.dataState
1097+
else:
1098+
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
1099+
"unexpected-char-in-doctype"})
1100+
self.currentToken["correct"] = False
1101+
self.state = self.bogusDoctypeState
1102+
return True
1103+
1104+
def betweenDoctypePublicAndSystemIdentifiersState(self):
1105+
data = self.stream.char()
1106+
if data in spaceCharacters:
1107+
pass
10641108
elif data == ">":
10651109
self.tokenQueue.append(self.currentToken)
10661110
self.state = self.dataState
1067-
elif data is EOF:
1111+
elif data == '"':
1112+
self.currentToken["systemId"] = u""
1113+
self.state = self.doctypeSystemIdentifierDoubleQuotedState
1114+
elif data == "'":
1115+
self.currentToken["systemId"] = u""
1116+
self.state = self.doctypeSystemIdentifierSingleQuotedState
1117+
elif data == EOF:
10681118
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10691119
"eof-in-doctype"})
10701120
self.currentToken["correct"] = False
@@ -1077,6 +1127,26 @@ def afterDoctypePublicIdentifierState(self):
10771127
self.state = self.bogusDoctypeState
10781128
return True
10791129

1130+
def afterDoctypeSystemKeywordState(self):
    """Consume one character in the "after DOCTYPE system keyword" state.

    Reads a single character from ``self.stream`` and then:

    * whitespace -- switches to ``beforeDoctypeSystemIdentifierState``;
    * EOF -- queues an ``eof-in-doctype`` parse error, marks the current
      token as not correct, emits it and switches to ``dataState``;
    * a quote character -- queues an ``unexpected-char-in-doctype`` parse
      error, pushes the character back onto the stream and switches to
      ``beforeDoctypeSystemIdentifierState``;
    * anything else -- pushes the character back and switches to
      ``beforeDoctypeSystemIdentifierState`` without reporting an error.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # A quote directly after the keyword is reported as an error; every
    # remaining character is simply handed back so the next state can
    # examine it.
    if char in ("'", '"'):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
    self.stream.unget(char)
    self.state = self.beforeDoctypeSystemIdentifierState
    return True
1149+
10801150
def beforeDoctypeSystemIdentifierState(self):
10811151
data = self.stream.char()
10821152
if data in spaceCharacters:

0 commit comments

Comments
 (0)