Skip to content

Commit b39e8a0

Browse files
committed
Update initial insertion mode to really match the spec (reorder it, and fix the quirks mode triggers).
1 parent c5de2df commit b39e8a0

File tree

1 file changed

+88
-107
lines changed

1 file changed

+88
-107
lines changed

src/html5lib/html5parser.py

Lines changed: 88 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -460,152 +460,133 @@ def processEndTag(self, token):
460460
self.endTagHandler[token["name"]](token)
461461

462462
class InitialPhase(Phase):
463-
# This phase deals with error handling as well which is currently not
464-
# covered in the specification. The error handling is typically known as
465-
# "quirks mode". It is expected that a future version of HTML5 will define
466-
# this.
467-
def processEOF(self):
468-
self.parser.parseError("expected-doctype-but-got-eof")
469-
self.parser.compatMode = "quirks"
470-
self.parser.phase = self.parser.phases["beforeHtml"]
471-
self.parser.phase.processEOF()
472-
463+
def processSpaceCharacters(self, token):
464+
pass
465+
473466
def processComment(self, token):
474467
self.tree.insertComment(token, self.tree.document)
475468

476469
def processDoctype(self, token):
477-
478470
name = token["name"]
479471
publicId = token["publicId"]
480472
systemId = token["systemId"]
481473
correct = token["correct"]
482474

483475
if (name != "html" or publicId != None or
484-
systemId != None):
476+
systemId != None and systemId != "about:legacy-compat"):
485477
self.parser.parseError("unknown-doctype")
486478

487479
if publicId is None:
488480
publicId = ""
489-
if systemId is None:
490-
systemId = ""
491481

492482
self.tree.insertDoctype(token)
493483

494484
if publicId != "":
495485
publicId = publicId.translate(asciiUpper2Lower)
496486

497487
if (not correct or token["name"] != "html"
498-
or publicId in
499-
("+//silmaril//dtd html pro v0r11 19970101//en",
500-
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
501-
"-//as//dtd html 3.0 aswedit + extensions//en",
502-
"-//ietf//dtd html 2.0 level 1//en",
503-
"-//ietf//dtd html 2.0 level 2//en",
504-
"-//ietf//dtd html 2.0 strict level 1//en",
505-
"-//ietf//dtd html 2.0 strict level 2//en",
506-
"-//ietf//dtd html 2.0 strict//en",
507-
"-//ietf//dtd html 2.0//en",
508-
"-//ietf//dtd html 2.1e//en",
509-
"-//ietf//dtd html 3.0//en",
510-
"-//ietf//dtd html 3.0//en//",
511-
"-//ietf//dtd html 3.2 final//en",
512-
"-//ietf//dtd html 3.2//en",
513-
"-//ietf//dtd html 3//en",
514-
"-//ietf//dtd html level 0//en",
515-
"-//ietf//dtd html level 0//en//2.0",
516-
"-//ietf//dtd html level 1//en",
517-
"-//ietf//dtd html level 1//en//2.0",
518-
"-//ietf//dtd html level 2//en",
519-
"-//ietf//dtd html level 2//en//2.0",
520-
"-//ietf//dtd html level 3//en",
521-
"-//ietf//dtd html level 3//en//3.0",
522-
"-//ietf//dtd html strict level 0//en",
523-
"-//ietf//dtd html strict level 0//en//2.0",
524-
"-//ietf//dtd html strict level 1//en",
525-
"-//ietf//dtd html strict level 1//en//2.0",
526-
"-//ietf//dtd html strict level 2//en",
527-
"-//ietf//dtd html strict level 2//en//2.0",
528-
"-//ietf//dtd html strict level 3//en",
529-
"-//ietf//dtd html strict level 3//en//3.0",
530-
"-//ietf//dtd html strict//en",
531-
"-//ietf//dtd html strict//en//2.0",
532-
"-//ietf//dtd html strict//en//3.0",
533-
"-//ietf//dtd html//en",
534-
"-//ietf//dtd html//en//2.0",
535-
"-//ietf//dtd html//en//3.0",
536-
"-//metrius//dtd metrius presentational//en",
537-
"-//microsoft//dtd internet explorer 2.0 html strict//en",
538-
"-//microsoft//dtd internet explorer 2.0 html//en",
539-
"-//microsoft//dtd internet explorer 2.0 tables//en",
540-
"-//microsoft//dtd internet explorer 3.0 html strict//en",
541-
"-//microsoft//dtd internet explorer 3.0 html//en",
542-
"-//microsoft//dtd internet explorer 3.0 tables//en",
543-
"-//netscape comm. corp.//dtd html//en",
544-
"-//netscape comm. corp.//dtd strict html//en",
545-
"-//o'reilly and associates//dtd html 2.0//en",
546-
"-//o'reilly and associates//dtd html extended 1.0//en",
547-
"-//o'reilly and associates//dtd html extended relaxed 1.0//en",
548-
"-//spyglass//dtd html 2.0 extended//en",
549-
"-//sq//dtd html 2.0 hotmetal + extensions//en",
550-
"-//sun microsystems corp.//dtd hotjava html//en",
551-
"-//sun microsystems corp.//dtd hotjava strict html//en",
552-
"-//w3c//dtd html 3 1995-03-24//en",
553-
"-//w3c//dtd html 3.2 draft//en",
554-
"-//w3c//dtd html 3.2 final//en",
555-
"-//w3c//dtd html 3.2//en",
556-
"-//w3c//dtd html 3.2s draft//en",
557-
"-//w3c//dtd html 4.0 frameset//en",
558-
"-//w3c//dtd html 4.0 transitional//en",
559-
"-//w3c//dtd html experimental 19960712//en",
560-
"-//w3c//dtd html experimental 970421//en",
561-
"-//w3c//dtd w3 html//en",
562-
"-//w3o//dtd w3 html 3.0//en",
563-
"-//w3o//dtd w3 html 3.0//en//",
564-
"-//w3o//dtd w3 html strict 3.0//en//",
565-
"-//webtechs//dtd mozilla html 2.0//en",
566-
"-//webtechs//dtd mozilla html//en",
567-
"-/w3c/dtd html 4.0 transitional/en",
568-
"html")
569-
or (publicId in
570-
("-//w3c//dtd html 4.01 frameset//EN",
571-
"-//w3c//dtd html 4.01 transitional//EN") and
572-
systemId == None)
573-
or (systemId != None and
574-
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
488+
or publicId.startswith(
489+
("+//silmaril//dtd html pro v0r11 19970101//",
490+
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
491+
"-//as//dtd html 3.0 aswedit + extensions//",
492+
"-//ietf//dtd html 2.0 level 1//",
493+
"-//ietf//dtd html 2.0 level 2//",
494+
"-//ietf//dtd html 2.0 strict level 1//",
495+
"-//ietf//dtd html 2.0 strict level 2//",
496+
"-//ietf//dtd html 2.0 strict//",
497+
"-//ietf//dtd html 2.0//",
498+
"-//ietf//dtd html 2.1e//",
499+
"-//ietf//dtd html 3.0//",
500+
"-//ietf//dtd html 3.2 final//",
501+
"-//ietf//dtd html 3.2//",
502+
"-//ietf//dtd html 3//",
503+
"-//ietf//dtd html level 0//",
504+
"-//ietf//dtd html level 1//",
505+
"-//ietf//dtd html level 2//",
506+
"-//ietf//dtd html level 3//",
507+
"-//ietf//dtd html strict level 0//",
508+
"-//ietf//dtd html strict level 1//",
509+
"-//ietf//dtd html strict level 2//",
510+
"-//ietf//dtd html strict level 3//",
511+
"-//ietf//dtd html strict//",
512+
"-//ietf//dtd html//",
513+
"-//metrius//dtd metrius presentational//",
514+
"-//microsoft//dtd internet explorer 2.0 html strict//",
515+
"-//microsoft//dtd internet explorer 2.0 html//",
516+
"-//microsoft//dtd internet explorer 2.0 tables//",
517+
"-//microsoft//dtd internet explorer 3.0 html strict//",
518+
"-//microsoft//dtd internet explorer 3.0 html//",
519+
"-//microsoft//dtd internet explorer 3.0 tables//",
520+
"-//netscape comm. corp.//dtd html//",
521+
"-//netscape comm. corp.//dtd strict html//",
522+
"-//o'reilly and associates//dtd html 2.0//",
523+
"-//o'reilly and associates//dtd html extended 1.0//",
524+
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
525+
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
526+
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
527+
"-//spyglass//dtd html 2.0 extended//",
528+
"-//sq//dtd html 2.0 hotmetal + extensions//",
529+
"-//sun microsystems corp.//dtd hotjava html//",
530+
"-//sun microsystems corp.//dtd hotjava strict html//",
531+
"-//w3c//dtd html 3 1995-03-24//",
532+
"-//w3c//dtd html 3.2 draft//",
533+
"-//w3c//dtd html 3.2 final//",
534+
"-//w3c//dtd html 3.2//",
535+
"-//w3c//dtd html 3.2s draft//",
536+
"-//w3c//dtd html 4.0 frameset//",
537+
"-//w3c//dtd html 4.0 transitional//",
538+
"-//w3c//dtd html experimental 19960712//",
539+
"-//w3c//dtd html experimental 970421//",
540+
"-//w3c//dtd w3 html//",
541+
"-//w3o//dtd w3 html 3.0//",
542+
"-//webtechs//dtd mozilla html 2.0//",
543+
"-//webtechs//dtd mozilla html//"))
544+
or publicId in
545+
("-//w3o//dtd w3 html strict 3.0//en//",
546+
"-/w3c/dtd html 4.0 transitional/en",
547+
"html")
548+
or publicId.startswith(
549+
("-//w3c//dtd html 4.01 frameset//",
550+
"-//w3c//dtd html 4.01 transitional//")) and
551+
systemId == None
552+
or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
575553
self.parser.compatMode = "quirks"
576-
elif (publicId in
577-
("-//w3c//dtd xhtml 1.0 frameset//EN",
578-
"-//w3c//dtd xhtml 1.0 transitional//EN")
579-
or (publicId in
580-
("-//w3c//dtd html 4.01 frameset//EN",
581-
"-//w3c//dtd html 4.01 transitional//EN") and
582-
systemId == None)):
554+
elif (publicId.startswith(
555+
("-//w3c//dtd xhtml 1.0 frameset//",
556+
"-//w3c//dtd xhtml 1.0 transitional//"))
557+
or publicId.startswith(
558+
("-//w3c//dtd html 4.01 frameset//",
559+
"-//w3c//dtd html 4.01 transitional//")) and
560+
systemId != None):
583561
self.parser.compatMode = "limited quirks"
584562

585563
self.parser.phase = self.parser.phases["beforeHtml"]
586-
587-
def processSpaceCharacters(self, token):
588-
pass
564+
565+
def anythingElse(self):
566+
self.parser.compatMode = "quirks"
567+
self.parser.phase = self.parser.phases["beforeHtml"]
589568

590569
def processCharacters(self, token):
591570
self.parser.parseError("expected-doctype-but-got-chars")
592-
self.parser.compatMode = "quirks"
593-
self.parser.phase = self.parser.phases["beforeHtml"]
571+
self.anythingElse()
594572
self.parser.phase.processCharacters(token)
595573

596574
def processStartTag(self, token):
597575
self.parser.parseError("expected-doctype-but-got-start-tag",
598576
{"name": token["name"]})
599-
self.parser.compatMode = "quirks"
600-
self.parser.phase = self.parser.phases["beforeHtml"]
577+
self.anythingElse()
601578
self.parser.phase.processStartTag(token)
602579

603580
def processEndTag(self, token):
604581
self.parser.parseError("expected-doctype-but-got-end-tag",
605582
{"name": token["name"]})
606-
self.parser.compatMode = "quirks"
607-
self.parser.phase = self.parser.phases["beforeHtml"]
583+
self.anythingElse()
608584
self.parser.phase.processEndTag(token)
585+
586+
def processEOF(self):
587+
self.parser.parseError("expected-doctype-but-got-eof")
588+
self.anythingElse()
589+
self.parser.phase.processEOF()
609590

610591

611592
class BeforeHtmlPhase(Phase):

0 commit comments

Comments
 (0)