Attempt at merging svgmathml branch to the default branch

jgraham · jgraham · commit babe4a3b4174 · 2009-05-30T23:07:14.000+02:00
--HG--
branch : svgmathml
rename : python/parse.py => python3/parse.py
rename : python/src/html5lib/__init__.py => python3/src/html5lib/__init__.py
rename : python/src/html5lib/constants.py => python3/src/html5lib/constants.py
rename : python/src/html5lib/filters/optionaltags.py => python3/src/html5lib/filters/optionaltags.py
rename : python/src/html5lib/html5parser.py => python3/src/html5lib/html5parser.py
rename : python/src/html5lib/inputstream.py => python3/src/html5lib/inputstream.py
rename : python/src/html5lib/sanitizer.py => python3/src/html5lib/sanitizer.py
rename : python/src/html5lib/serializer/__init__.py => python3/src/html5lib/serializer/__init__.py
rename : python/src/html5lib/tokenizer.py => python3/src/html5lib/tokenizer.py
rename : python/src/html5lib/treebuilders/etree_lxml.py => python3/src/html5lib/treebuilders/etree_lxml.py
rename : python/src/html5lib/treebuilders/simpletree.py => python3/src/html5lib/treebuilders/simpletree.py
rename : python/tests/test_encoding.py => python3/tests/test_encoding.py
rename : python/tests/test_parser.py => python3/tests/test_parser.py
rename : python/tests/test_tokenizer.py => python3/tests/test_tokenizer.py
diff --git a/parse.py b/parse.py
@@ -57,7 +57,7 @@ def parse():
     else:
         tokenizer = HTMLTokenizer
 
-    if opts.xml:
+    if opts.liberalxml:
         p = liberalxmlparser.XHTMLParser(tree=treebuilder, tokenizer=tokenizer)
     else:
         p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)
diff --git a/src/html5lib/constants.py b/src/html5lib/constants.py
@@ -1070,7 +1070,6 @@
     'utf16': 'utf-16',
     'utf16be': 'utf-16-be',
     'utf16le': 'utf-16-le',
-    'utf7': 'utf-7',
     'utf8': 'utf-8',
     'windows1250': 'cp1250',
     'windows1251': 'cp1251',
diff --git a/src/html5lib/filters/optionaltags.py b/src/html5lib/filters/optionaltags.py
@@ -31,7 +31,11 @@ def is_optional_start(self, tagname, previous, next):
         elif tagname == 'head':
             # A head element's start tag may be omitted if the first thing
             # inside the head element is an element.
-            return type == "StartTag"
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
         elif tagname == 'body':
             # A body element's start tag may be omitted if the first thing
             # inside the body element is not a space character or a comment,
@@ -52,7 +56,7 @@ def is_optional_start(self, tagname, previous, next):
             # inside the colgroup element is a col element, and if the element
             # is not immediately preceeded by another colgroup element whose
             # end tag has been omitted.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                 # XXX: we do not look at the preceding event, so instead we never
                 # omit the colgroup element's end tag when it is immediately
                 # followed by another colgroup element. See is_optional_end.
@@ -114,7 +118,7 @@ def is_optional_end(self, tagname, next):
             # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
             # nav, ol, p, pre, section, table, or ul, element, or if
             # there is no more content in the parent element.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                 return next["name"] in ('address', 'article', 'aside',     \
                     'blockquote', 'datagrid', 'dialog', 'dir', 'div',      \
                     'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 'h3',  \
diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py
@@ -108,7 +108,6 @@ def _parse(self, stream, innerHTML=False, container="div",
         # We only seem to have InBodyPhase testcases where the following is
         # relevant ... need others too
         self.lastPhase = None
-
         self.beforeRCDataPhase = None
 
         CharactersToken = tokenTypes["Characters"]
@@ -120,6 +119,8 @@ def _parse(self, stream, innerHTML=False, container="div",
         
         
         for token in self.normalizedTokens():
+            #print self.phase.__class__.__name__
+            #print token
             type = token["type"]
             if type == CharactersToken:
                 self.phase.processCharacters(token)
@@ -271,18 +272,6 @@ def __init__(self, parser, tree):
 
     def processEOF(self):
         raise NotImplementedError
-        self.tree.generateImpliedEndTags()
-        if len(self.tree.openElements) > 2:
-            self.parser.parseError("expected-closing-tag-but-got-eof")
-        elif len(self.tree.openElements) == 2 and\
-          self.tree.openElements[1].name != "body":
-            # This happens for framesets or something?
-            self.parser.parseError("expected-closing-tag-but-got-eof")
-        elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
-            # XXX This is not what the specification says. Not sure what to do
-            # here.
-            self.parser.parseError("eof-in-innerhtml")
-        # Betting ends.
 
     def processComment(self, token):
         # For most phases the following is correct. Where it's not it will be
@@ -318,7 +307,7 @@ class InitialPhase(Phase):
     # this.
     def processEOF(self):
         self.parser.parseError("expected-doctype-but-got-eof")
-        self.compatMode = "quirks"
+        self.parser.compatMode = "quirks"
         self.parser.phase = self.parser.phases["beforeHtml"]
         self.parser.phase.processEOF()
 
@@ -346,8 +335,9 @@ def processDoctype(self, token):
         if publicId != "":
             publicId = publicId.translate(asciiUpper2Lower)
 
-        if (not correct or token["name"] != "html"
-            or publicId in 
+
+        if ((not correct) or nameLower != "html"
+            or publicId in
             ("+//silmaril//dtd html pro v0r11 19970101//en",
              "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
              "-//as//dtd html 3.0 aswedit + extensions//en",
@@ -419,19 +409,18 @@ def processDoctype(self, token):
              "html")
             or (publicId in
                 ("-//w3c//dtd html 4.01 frameset//EN",
-                 "-//w3c//dtd html 4.01 transitional//EN") and 
-                systemId == None)
+                 "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)
             or (systemId != None and
-                systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
-            self.compatMode = "quirks"
+              systemId == 
+                "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
+            self.parser.compatMode = "quirks"
         elif (publicId in
-                ("-//w3c//dtd xhtml 1.0 frameset//EN",
-                 "-//w3c//dtd xhtml 1.0 transitional//EN")
+              ("-//w3c//dtd xhtml 1.0 frameset//EN",
+               "-//w3c//dtd xhtml 1.0 transitional//EN")
               or (publicId in
                   ("-//w3c//dtd html 4.01 frameset//EN",
-                   "-//w3c//dtd html 4.01 transitional//EN") and 
-                  systemId == None)):
-            self.compatMode = "limited quirks"
+                   "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)):
+            self.parser.compatMode = "limited quirks"
 
         self.parser.phase = self.parser.phases["beforeHtml"]
 
@@ -440,7 +429,7 @@ def processSpaceCharacters(self, token):
 
     def processCharacters(self, token):
         self.parser.parseError("expected-doctype-but-got-chars")
-        self.compatMode = "quirks"
+        self.parser.compatMode = "quirks"
         self.parser.phase = self.parser.phases["beforeHtml"]
         self.parser.phase.processCharacters(token)
 
@@ -595,7 +584,8 @@ def startTagMeta(self, token):
                 codec = inputstream.codecName(attributes["charset"])
                 self.parser.tokenizer.stream.changeEncoding(codec)
             elif "content" in attributes:
-                data = inputstream.EncodingBytes(attributes["content"])
+                data = inputstream.EncodingBytes(
+                    attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
                 parser = inputstream.ContentAttrParser(data)
                 codec = parser.parse()
                 self.parser.tokenizer.stream.changeEncoding(codec)
diff --git a/src/html5lib/inputstream.py b/src/html5lib/inputstream.py
@@ -1,6 +1,7 @@
 import codecs
 import re
 import types
+import sys
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
 from .constants import encodings, ReparseException
@@ -188,7 +189,8 @@ def openStream(self, source):
             import io
             stream = io.BytesIO(bytes(source))
 
-        if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
+        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
+            stream is sys.stdin):
             stream = BufferedStream(stream)
 
         return stream
@@ -452,6 +454,9 @@ class EncodingBytes(bytes):
     """Bytes-like object with an assosiated position and various extra methods
     If the position is ever greater than the string length then an exception is
     raised"""
+    def __new__(self, value):
+        return str.__new__(self, value)
+
     def __init__(self, value):
         self._position = -1
     
diff --git a/src/html5lib/sanitizer.py b/src/html5lib/sanitizer.py
@@ -152,7 +152,7 @@ def sanitize_token(self, token):
                             continue
                         val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                                unescape(attrs[attr])).lower()
-                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) or
+                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
                             (val_unescaped.split(':')[0] not in 
                              self.allowed_protocols)):
                             del attrs[attr]
diff --git a/src/html5lib/tokenizer.py b/src/html5lib/tokenizer.py
@@ -142,7 +142,7 @@ def consumeNumberEntity(self, isHex):
         # Certain characters get replaced with U+FFFD
         if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
          or (0x007F <= charAsInt <= 0x009F)
-         or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDDF)
+         or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
          or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
          or (0x10FFFF < charAsInt)):
             char = "\uFFFD"
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -142,7 +142,8 @@ def buildTestSuite():
                 def testFunc(self, innerHTML=innerHTML, input=input,
                     expected=expected, errors=errors, treeCls=treeCls): 
                     return self.runParserTest(innerHTML, input, expected, errors, treeCls)
-                setattr(TestCase, "test_%s_%d_%s" % (testName,index+1,treeName),
+                testFunc.__name__ = "test_%s_%d_%s" % (testName,index+1,treeName)
+                setattr(TestCase, testFunc.__name__,
                      testFunc)
 
     return unittest.TestLoader().loadTestsFromTestCase(TestCase)