Skip to content

Commit babe4a3

Browse files
committed
Attempt at merging svgmathml branch to the default branch
--HG-- branch : svgmathml rename : python/parse.py => python3/parse.py rename : python/src/html5lib/__init__.py => python3/src/html5lib/__init__.py rename : python/src/html5lib/constants.py => python3/src/html5lib/constants.py rename : python/src/html5lib/filters/optionaltags.py => python3/src/html5lib/filters/optionaltags.py rename : python/src/html5lib/html5parser.py => python3/src/html5lib/html5parser.py rename : python/src/html5lib/inputstream.py => python3/src/html5lib/inputstream.py rename : python/src/html5lib/sanitizer.py => python3/src/html5lib/sanitizer.py rename : python/src/html5lib/serializer/__init__.py => python3/src/html5lib/serializer/__init__.py rename : python/src/html5lib/tokenizer.py => python3/src/html5lib/tokenizer.py rename : python/src/html5lib/treebuilders/etree_lxml.py => python3/src/html5lib/treebuilders/etree_lxml.py rename : python/src/html5lib/treebuilders/simpletree.py => python3/src/html5lib/treebuilders/simpletree.py rename : python/tests/test_encoding.py => python3/tests/test_encoding.py rename : python/tests/test_parser.py => python3/tests/test_parser.py rename : python/tests/test_tokenizer.py => python3/tests/test_tokenizer.py
1 parent 768ba79 commit babe4a3

File tree

8 files changed

+35
-36
lines changed

8 files changed

+35
-36
lines changed

parse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def parse():
5757
else:
5858
tokenizer = HTMLTokenizer
5959

60-
if opts.xml:
60+
if opts.liberalxml:
6161
p = liberalxmlparser.XHTMLParser(tree=treebuilder, tokenizer=tokenizer)
6262
else:
6363
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)

src/html5lib/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,6 @@
10701070
'utf16': 'utf-16',
10711071
'utf16be': 'utf-16-be',
10721072
'utf16le': 'utf-16-le',
1073-
'utf7': 'utf-7',
10741073
'utf8': 'utf-8',
10751074
'windows1250': 'cp1250',
10761075
'windows1251': 'cp1251',

src/html5lib/filters/optionaltags.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ def is_optional_start(self, tagname, previous, next):
3131
elif tagname == 'head':
3232
# A head element's start tag may be omitted if the first thing
3333
# inside the head element is an element.
34-
return type == "StartTag"
34+
# XXX: we also omit the start tag if the head element is empty
35+
if type in ("StartTag", "EmptyTag"):
36+
return True
37+
elif type == "EndTag":
38+
return next["name"] == "head"
3539
elif tagname == 'body':
3640
# A body element's start tag may be omitted if the first thing
3741
# inside the body element is not a space character or a comment,
@@ -52,7 +56,7 @@ def is_optional_start(self, tagname, previous, next):
5256
# inside the colgroup element is a col element, and if the element
5357
# is not immediately preceded by another colgroup element whose
5458
# end tag has been omitted.
55-
if type == "StartTag":
59+
if type in ("StartTag", "EmptyTag"):
5660
# XXX: we do not look at the preceding event, so instead we never
5761
# omit the colgroup element's end tag when it is immediately
5862
# followed by another colgroup element. See is_optional_end.
@@ -114,7 +118,7 @@ def is_optional_end(self, tagname, next):
114118
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
115119
# nav, ol, p, pre, section, table, or ul, element, or if
116120
# there is no more content in the parent element.
117-
if type == "StartTag":
121+
if type in ("StartTag", "EmptyTag"):
118122
return next["name"] in ('address', 'article', 'aside', \
119123
'blockquote', 'datagrid', 'dialog', 'dir', 'div', \
120124
'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', \

src/html5lib/html5parser.py

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ def _parse(self, stream, innerHTML=False, container="div",
108108
# We only seem to have InBodyPhase testcases where the following is
109109
# relevant ... need others too
110110
self.lastPhase = None
111-
112111
self.beforeRCDataPhase = None
113112

114113
CharactersToken = tokenTypes["Characters"]
@@ -120,6 +119,8 @@ def _parse(self, stream, innerHTML=False, container="div",
120119

121120

122121
for token in self.normalizedTokens():
122+
#print self.phase.__class__.__name__
123+
#print token
123124
type = token["type"]
124125
if type == CharactersToken:
125126
self.phase.processCharacters(token)
@@ -271,18 +272,6 @@ def __init__(self, parser, tree):
271272

272273
def processEOF(self):
273274
raise NotImplementedError
274-
self.tree.generateImpliedEndTags()
275-
if len(self.tree.openElements) > 2:
276-
self.parser.parseError("expected-closing-tag-but-got-eof")
277-
elif len(self.tree.openElements) == 2 and\
278-
self.tree.openElements[1].name != "body":
279-
# This happens for framesets or something?
280-
self.parser.parseError("expected-closing-tag-but-got-eof")
281-
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
282-
# XXX This is not what the specification says. Not sure what to do
283-
# here.
284-
self.parser.parseError("eof-in-innerhtml")
285-
# Betting ends.
286275

287276
def processComment(self, token):
288277
# For most phases the following is correct. Where it's not it will be
@@ -318,7 +307,7 @@ class InitialPhase(Phase):
318307
# this.
319308
def processEOF(self):
320309
self.parser.parseError("expected-doctype-but-got-eof")
321-
self.compatMode = "quirks"
310+
self.parser.compatMode = "quirks"
322311
self.parser.phase = self.parser.phases["beforeHtml"]
323312
self.parser.phase.processEOF()
324313

@@ -346,8 +335,9 @@ def processDoctype(self, token):
346335
if publicId != "":
347336
publicId = publicId.translate(asciiUpper2Lower)
348337

349-
if (not correct or token["name"] != "html"
350-
or publicId in
338+
339+
if ((not correct) or nameLower != "html"
340+
or publicId in
351341
("+//silmaril//dtd html pro v0r11 19970101//en",
352342
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
353343
"-//as//dtd html 3.0 aswedit + extensions//en",
@@ -419,19 +409,18 @@ def processDoctype(self, token):
419409
"html")
420410
or (publicId in
421411
("-//w3c//dtd html 4.01 frameset//EN",
422-
"-//w3c//dtd html 4.01 transitional//EN") and
423-
systemId == None)
412+
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)
424413
or (systemId != None and
425-
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
426-
self.compatMode = "quirks"
414+
systemId ==
415+
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
416+
self.parser.compatMode = "quirks"
427417
elif (publicId in
428-
("-//w3c//dtd xhtml 1.0 frameset//EN",
429-
"-//w3c//dtd xhtml 1.0 transitional//EN")
418+
("-//w3c//dtd xhtml 1.0 frameset//EN",
419+
"-//w3c//dtd xhtml 1.0 transitional//EN")
430420
or (publicId in
431421
("-//w3c//dtd html 4.01 frameset//EN",
432-
"-//w3c//dtd html 4.01 transitional//EN") and
433-
systemId == None)):
434-
self.compatMode = "limited quirks"
422+
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)):
423+
self.parser.compatMode = "limited quirks"
435424

436425
self.parser.phase = self.parser.phases["beforeHtml"]
437426

@@ -440,7 +429,7 @@ def processSpaceCharacters(self, token):
440429

441430
def processCharacters(self, token):
442431
self.parser.parseError("expected-doctype-but-got-chars")
443-
self.compatMode = "quirks"
432+
self.parser.compatMode = "quirks"
444433
self.parser.phase = self.parser.phases["beforeHtml"]
445434
self.parser.phase.processCharacters(token)
446435

@@ -595,7 +584,8 @@ def startTagMeta(self, token):
595584
codec = inputstream.codecName(attributes["charset"])
596585
self.parser.tokenizer.stream.changeEncoding(codec)
597586
elif "content" in attributes:
598-
data = inputstream.EncodingBytes(attributes["content"])
587+
data = inputstream.EncodingBytes(
588+
attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
599589
parser = inputstream.ContentAttrParser(data)
600590
codec = parser.parse()
601591
self.parser.tokenizer.stream.changeEncoding(codec)

src/html5lib/inputstream.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import codecs
22
import re
33
import types
4+
import sys
45

56
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
67
from .constants import encodings, ReparseException
@@ -188,7 +189,8 @@ def openStream(self, source):
188189
import io
189190
stream = io.BytesIO(bytes(source))
190191

191-
if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
192+
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
193+
stream is sys.stdin):
192194
stream = BufferedStream(stream)
193195

194196
return stream
@@ -452,6 +454,9 @@ class EncodingBytes(bytes):
452454
"""Bytes-like object with an assosiated position and various extra methods
453455
If the position is ever greater than the string length then an exception is
454456
raised"""
457+
def __new__(self, value):
458+
return str.__new__(self, value)
459+
455460
def __init__(self, value):
456461
self._position = -1
457462

src/html5lib/sanitizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def sanitize_token(self, token):
152152
continue
153153
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
154154
unescape(attrs[attr])).lower()
155-
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) or
155+
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
156156
(val_unescaped.split(':')[0] not in
157157
self.allowed_protocols)):
158158
del attrs[attr]

src/html5lib/tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def consumeNumberEntity(self, isHex):
142142
# Certain characters get replaced with U+FFFD
143143
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
144144
or (0x007F <= charAsInt <= 0x009F)
145-
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDDF)
145+
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
146146
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
147147
or (0x10FFFF < charAsInt)):
148148
char = "\uFFFD"

tests/test_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ def buildTestSuite():
142142
def testFunc(self, innerHTML=innerHTML, input=input,
143143
expected=expected, errors=errors, treeCls=treeCls):
144144
return self.runParserTest(innerHTML, input, expected, errors, treeCls)
145-
setattr(TestCase, "test_%s_%d_%s" % (testName,index+1,treeName),
145+
testFunc.__name__ = "test_%s_%d_%s" % (testName,index+1,treeName)
146+
setattr(TestCase, testFunc.__name__,
146147
testFunc)
147148

148149
return unittest.TestLoader().loadTestsFromTestCase(TestCase)

0 commit comments

Comments
 (0)