Skip to content

Commit 2a4154e

Browse files
committed
Resync my tree with the trunk. Adds support for coercing trees to XML infosets, in particular for lxml (the tests still need to be wired up), and some speed improvements in the parser. Big apologies for the large check-in; there are some regressions in the liberal XML parser and the sanitizer that need to be fixed.
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401248
1 parent ddfddb9 commit 2a4154e

17 files changed

+548
-229
lines changed

parse.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#RELEASE remove
1313
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1414
#END RELEASE
15-
from html5lib import html5parser, liberalxmlparser
15+
from html5lib import html5parser, liberalxmlparser, sanitizer, tokenizer
1616
from html5lib import treebuilders, serializer, treewalkers
1717
from html5lib import constants
1818

@@ -46,17 +46,23 @@ def parse():
4646

4747
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
4848

49+
if opts.sanitize:
50+
tokenizer = sanitizer.HTMLSanitizer
51+
else:
52+
tokenizer = HTMLTokenizer
53+
4954
if opts.xml:
50-
p = liberalxmlparser.XHTMLParser(tree=treebuilder)
55+
p = liberalxmlparser.XHTMLParser(tree=treebuilder, tokenizer=tokenizer)
5156
else:
52-
p = html5parser.HTMLParser(tree=treebuilder)
57+
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)
5358

5459
if opts.fragment:
5560
parseMethod = p.parseFragment
5661
else:
5762
parseMethod = p.parse
5863

5964
if opts.profile:
65+
#XXX should import cProfile instead and use that
6066
import hotshot
6167
import hotshot.stats
6268
prof = hotshot.Profile('stats.prof')

src/html5lib/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,7 @@
1111
p = html5lib.HTMLParser()
1212
tree = p.parse(f)
1313
"""
14-
from html5parser import HTMLParser
14+
from html5parser import HTMLParser, parse
15+
from treebuilders import getTreeBuilder
16+
1517
from liberalxmlparser import XMLParser, XHTMLParser

src/html5lib/constants.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,6 @@
370370
spaceCharacters = frozenset((
371371
u"\t",
372372
u"\n",
373-
u"\u000B",
374373
u"\u000C",
375374
u" ",
376375
u"\r"
@@ -1088,5 +1087,16 @@
10881087
'windows936': 'gbk',
10891088
'x-x-big5': 'big5'}
10901089

1090+
tokenTypes = {
1091+
"Doctype":0,
1092+
"Characters":1,
1093+
"SpaceCharacters":2,
1094+
"StartTag":3,
1095+
"EndTag":4,
1096+
"EmptyTag":5,
1097+
"Comment":6,
1098+
"ParseError":7
1099+
}
1100+
10911101
class DataLossWarning(UserWarning):
10921102
pass

src/html5lib/filters/validator.py

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import iso639codes
2323
import rfc3987
2424
import rfc2046
25-
from html5lib.constants import E, spaceCharacters, digits
25+
from html5lib.constants import E, spaceCharacters, digits, tokenTypes
2626
from html5lib import tokenizer
2727
import gettext
2828
_ = gettext.gettext
@@ -267,8 +267,9 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
267267
self.IDsWeHaveKnownAndLoved = []
268268

269269
def __iter__(self):
270+
types = dict((v,k) for k,v in tokenTypes.iteritems())
270271
for token in _base.Filter.__iter__(self):
271-
fakeToken = {"type": token.get("type", "-"),
272+
fakeToken = {"type": types.get(token.get("type", "-"), "-"),
272273
"name": token.get("name", "-").capitalize()}
273274
method = getattr(self, "validate%(type)s%(name)s" % fakeToken, None)
274275
if method:
@@ -301,23 +302,23 @@ def validateStartTagInput(self, token):
301302
attrDict = dict([(name.lower(), value) for name, value in token.get("data", [])])
302303
inputType = attrDict.get("type", "text")
303304
if inputType not in inputTypeAllowedAttributeMap.keys():
304-
yield {"type": "ParseError",
305+
yield {"type": tokenTypes["ParseError"],
305306
"data": "unknown-input-type",
306307
"datavars": {"attrValue": inputType}}
307308
allowedAttributes = inputTypeAllowedAttributeMap.get(inputType, [])
308309
for attrName, attrValue in attrDict.items():
309310
if attrName not in allowedAttributeMap['input']:
310-
yield {"type": "ParseError",
311+
yield {"type": tokenTypes["ParseError"],
311312
"data": "unknown-attribute",
312313
"datavars": {"tagName": "input",
313314
"attributeName": attrName}}
314315
elif attrName not in allowedAttributes:
315-
yield {"type": "ParseError",
316+
yield {"type": tokenTypes["ParseError"],
316317
"data": "attribute-not-allowed-on-this-input-type",
317318
"datavars": {"attributeName": attrName,
318319
"inputType": inputType}}
319320
if attrName in inputTypeDeprecatedAttributeMap.get(inputType, []):
320-
yield {"type": "ParseError",
321+
yield {"type": tokenTypes["ParseError"],
321322
"data": "deprecated-attribute",
322323
"datavars": {"attributeName": attrName,
323324
"inputType": inputType}}
@@ -330,7 +331,7 @@ def checkUnknownStartTag(self, token):
330331
# check for recognized tag name
331332
name = token.get("name", "").lower()
332333
if name not in allowedAttributeMap.keys():
333-
yield {"type": "ParseError",
334+
yield {"type": tokenTypes["ParseError"],
334335
"data": "unknown-start-tag",
335336
"datavars": {"tagName": name}}
336337

@@ -342,7 +343,7 @@ def checkStartTagRequiredAttributes(self, token):
342343
in token.get("data", [])]
343344
for attrName in requiredAttributeMap[name]:
344345
if attrName not in attrsPresent:
345-
yield {"type": "ParseError",
346+
yield {"type": tokenTypes["ParseError"],
346347
"data": "missing-required-attribute",
347348
"datavars": {"tagName": name,
348349
"attributeName": attrName}}
@@ -353,7 +354,7 @@ def checkStartTagUnknownAttributes(self, token):
353354
allowedAttributes = globalAttributes | allowedAttributeMap.get(name, frozenset(()))
354355
for attrName, attrValue in token.get("data", []):
355356
if attrName.lower() not in allowedAttributes:
356-
yield {"type": "ParseError",
357+
yield {"type": tokenTypes["ParseError"],
357358
"data": "unknown-attribute",
358359
"datavars": {"tagName": name,
359360
"attributeName": attrName}}
@@ -365,40 +366,40 @@ def checkStartTagUnknownAttributes(self, token):
365366
# def checkURI(self, token, tagName, attrName, attrValue):
366367
# isValid, errorCode = rfc3987.isValidURI(attrValue)
367368
# if not isValid:
368-
# yield {"type": "ParseError",
369+
# yield {"type": tokenTypes["ParseError"],
369370
# "data": errorCode,
370371
# "datavars": {"tagName": tagName,
371372
# "attributeName": attrName}}
372-
# yield {"type": "ParseError",
373+
# yield {"type": tokenTypes["ParseError"],
373374
# "data": "invalid-attribute-value",
374375
# "datavars": {"tagName": tagName,
375376
# "attributeName": attrName}}
376377

377378
def checkIRI(self, token, tagName, attrName, attrValue):
378379
isValid, errorCode = rfc3987.isValidIRI(attrValue)
379380
if not isValid:
380-
yield {"type": "ParseError",
381+
yield {"type": tokenTypes["ParseError"],
381382
"data": errorCode,
382383
"datavars": {"tagName": tagName,
383384
"attributeName": attrName}}
384-
yield {"type": "ParseError",
385+
yield {"type": tokenTypes["ParseError"],
385386
"data": "invalid-attribute-value",
386387
"datavars": {"tagName": tagName,
387388
"attributeName": attrName}}
388389

389390
def checkID(self, token, tagName, attrName, attrValue):
390391
if not attrValue:
391-
yield {"type": "ParseError",
392+
yield {"type": tokenTypes["ParseError"],
392393
"data": "attribute-value-can-not-be-blank",
393394
"datavars": {"tagName": tagName,
394395
"attributeName": attrName}}
395396
for c in attrValue:
396397
if c in spaceCharacters:
397-
yield {"type": "ParseError",
398+
yield {"type": tokenTypes["ParseError"],
398399
"data": "space-in-id",
399400
"datavars": {"tagName": tagName,
400401
"attributeName": attrName}}
401-
yield {"type": "ParseError",
402+
yield {"type": tokenTypes["ParseError"],
402403
"data": "invalid-attribute-value",
403404
"datavars": {"tagName": tagName,
404405
"attributeName": attrName}}
@@ -427,7 +428,7 @@ def checkTokenList(self, tagName, attrName, attrValue):
427428
valueDict = {}
428429
for currentValue in valueList:
429430
if valueDict.has_key(currentValue):
430-
yield {"type": "ParseError",
431+
yield {"type": tokenTypes["ParseError"],
431432
"data": "duplicate-value-in-token-list",
432433
"datavars": {"tagName": tagName,
433434
"attributeName": attrName,
@@ -437,32 +438,32 @@ def checkTokenList(self, tagName, attrName, attrValue):
437438

438439
def checkEnumeratedValue(self, token, tagName, attrName, attrValue, enumeratedValues):
439440
if not attrValue and ('' not in enumeratedValues):
440-
yield {"type": "ParseError",
441+
yield {"type": tokenTypes["ParseError"],
441442
"data": "attribute-value-can-not-be-blank",
442443
"datavars": {"tagName": tagName,
443444
"attributeName": attrName}}
444445
return
445446
attrValue = attrValue.lower()
446447
if attrValue not in enumeratedValues:
447-
yield {"type": "ParseError",
448+
yield {"type": tokenTypes["ParseError"],
448449
"data": "invalid-enumerated-value",
449450
"datavars": {"tagName": tagName,
450451
"attributeName": attrName,
451452
"enumeratedValues": tuple(enumeratedValues)}}
452-
yield {"type": "ParseError",
453+
yield {"type": tokenTypes["ParseError"],
453454
"data": "invalid-attribute-value",
454455
"datavars": {"tagName": tagName,
455456
"attributeName": attrName}}
456457

457458
def checkBoolean(self, token, tagName, attrName, attrValue):
458459
enumeratedValues = frozenset((attrName, ''))
459460
if attrValue not in enumeratedValues:
460-
yield {"type": "ParseError",
461+
yield {"type": tokenTypes["ParseError"],
461462
"data": "invalid-boolean-value",
462463
"datavars": {"tagName": tagName,
463464
"attributeName": attrName,
464465
"enumeratedValues": tuple(enumeratedValues)}}
465-
yield {"type": "ParseError",
466+
yield {"type": tokenTypes["ParseError"],
466467
"data": "invalid-attribute-value",
467468
"datavars": {"tagName": tagName,
468469
"attributeName": attrName}}
@@ -471,7 +472,7 @@ def checkInteger(self, token, tagName, attrName, attrValue):
471472
sign = 1
472473
numberString = ''
473474
state = 'begin' # ('begin', 'initial-number', 'number', 'trailing-junk')
474-
error = {"type": "ParseError",
475+
error = {"type": tokenTypes["ParseError"],
475476
"data": "invalid-integer-value",
476477
"datavars": {"tagName": tagName,
477478
"attributeName": attrName,
@@ -503,7 +504,7 @@ def checkInteger(self, token, tagName, attrName, attrValue):
503504
elif state == 'trailing-junk':
504505
pass
505506
if not numberString:
506-
yield {"type": "ParseError",
507+
yield {"type": tokenTypes["ParseError"],
507508
"data": "attribute-value-can-not-be-blank",
508509
"datavars": {"tagName": tagName,
509510
"attributeName": attrName}}
@@ -517,15 +518,15 @@ def checkBrowsingContext(self, token, tagName, attrName, attrValue):
517518
if attrValue[0] != '_': return
518519
attrValue = attrValue.lower()
519520
if attrValue in frozenset(('_self', '_parent', '_top', '_blank')): return
520-
yield {"type": "ParseError",
521+
yield {"type": tokenTypes["ParseError"],
521522
"data": "invalid-browsing-context",
522523
"datavars": {"tagName": tagName,
523524
"attributeName": attrName}}
524525

525526
def checkLangCode(self, token, tagName, attrName, attrValue):
526527
if not attrValue: return # blank is OK
527528
if not iso639codes.isValidLangCode(attrValue):
528-
yield {"type": "ParseError",
529+
yield {"type": tokenTypes["ParseError"],
529530
"data": "invalid-lang-code",
530531
"datavars": {"tagName": tagName,
531532
"attributeName": attrName,
@@ -534,13 +535,13 @@ def checkLangCode(self, token, tagName, attrName, attrValue):
534535
def checkMIMEType(self, token, tagName, attrName, attrValue):
535536
# XXX needs tests
536537
if not attrValue:
537-
yield {"type": "ParseError",
538+
yield {"type": tokenTypes["ParseError"],
538539
"data": "attribute-value-can-not-be-blank",
539540
"datavars": {"tagName": tagName,
540541
"attributeName": attrName}}
541542

542543
if not rfc2046.isValidMIMEType(attrValue):
543-
yield {"type": "ParseError",
544+
yield {"type": tokenTypes["ParseError"],
544545
"data": "invalid-mime-type",
545546
"datavars": {"tagName": tagName,
546547
"attributeName": attrName,
@@ -556,7 +557,7 @@ def checkLinkRelation(self, token, tagName, attrName, attrValue):
556557
allowedValues = (tagName == 'link') and linkRelValues or aRelValues
557558
for currentValue in valueList:
558559
if currentValue not in allowedValues:
559-
yield {"type": "ParseError",
560+
yield {"type": tokenTypes["ParseError"],
560561
"data": "invalid-rel",
561562
"datavars": {"tagName": tagName,
562563
"attributeName": attrName}}
@@ -593,7 +594,7 @@ def checkAttributeValues(self, token):
593594
def validateAttributeValueClass(self, token, tagName, attrName, attrValue):
594595
for t in self.checkTokenList(tagName, attrName, attrValue) or []:
595596
yield t
596-
yield {"type": "ParseError",
597+
yield {"type": tokenTypes["ParseError"],
597598
"data": "invalid-attribute-value",
598599
"datavars": {"tagName": tagName,
599600
"attributeName": attrName}}
@@ -623,7 +624,7 @@ def validateAttributeValueId(self, token, tagName, attrName, attrValue):
623624
for t in self.checkID(token, tagName, attrName, attrValue) or []: yield t
624625
if not attrValue: return
625626
if attrValue in self.IDsWeHaveKnownAndLoved:
626-
yield {"type": "ParseError",
627+
yield {"type": tokenTypes["ParseError"],
627628
"data": "duplicate-id",
628629
"datavars": {"tagName": tagName}}
629630
self.IDsWeHaveKnownAndLoved.append(attrValue)
@@ -641,7 +642,7 @@ def validateAttributeValueTemplate(self, token, tagName, attrName, attrValue):
641642

642643
def validateAttributeValueHtmlXmlns(self, token, tagName, attrName, attrValue):
643644
if attrValue != "http://www.w3.org/1999/xhtml":
644-
yield {"type": "ParseError",
645+
yield {"type": tokenTypes["ParseError"],
645646
"data": "invalid-root-namespace",
646647
"datavars": {"tagName": tagName,
647648
"attributeName": attrName}}
@@ -699,7 +700,7 @@ def eof(self):
699700
# hooray for obscure side effects!
700701
attrValue = attrsDict.get("contextmenu", "")
701702
if attrValue and (attrValue not in self.IDsWeHaveKnownAndLoved):
702-
yield {"type": "ParseError",
703+
yield {"type": tokenTypes["ParseError"],
703704
"data": "id-does-not-exist",
704705
"datavars": {"tagName": tagName,
705706
"attributeName": "contextmenu",
@@ -710,6 +711,6 @@ def eof(self):
710711
if not id: continue
711712
if id == attrValue:
712713
if refToken.get("name", "").lower() != "menu":
713-
yield {"type": "ParseError",
714+
yield {"type": tokenTypes["ParseError"],
714715
"data": "contextmenu-must-point-to-menu"}
715716
break

0 commit comments

Comments
 (0)