Skip to content

Commit a83fbe4

Browse files
author
Mark Pilgrim
committed
added support for validating a whole bunch of stuff that I can't remember right at the moment
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40990
1 parent f37d906 commit a83fbe4

File tree

3 files changed

+184
-43
lines changed

3 files changed

+184
-43
lines changed

src/html5lib/filters/rfc2046.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# adapted from feedvalidator, original copyright license is
2+
#
3+
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
import re
24+
25+
# Matches a MIME media type as "type/subtype" followed by zero or more
# ";attribute=value" parameters, where each value is either a bare token or
# a double-quoted string.
#
# The pattern is written as raw strings so every escape reaches the regex
# engine intact:
#   * inside the character classes, \\ excludes the backslash itself, which
#     RFC 2045 lists among the "tspecials" that may not appear in a token;
#   * inside a quoted value, \\" matches a backslash-escaped quote, so an
#     unescaped quote terminates the value.
# There is no leading ^ because the pattern is only used with match(),
# which anchors at the start of the string.
mime_re = re.compile(
    r'[^\s()<>,;:\\"/[\]?=]+/[^\s()<>,;:\\"/[\]?=]+'
    r'(\s*;\s*[^\s()<>,;:\\"/[\]?=]+='
    r'("(\\"|[^"])*"|[^\s()<>,;:\\"/[\]?=]+))*$')


def isValidMIMEType(value):
    """Return True if value is a syntactically valid MIME type.

    value is matched in full against mime_re, e.g. "text/html" or
    'text/html; charset="utf-8"'.  Only the syntax is checked; the type
    is not looked up in any registry.
    """
    return mime_re.match(value) is not None
29+

src/html5lib/filters/rfc3987.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
"urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
3232
"iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
3333
]
34+
allowed_schemes = iana_schemes + ['javascript']
3435

3536
rfc2396_re = re.compile("([a-zA-Z][0-9a-zA-Z+\\-\\.]*:)?/{0,2}" +
3637
"[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]*$")
@@ -62,8 +63,8 @@ def isValidURI(value, uriPattern=rfc2396_re):
6263
elif scheme in ['http','ftp']:
6364
if not re.match('^\w+://[^/].*',value):
6465
return False, "invalid-http-or-ftp-uri"
65-
elif value.find(':')>=0 and scheme.isalpha() and scheme not in iana_schemes:
66-
return False, "unregistered-scheme"
66+
elif value.find(':')>=0 and scheme.isalpha() and scheme not in allowed_schemes:
67+
return False, "invalid-scheme"
6768
return True, ""
6869

6970
def isValidIRI(value):

src/html5lib/filters/validator.py

Lines changed: 152 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import _base
2222
import iso639codes
2323
import rfc3987
24+
import rfc2046
2425
from html5lib.constants import E, spaceCharacters, digits
2526
from html5lib import tokenizer
2627
import gettext
@@ -65,6 +66,24 @@
6566
_(u"Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
6667
"invalid-browsing-context":
6768
_(u"Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)s' attribute on <%(tagName)s>."),
69+
"invalid-tag-uri":
70+
_(u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
71+
"invalid-urn":
72+
_(u"Invalid URN: '%(attributeName)s' attribute on <%(tagName)s>."),
73+
"invalid-uri-char":
74+
_(u"Illegal character in URI: '%(attributeName)s' attribute on <%(tagName)s>."),
75+
"uri-not-iri":
76+
_(u"Expected a URI but found an IRI: '%(attributeName)s' attribute on <%(tagName)s>."),
77+
"invalid-uri":
78+
_(u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
79+
"invalid-http-or-ftp-uri":
80+
_(u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>."),
81+
"invalid-scheme":
82+
_(u"Unregistered URI scheme: '%(attributeName)s' attribute on <%(tagName)s>."),
83+
"invalid-rel":
84+
_(u"Invalid link relation: '%(attributeName)s' attribute on <%(tagName)s>."),
85+
"invalid-mime-type":
86+
_(u"Invalid MIME type: '%(attributeName)s' attribute on <%(tagName)s>."),
6887
})
6988

7089
globalAttributes = frozenset(('class', 'contenteditable', 'contextmenu', 'dir',
@@ -236,6 +255,9 @@
236255
'password': frozenset(('size',))
237256
}
238257

258+
# Link relation keywords accepted in the rel attribute of both <link> and <a>.
_commonRelValues = (
    'alternate', 'archive', 'archives', 'author', 'contact', 'feed',
    'first', 'begin', 'start', 'help', 'index', 'top', 'contents', 'toc',
    'last', 'end', 'license', 'copyright', 'next', 'prev', 'previous',
    'search', 'sidebar', 'tag', 'up',
)

# Keywords accepted on <link>: the common set plus the <link>-only ones.
linkRelValues = frozenset(
    _commonRelValues + ('icon', 'pingback', 'prefetch', 'stylesheet'))

# Keywords accepted on other elements (used for <a>): the common set plus
# the hyperlink-only ones.
aRelValues = frozenset(
    _commonRelValues + ('bookmark', 'external', 'nofollow'))
260+
239261
class HTMLConformanceChecker(_base.Filter):
240262
def __init__(self, stream, encoding, parseMeta, **kwargs):
241263
_base.Filter.__init__(self, tokenizer.HTMLTokenizer(
@@ -340,17 +362,17 @@ def checkStartTagUnknownAttributes(self, token):
340362
# Attribute validation helpers
341363
##########################################################################
342364

343-
def checkURI(self, token, tagName, attrName, attrValue):
344-
isValid, errorCode = rfc3987.isValidURI(attrValue)
345-
if not isValid:
346-
yield {"type": "ParseError",
347-
"data": errorCode,
348-
"datavars": {"tagName": tagName,
349-
"attributeName": attrName}}
350-
yield {"type": "ParseError",
351-
"data": "invalid-attribute-value",
352-
"datavars": {"tagName": tagName,
353-
"attributeName": attrName}}
365+
# def checkURI(self, token, tagName, attrName, attrValue):
366+
# isValid, errorCode = rfc3987.isValidURI(attrValue)
367+
# if not isValid:
368+
# yield {"type": "ParseError",
369+
# "data": errorCode,
370+
# "datavars": {"tagName": tagName,
371+
# "attributeName": attrName}}
372+
# yield {"type": "ParseError",
373+
# "data": "invalid-attribute-value",
374+
# "datavars": {"tagName": tagName,
375+
# "attributeName": attrName}}
354376

355377
def checkIRI(self, token, tagName, attrName, attrValue):
356378
isValid, errorCode = rfc3987.isValidIRI(attrValue)
@@ -382,26 +404,36 @@ def checkID(self, token, tagName, attrName, attrValue):
382404
"attributeName": attrName}}
383405
break
384406

385-
def checkTokenList(self, tagName, attrName, attrValue):
386-
# The "token" in the method name refers to tokens in an attribute value
387-
# i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of
388-
# but the "token" parameter refers to the token generated from
389-
# HTMLTokenizer. Sorry for the confusion.
407+
def parseTokenList(self, value):
390408
valueList = []
391409
currentValue = ''
392-
for c in attrValue + ' ':
410+
for c in value + ' ':
393411
if c in spaceCharacters:
394412
if currentValue:
395-
if currentValue in valueList:
396-
yield {"type": "ParseError",
397-
"data": "duplicate-value-in-token-list",
398-
"datavars": {"tagName": tagName,
399-
"attributeName": attrName,
400-
"attributeValue": currentValue}}
401413
valueList.append(currentValue)
402414
currentValue = ''
403415
else:
404416
currentValue += c
417+
if currentValue:
418+
valueList.append(currentValue)
419+
return valueList
420+
421+
def checkTokenList(self, tagName, attrName, attrValue):
422+
# The "token" in the method name refers to tokens in an attribute value
423+
# i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of
424+
# but the "token" parameter refers to the token generated from
425+
# HTMLTokenizer. Sorry for the confusion.
426+
valueList = self.parseTokenList(attrValue)
427+
valueDict = {}
428+
for currentValue in valueList:
429+
if valueDict.has_key(currentValue):
430+
yield {"type": "ParseError",
431+
"data": "duplicate-value-in-token-list",
432+
"datavars": {"tagName": tagName,
433+
"attributeName": attrName,
434+
"attributeValue": currentValue}}
435+
break
436+
valueDict[currentValue] = 1
405437

406438
def checkEnumeratedValue(self, token, tagName, attrName, attrValue, enumeratedValues):
407439
if not attrValue and ('' not in enumeratedValues):
@@ -422,7 +454,7 @@ def checkEnumeratedValue(self, token, tagName, attrName, attrValue, enumeratedVa
422454
"datavars": {"tagName": tagName,
423455
"attributeName": attrName}}
424456

425-
def checkBooleanValue(self, token, tagName, attrName, attrValue):
457+
def checkBoolean(self, token, tagName, attrName, attrValue):
426458
enumeratedValues = frozenset((attrName, ''))
427459
if attrValue not in enumeratedValues:
428460
yield {"type": "ParseError",
@@ -435,7 +467,7 @@ def checkBooleanValue(self, token, tagName, attrName, attrValue):
435467
"datavars": {"tagName": tagName,
436468
"attributeName": attrName}}
437469

438-
def checkIntegerValue(self, token, tagName, attrName, attrValue):
470+
def checkInteger(self, token, tagName, attrName, attrValue):
439471
sign = 1
440472
numberString = ''
441473
state = 'begin' # ('begin', 'initial-number', 'number', 'trailing-junk')
@@ -476,6 +508,10 @@ def checkIntegerValue(self, token, tagName, attrName, attrValue):
476508
"datavars": {"tagName": tagName,
477509
"attributeName": attrName}}
478510

511+
def checkFloatingPointNumber(self, token, tagName, attrName, attrValue):
    """Placeholder for floating point number validation.

    XXX not implemented yet: no check is performed, no error is produced
    and None is returned.
    """
    return None
514+
479515
def checkBrowsingContext(self, token, tagName, attrName, attrValue):
480516
if not attrValue: return
481517
if attrValue[0] != '_': return
@@ -486,6 +522,56 @@ def checkBrowsingContext(self, token, tagName, attrName, attrValue):
486522
"datavars": {"tagName": tagName,
487523
"attributeName": attrName}}
488524

525+
def checkLangCode(self, token, tagName, attrName, attrValue):
    """Yield an "invalid-lang-code" ParseError for a bad language code.

    A blank attrValue is accepted without being looked up.  Any other
    value is passed to iso639codes.isValidLangCode(), and an error is
    produced when that returns a false value.
    """
    if attrValue and not iso639codes.isValidLangCode(attrValue):
        yield {"type": "ParseError",
               "data": "invalid-lang-code",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName,
                            "attributeValue": attrValue}}
533+
534+
def checkMIMEType(self, token, tagName, attrName, attrValue):
    """Yield a ParseError if attrValue is blank or not a valid MIME type.

    A blank value produces a single "attribute-value-can-not-be-blank"
    error.  A non-blank value is passed to rfc2046.isValidMIMEType(), and
    an "invalid-mime-type" error is produced when that returns a false
    value.
    """
    # XXX needs tests
    if not attrValue:
        yield {"type": "ParseError",
               "data": "attribute-value-can-not-be-blank",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
        # A blank value has already been reported; do not also report it
        # as a malformed MIME type.
        return

    if not rfc2046.isValidMIMEType(attrValue):
        yield {"type": "ParseError",
               "data": "invalid-mime-type",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName,
                            "attributeValue": attrValue}}
548+
549+
def checkMediaQuery(self, token, tagName, attrName, attrValue):
    """Placeholder for media query validation.

    XXX not implemented yet: no check is performed, no error is produced
    and None is returned.
    """
    return None
552+
553+
def checkLinkRelation(self, token, tagName, attrName, attrValue):
    """Yield ParseErrors for a rel attribute value.

    Errors from self.checkTokenList() are passed through first.  Then
    each token from self.parseTokenList(attrValue) is checked against
    linkRelValues when tagName is 'link', or against aRelValues for any
    other tag, and one "invalid-rel" error is produced per token that is
    not in the chosen set.
    """
    for problem in self.checkTokenList(tagName, attrName, attrValue) or []:
        yield problem

    if tagName == 'link':
        permittedValues = linkRelValues
    else:
        permittedValues = aRelValues

    for relation in self.parseTokenList(attrValue):
        if relation in permittedValues:
            continue
        yield {"type": "ParseError",
               "data": "invalid-rel",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
563+
564+
def checkDateTime(self, token, tagName, attrName, attrValue):
    """Placeholder for date/time validation.

    XXX not implemented yet: no check is performed, no error is produced
    and None is returned.
    """
    # Sketch of the intended state machine, starting in state 'begin':
    # for c in attrValue:
    #     if state == 'begin':
    #         if c in spaceCharacters:
    #             continue
    #         elif c in digits:
    #             state = ...
    return None
573+
574+
489575
##########################################################################
490576
# Attribute validation
491577
##########################################################################
@@ -521,17 +607,8 @@ def validateAttributeValueDir(self, token, tagName, attrName, attrValue):
521607
def validateAttributeValueDraggable(self, token, tagName, attrName, attrValue):
522608
for t in self.checkEnumeratedValue(token, tagName, attrName, attrValue, frozenset(('true', 'false'))) or []: yield t
523609

524-
def validateAttributeValueIrrelevant(self, token, tagName, attrName, attrValue):
525-
for t in self.checkBooleanValue(token, tagName, attrName, attrValue) or []: yield t
526-
527-
def validateAttributeValueLang(self, token, tagName, attrName, attrValue):
528-
if not attrValue: return # blank is OK
529-
if not iso639codes.isValidLangCode(attrValue):
530-
yield {"type": "ParseError",
531-
"data": "invalid-lang-code",
532-
"datavars": {"tagName": tagName,
533-
"attributeName": attrName,
534-
"attributeValue": attrValue}}
610+
validateAttributeValueIrrelevant = checkBoolean
611+
validateAttributeValueLang = checkLangCode
535612

536613
def validateAttributeValueContextmenu(self, token, tagName, attrName, attrValue):
537614
for t in self.checkID(token, tagName, attrName, attrValue) or []: yield t
@@ -552,7 +629,7 @@ def validateAttributeValueId(self, token, tagName, attrName, attrValue):
552629
self.IDsWeHaveKnownAndLoved.append(attrValue)
553630
self.thingsThatDefineAnID.append(token)
554631

555-
validateAttributeValueTabindex = checkIntegerValue
632+
validateAttributeValueTabindex = checkInteger
556633

557634
def validateAttributeValueRef(self, token, tagName, attrName, attrValue):
558635
# XXX
@@ -569,13 +646,47 @@ def validateAttributeValueHtmlXmlns(self, token, tagName, attrName, attrValue):
569646
"datavars": {"tagName": tagName,
570647
"attributeName": attrName}}
571648

572-
def validateAttributeValueBaseHref(self, token, tagName, attrName, attrValue):
573-
# XXX
574-
pass
575-
576649
validateAttributeValueBaseHref = checkIRI
577650
validateAttributeValueBaseTarget = checkBrowsingContext
578651
validateAttributeValueLinkHref = checkIRI
652+
validateAttributeValueLinkRel = checkLinkRelation
653+
validateAttributeValueLinkMedia = checkMediaQuery
654+
validateAttributeValueLinkHreflang = checkLangCode
655+
validateAttributeValueLinkType = checkMIMEType
656+
# XXX <meta> attributes
657+
validateAttributeValueStyleMedia = checkMediaQuery
658+
validateAttributeValueStyleType = checkMIMEType
659+
validateAttributeValueStyleScoped = checkBoolean
660+
validateAttributeValueBlockquoteCite = checkIRI
661+
validateAttributeValueOlStart = checkInteger
662+
validateAttributeValueLiValue = checkInteger
663+
# XXX need tests from here on
664+
validateAttributeValueAHref = checkIRI
665+
validateAttributeValueATarget = checkBrowsingContext
666+
667+
def validateAttributeValueAPing(self, token, tagName, attrName, attrValue):
668+
valueList = self.parseTokenList(attrValue)
669+
for currentValue in valueList:
670+
for t in self.checkIRI(token, tagName, attrName, attrValue) or []: yield t
671+
672+
validateAttributeValueARel = checkLinkRelation
673+
validateAttributeValueAMedia = checkMediaQuery
674+
validateAttributeValueAHreflang = checkLangCode
675+
validateAttributeValueAType = checkMIMEType
676+
validateAttributeValueQCite = checkIRI
677+
validateAttributeValueTimeDatetime = checkDateTime
678+
validateAttributeValueMeterValue = checkFloatingPointNumber
679+
validateAttributeValueMeterMin = checkFloatingPointNumber
680+
validateAttributeValueMeterLow = checkFloatingPointNumber
681+
validateAttributeValueMeterHigh = checkFloatingPointNumber
682+
validateAttributeValueMeterMax = checkFloatingPointNumber
683+
validateAttributeValueMeterOptimum = checkFloatingPointNumber
684+
validateAttributeValueProgressValue = checkFloatingPointNumber
685+
validateAttributeValueProgressMax = checkFloatingPointNumber
686+
validateAttributeValueInsCite = checkIRI
687+
validateAttributeValueInsDatetime = checkDateTime
688+
validateAttributeValueDelCite = checkIRI
689+
validateAttributeValueDelDatetime = checkDateTime
579690

580691
##########################################################################
581692
# Whole document validation (IDs, etc.)

0 commit comments

Comments
 (0)