21
21
import _base
22
22
import iso639codes
23
23
import rfc3987
24
+ import rfc2046
24
25
from html5lib .constants import E , spaceCharacters , digits
25
26
from html5lib import tokenizer
26
27
import gettext
65
66
_ (u"Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted." ),
66
67
"invalid-browsing-context" :
67
68
_ (u"Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)s' attribute on <%(tagName)s>." ),
69
+ "invalid-tag-uri" :
70
+ _ (u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
71
+ "invalid-urn" :
72
+ _ (u"Invalid URN: '%(attributeName)s' attribute on <%(tagName)s>." ),
73
+ "invalid-uri-char" :
74
+ _ (u"Illegal character in URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
75
+ "uri-not-iri" :
76
+ _ (u"Expected a URI but found an IRI: '%(attributeName)s' attribute on <%(tagName)s>." ),
77
+ "invalid-uri" :
78
+ _ (u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
79
+ "invalid-http-or-ftp-uri" :
80
+ _ (u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
81
+ "invalid-scheme" :
82
+ _ (u"Unregistered URI scheme: '%(attributeName)s' attribute on <%(tagName)s>." ),
83
+ "invalid-rel" :
84
+ _ (u"Invalid link relation: '%(attributeName)s' attribute on <%(tagName)s>." ),
85
+ "invalid-mime-type" :
86
+ _ (u"Invalid MIME type: '%(attributeName)s' attribute on <%(tagName)s>." ),
68
87
})
69
88
70
89
globalAttributes = frozenset (('class' , 'contenteditable' , 'contextmenu' , 'dir' ,
236
255
'password' : frozenset (('size' ,))
237
256
}
238
257
258
# Link relation keywords that checkLinkRelation accepts when the tag being
# validated is 'link'.
linkRelValues = frozenset((
    'alternate', 'archive', 'archives', 'author', 'contact', 'feed',
    'first', 'begin', 'start', 'help', 'icon', 'index', 'top', 'contents',
    'toc', 'last', 'end', 'license', 'copyright', 'next', 'pingback',
    'prefetch', 'prev', 'previous', 'search', 'stylesheet', 'sidebar',
    'tag', 'up',
))

# Link relation keywords that checkLinkRelation accepts for any tag other
# than 'link' (in this file that is the rel attribute of <a>).
aRelValues = frozenset((
    'alternate', 'archive', 'archives', 'author', 'contact', 'feed',
    'first', 'begin', 'start', 'help', 'index', 'top', 'contents', 'toc',
    'last', 'end', 'license', 'copyright', 'next', 'prev', 'previous',
    'search', 'sidebar', 'tag', 'up', 'bookmark', 'external', 'nofollow',
))
260
+
239
261
class HTMLConformanceChecker (_base .Filter ):
240
262
def __init__ (self , stream , encoding , parseMeta , ** kwargs ):
241
263
_base .Filter .__init__ (self , tokenizer .HTMLTokenizer (
@@ -340,17 +362,17 @@ def checkStartTagUnknownAttributes(self, token):
340
362
# Attribute validation helpers
341
363
##########################################################################
342
364
343
- def checkURI (self , token , tagName , attrName , attrValue ):
344
- isValid , errorCode = rfc3987 .isValidURI (attrValue )
345
- if not isValid :
346
- yield {"type" : "ParseError" ,
347
- "data" : errorCode ,
348
- "datavars" : {"tagName" : tagName ,
349
- "attributeName" : attrName }}
350
- yield {"type" : "ParseError" ,
351
- "data" : "invalid-attribute-value" ,
352
- "datavars" : {"tagName" : tagName ,
353
- "attributeName" : attrName }}
365
+ # def checkURI(self, token, tagName, attrName, attrValue):
366
+ # isValid, errorCode = rfc3987.isValidURI(attrValue)
367
+ # if not isValid:
368
+ # yield {"type": "ParseError",
369
+ # "data": errorCode,
370
+ # "datavars": {"tagName": tagName,
371
+ # "attributeName": attrName}}
372
+ # yield {"type": "ParseError",
373
+ # "data": "invalid-attribute-value",
374
+ # "datavars": {"tagName": tagName,
375
+ # "attributeName": attrName}}
354
376
355
377
def checkIRI (self , token , tagName , attrName , attrValue ):
356
378
isValid , errorCode = rfc3987 .isValidIRI (attrValue )
@@ -382,26 +404,36 @@ def checkID(self, token, tagName, attrName, attrValue):
382
404
"attributeName" : attrName }}
383
405
break
384
406
385
- def checkTokenList (self , tagName , attrName , attrValue ):
386
- # The "token" in the method name refers to tokens in an attribute value
387
- # i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of
388
- # but the "token" parameter refers to the token generated from
389
- # HTMLTokenizer. Sorry for the confusion.
407
def parseTokenList(self, value):
    """Split an attribute value into a list of its tokens.

    Characters found in spaceCharacters act as separators; runs of
    separators never produce empty tokens.  Tokens are returned in the
    order they appear and duplicates are kept.
    """
    tokens = []
    pending = []
    # The appended ' ' is a sentinel: if it is a separator it flushes the
    # last token from inside the loop.
    for ch in value + ' ':
        if ch not in spaceCharacters:
            pending.append(ch)
            continue
        if pending:
            tokens.append(''.join(pending))
            pending = []
    # Flush whatever is still buffered once the input is exhausted.
    if pending:
        tokens.append(''.join(pending))
    return tokens
420
+
421
def checkTokenList(self, tagName, attrName, attrValue):
    """Report a repeated token in a space-separated attribute value.

    The "token" in the method name refers to tokens in an attribute value,
    i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of
    It is unrelated to the "token" parameter of the other check* methods
    (described by the original author as the token generated from
    HTMLTokenizer); this method takes no such parameter.

    Yields a single "duplicate-value-in-token-list" ParseError for the
    first value that occurs more than once, then stops; yields nothing
    when every token is unique.
    """
    seen = set()
    for currentValue in self.parseTokenList(attrValue):
        # Membership test with `in` instead of dict.has_key(), which does
        # not exist on Python 3.
        if currentValue in seen:
            yield {"type": "ParseError",
                   "data": "duplicate-value-in-token-list",
                   "datavars": {"tagName": tagName,
                                "attributeName": attrName,
                                "attributeValue": currentValue}}
            # Only the first duplicate is reported.
            break
        seen.add(currentValue)
405
437
406
438
def checkEnumeratedValue (self , token , tagName , attrName , attrValue , enumeratedValues ):
407
439
if not attrValue and ('' not in enumeratedValues ):
@@ -422,7 +454,7 @@ def checkEnumeratedValue(self, token, tagName, attrName, attrValue, enumeratedVa
422
454
"datavars" : {"tagName" : tagName ,
423
455
"attributeName" : attrName }}
424
456
425
- def checkBooleanValue (self , token , tagName , attrName , attrValue ):
457
+ def checkBoolean (self , token , tagName , attrName , attrValue ):
426
458
enumeratedValues = frozenset ((attrName , '' ))
427
459
if attrValue not in enumeratedValues :
428
460
yield {"type" : "ParseError" ,
@@ -435,7 +467,7 @@ def checkBooleanValue(self, token, tagName, attrName, attrValue):
435
467
"datavars" : {"tagName" : tagName ,
436
468
"attributeName" : attrName }}
437
469
438
- def checkIntegerValue (self , token , tagName , attrName , attrValue ):
470
+ def checkInteger (self , token , tagName , attrName , attrValue ):
439
471
sign = 1
440
472
numberString = ''
441
473
state = 'begin' # ('begin', 'initial-number', 'number', 'trailing-junk')
@@ -476,6 +508,10 @@ def checkIntegerValue(self, token, tagName, attrName, attrValue):
476
508
"datavars" : {"tagName" : tagName ,
477
509
"attributeName" : attrName }}
478
510
511
def checkFloatingPointNumber(self, token, tagName, attrName, attrValue):
    """Placeholder for floating point number validation.

    XXX not implemented: performs no checks and returns None.
    """
    return None
514
+
479
515
def checkBrowsingContext (self , token , tagName , attrName , attrValue ):
480
516
if not attrValue : return
481
517
if attrValue [0 ] != '_' : return
@@ -486,6 +522,56 @@ def checkBrowsingContext(self, token, tagName, attrName, attrValue):
486
522
"datavars" : {"tagName" : tagName ,
487
523
"attributeName" : attrName }}
488
524
525
def checkLangCode(self, token, tagName, attrName, attrValue):
    """Validate a language-code attribute value.

    A blank value is accepted.  Any other value is passed to
    iso639codes.isValidLangCode; if that rejects it, one
    "invalid-lang-code" ParseError is yielded.
    """
    # Blank is OK, so only non-empty values are looked up.
    if attrValue and not iso639codes.isValidLangCode(attrValue):
        errorInfo = {"tagName": tagName,
                     "attributeName": attrName,
                     "attributeValue": attrValue}
        yield {"type": "ParseError",
               "data": "invalid-lang-code",
               "datavars": errorInfo}
533
+
534
def checkMIMEType(self, token, tagName, attrName, attrValue):
    """Validate a MIME type attribute value.

    Yields "attribute-value-can-not-be-blank" for an empty value, or
    "invalid-mime-type" when rfc2046.isValidMIMEType rejects a non-empty
    value.  At most one ParseError is yielded per call.
    """
    # XXX needs tests
    if not attrValue:
        yield {"type": "ParseError",
               "data": "attribute-value-can-not-be-blank",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
        # Stop here: a blank value has already been reported, so it must
        # not also be validated as a MIME type.
        return

    if not rfc2046.isValidMIMEType(attrValue):
        yield {"type": "ParseError",
               "data": "invalid-mime-type",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName,
                            "attributeValue": attrValue}}
548
+
549
def checkMediaQuery(self, token, tagName, attrName, attrValue):
    """Placeholder for media query validation.

    XXX not implemented: performs no checks and returns None.
    """
    return None
552
+
553
def checkLinkRelation(self, token, tagName, attrName, attrValue):
    """Validate a rel attribute value.

    First forwards any errors from checkTokenList (duplicate tokens), then
    yields one "invalid-rel" ParseError for every token that is not in the
    allowed set: linkRelValues when tagName is 'link', aRelValues for any
    other tag.
    """
    for t in self.checkTokenList(tagName, attrName, attrValue) or []:
        yield t

    # Explicit branch rather than `cond and a or b`, which would silently
    # pick aRelValues if linkRelValues were ever empty.
    if tagName == 'link':
        allowedValues = linkRelValues
    else:
        allowedValues = aRelValues

    for currentValue in self.parseTokenList(attrValue):
        if currentValue not in allowedValues:
            yield {"type": "ParseError",
                   "data": "invalid-rel",
                   "datavars": {"tagName": tagName,
                                "attributeName": attrName}}
563
+
564
def checkDateTime(self, token, tagName, attrName, attrValue):
    """Placeholder for date/time validation.

    XXX not implemented: performs no checks and returns None.
    """
    # Unfinished sketch of the intended state machine:
    #     state = 'begin' # ('begin', '...
    #     for c in attrValue:
    #         if state == 'begin':
    #             if c in spaceCharacters:
    #                 continue
    #             elif c in digits:
    #                 state = ...
    return None
573
+
574
+
489
575
##########################################################################
490
576
# Attribute validation
491
577
##########################################################################
@@ -521,17 +607,8 @@ def validateAttributeValueDir(self, token, tagName, attrName, attrValue):
521
607
def validateAttributeValueDraggable(self, token, tagName, attrName, attrValue):
    """Validate the draggable attribute: the value must be 'true' or 'false'.

    Forwards whatever checkEnumeratedValue reports for that set of values.
    """
    allowedValues = frozenset(('true', 'false'))
    problems = self.checkEnumeratedValue(token, tagName, attrName, attrValue, allowedValues)
    # The checker may return a falsy value instead of an iterable.
    for problem in problems or []:
        yield problem
523
609
524
- def validateAttributeValueIrrelevant (self , token , tagName , attrName , attrValue ):
525
- for t in self .checkBooleanValue (token , tagName , attrName , attrValue ) or []: yield t
526
-
527
- def validateAttributeValueLang (self , token , tagName , attrName , attrValue ):
528
- if not attrValue : return # blank is OK
529
- if not iso639codes .isValidLangCode (attrValue ):
530
- yield {"type" : "ParseError" ,
531
- "data" : "invalid-lang-code" ,
532
- "datavars" : {"tagName" : tagName ,
533
- "attributeName" : attrName ,
534
- "attributeValue" : attrValue }}
610
+ validateAttributeValueIrrelevant = checkBoolean
611
+ validateAttributeValueLang = checkLangCode
535
612
536
613
def validateAttributeValueContextmenu (self , token , tagName , attrName , attrValue ):
537
614
for t in self .checkID (token , tagName , attrName , attrValue ) or []: yield t
@@ -552,7 +629,7 @@ def validateAttributeValueId(self, token, tagName, attrName, attrValue):
552
629
self .IDsWeHaveKnownAndLoved .append (attrValue )
553
630
self .thingsThatDefineAnID .append (token )
554
631
555
- validateAttributeValueTabindex = checkIntegerValue
632
+ validateAttributeValueTabindex = checkInteger
556
633
557
634
def validateAttributeValueRef (self , token , tagName , attrName , attrValue ):
558
635
# XXX
@@ -569,13 +646,47 @@ def validateAttributeValueHtmlXmlns(self, token, tagName, attrName, attrValue):
569
646
"datavars" : {"tagName" : tagName ,
570
647
"attributeName" : attrName }}
571
648
572
- def validateAttributeValueBaseHref (self , token , tagName , attrName , attrValue ):
573
- # XXX
574
- pass
575
-
576
649
# Validators that are plain aliases of the generic check* helpers.  The names
# appear to follow validateAttributeValue<Tag><Attribute> -- presumably the
# dispatch on tag/attribute name happens elsewhere in the class; confirm there.

# base
validateAttributeValueBaseHref = checkIRI
validateAttributeValueBaseTarget = checkBrowsingContext

# link
validateAttributeValueLinkHref = checkIRI
validateAttributeValueLinkRel = checkLinkRelation
validateAttributeValueLinkMedia = checkMediaQuery
validateAttributeValueLinkHreflang = checkLangCode
validateAttributeValueLinkType = checkMIMEType

# XXX <meta> attributes

# style
validateAttributeValueStyleMedia = checkMediaQuery
validateAttributeValueStyleType = checkMIMEType
validateAttributeValueStyleScoped = checkBoolean

# blockquote / ol / li
validateAttributeValueBlockquoteCite = checkIRI
validateAttributeValueOlStart = checkInteger
validateAttributeValueLiValue = checkInteger

# XXX need tests from here on

# a
validateAttributeValueAHref = checkIRI
validateAttributeValueATarget = checkBrowsingContext
666
+
667
def validateAttributeValueAPing(self, token, tagName, attrName, attrValue):
    """Validate the ping attribute: every space-separated token must be an IRI.

    The value is split with parseTokenList and each token is checked on its
    own with checkIRI; all errors reported for the tokens are yielded in
    order.
    """
    for currentValue in self.parseTokenList(attrValue):
        # Check the individual token, not the whole attribute value:
        # passing attrValue here would validate the unsplit list once per
        # token and never look at the tokens themselves.
        for t in self.checkIRI(token, tagName, attrName, currentValue) or []:
            yield t
671
+
672
# More validators that are plain aliases of the generic check* helpers
# (names appear to follow validateAttributeValue<Tag><Attribute>).

# a
validateAttributeValueARel = checkLinkRelation
validateAttributeValueAMedia = checkMediaQuery
validateAttributeValueAHreflang = checkLangCode
validateAttributeValueAType = checkMIMEType

# q / time
validateAttributeValueQCite = checkIRI
validateAttributeValueTimeDatetime = checkDateTime

# meter
validateAttributeValueMeterValue = checkFloatingPointNumber
validateAttributeValueMeterMin = checkFloatingPointNumber
validateAttributeValueMeterLow = checkFloatingPointNumber
validateAttributeValueMeterHigh = checkFloatingPointNumber
validateAttributeValueMeterMax = checkFloatingPointNumber
validateAttributeValueMeterOptimum = checkFloatingPointNumber

# progress
validateAttributeValueProgressValue = checkFloatingPointNumber
validateAttributeValueProgressMax = checkFloatingPointNumber

# ins / del
validateAttributeValueInsCite = checkIRI
validateAttributeValueInsDatetime = checkDateTime
validateAttributeValueDelCite = checkIRI
validateAttributeValueDelDatetime = checkDateTime
579
690
580
691
##########################################################################
581
692
# Whole document validation (IDs, etc.)
0 commit comments