File tree 3 files changed +207
-10
lines changed
3 files changed +207
-10
lines changed Original file line number Diff line number Diff line change 112
112
u"\n " ,
113
113
u"\u000B " ,
114
114
u"\u000C " ,
115
- u" "
115
+ u" " ,
116
+ u"\r "
116
117
))
117
118
118
119
tableInsertModeElements = frozenset ((
455
456
"zwj" : u"\u200D " ,
456
457
"zwnj" : u"\u200C "
457
458
}
459
+
460
# Names accepted as character-encoding labels.
#
# Every entry MUST be lower case: callers lower-case the candidate label
# before testing membership, so an entry containing upper-case letters could
# never match.  The hyphenated labels "utf-8" and "utf-16" are listed
# explicitly because they are the spellings documents normally declare and
# no normalisation of "-" versus "_" is performed on lookup.
encodings = [
    "ascii",
    "646",
    "us-ascii",
    "big5",
    "big5-tw",
    "csbig5",
    "big5hkscs",
    "big5-hkscs",
    "hkscs",
    "cp874",
    "cp875",
    "cp932",
    "932",
    "ms932",
    "mskanji",
    "ms-kanji",
    "cp949",
    "949",
    "ms949",
    "uhc",
    "cp950",
    "950",
    "ms950",
    "cp1006",
    "cp1250",
    "windows-1250",
    "cp1251",
    "windows-1251",
    "cp1252",
    "windows-1252",
    "cp1253",
    "windows-1253",
    "cp1254",
    "windows-1254",
    "cp1255",
    "windows-1255",
    "cp1256",
    "windows-1256",
    "cp1257",
    "windows-1257",
    "cp1258",
    "windows-1258",
    "euc_jp",
    "eucjp",
    "ujis",
    "u-jis",
    "euc_jis_2004",
    "jisx0213",
    "eucjis2004",
    "euc_jisx0213",
    "eucjisx0213",
    "euc_kr",
    "euckr",
    "korean",
    "ksc5601",
    "ks_c-5601",
    "ks_c-5601-1987",
    "ksx1001",
    "ks_x-1001",
    "gb2312",
    "chinese",
    "csiso58gb231280",
    "euc-cn",
    "euccn",
    "eucgb2312-cn",
    "gb2312-1980",
    "gb2312-80",
    "iso-ir-58",
    "gbk",
    "936",
    "cp936",
    "ms936",
    "gb18030",
    "gb18030-2000",
    "hz",
    "hzgb",
    "hz-gb",
    "hz-gb-2312",
    "iso2022_jp",
    "csiso2022jp",
    "iso2022jp",
    "iso-2022-jp",
    "iso2022_jp_1",
    "iso2022jp-1",
    "iso-2022-jp-1",
    "iso2022_jp_2",
    "iso2022jp-2",
    "iso-2022-jp-2",
    "iso2022_jp_2004",
    "iso2022jp-2004",
    "iso-2022-jp-2004",
    "iso2022_jp_3",
    "iso2022jp-3",
    "iso-2022-jp-3",
    "iso2022_jp_ext",
    "iso2022jp-ext",
    "iso-2022-jp-ext",
    "iso2022_kr",
    "csiso2022kr",
    "iso2022kr",
    "iso-2022-kr",
    "latin_1",
    "iso-8859-1",
    "iso8859-1",
    "8859",
    "cp819",
    "latin",
    "latin1",
    "l1",
    "iso8859_2",
    "iso-8859-2",
    "latin2",
    "l2",
    "iso8859_3",
    "iso-8859-3",
    "latin3",
    "l3",
    "iso8859_4",
    "iso-8859-4",
    "latin4",
    "l4",
    "iso8859_5",
    "iso-8859-5",
    "cyrillic",
    "iso8859_6",
    "iso-8859-6",
    "arabic",
    "iso8859_7",
    "iso-8859-7",
    "greek",
    "greek8",
    "iso8859_8",
    "iso-8859-8",
    "hebrew",
    "iso8859_9",
    "iso-8859-9",
    "latin5",
    "l5",
    "iso8859_10",
    "iso-8859-10",
    "latin6",
    "l6",
    "iso8859_13",
    "iso-8859-13",
    "iso8859_14",
    "iso-8859-14",
    "latin8",
    "l8",
    "iso8859_15",
    "iso-8859-15",
    "johab",
    "cp1361",
    "ms1361",
    "koi8_r",
    "koi8_u",
    "mac_cyrillic",
    "maccyrillic",
    "mac_greek",
    "macgreek",
    "mac_iceland",
    "maciceland",
    "mac_latin2",
    "maclatin2",
    "maccentraleurope",
    "mac_roman",
    "macroman",
    "mac_turkish",
    "macturkish",
    "ptcp154",
    "csptcp154",
    "pt154",
    "cp154",
    "cyrillic-asian",
    "shift_jis",
    "csshiftjis",
    "shiftjis",
    "sjis",
    "s_jis",
    "shift_jis_2004",
    "shiftjis2004",
    "sjis_2004",
    "sjis2004",
    "shift_jisx0213",
    "shiftjisx0213",
    "sjisx0213",
    "s_jisx0213",
    "utf_16",
    "utf-16",
    "u16",
    "utf16",
    "utf_16_be",
    "utf-16be",
    "utf_16_le",
    "utf-16le",
    "utf_8",
    "utf-8",
    "u8",
    "utf",
    "utf8",
    "utf_8_sig",
]
Original file line number Diff line number Diff line change 2
2
import re
3
3
4
4
from constants import EOF , spaceCharacters , asciiLetters , asciiUppercase
5
+ from constants import encodings
5
6
from utils import MethodDispatcher
6
7
7
8
class HTMLInputStream (object ):
@@ -356,14 +357,8 @@ def getAttribute(self):
356
357
return attr
357
358
358
359
def isValidEncoding(self, encoding):
    """Report whether ``encoding`` names a supported encoding.

    Returns False when ``encoding`` is None; otherwise returns True exactly
    when the lower-cased name is a member of the ``encodings`` collection
    imported from ``constants``.
    """
    if encoding is None:
        return False
    return encoding.lower() in encodings
367
362
368
363
class FragmentParser (object ):
369
364
"""Helper object for parsing document fragments e.g. attributes and content
Original file line number Diff line number Diff line change 4
4
import StringIO
5
5
import unittest
6
6
import new
7
+ import codecs
7
8
8
9
# XXX Allow us to import the sibling module
9
10
os .chdir (os .path .split (os .path .abspath (__file__ ))[0 ])
10
11
sys .path .insert (0 , os .path .abspath (os .path .join (os .pardir , "src" )))
11
12
12
13
import inputstream
13
- import codecs
14
+
14
15
15
16
def parseTestcase (testString ):
16
17
testString = testString .split ("\n " )
You can’t perform that action at this time.
0 commit comments