Skip to content

Commit 9c136b0

Browse files
committed
More encoding bugfixes
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40507
1 parent a0eefb1 commit 9c136b0

File tree

3 files changed

+207
-10
lines changed

3 files changed

+207
-10
lines changed

src/constants.py

Lines changed: 202 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@
112112
u"\n",
113113
u"\u000B",
114114
u"\u000C",
115-
u" "
115+
u" ",
116+
u"\r"
116117
))
117118

118119
tableInsertModeElements = frozenset((
@@ -455,3 +456,203 @@
455456
"zwj": u"\u200D",
456457
"zwnj": u"\u200C"
457458
}
459+
460+
# Encoding labels accepted as "supported" by the input stream.
#
# Every entry MUST be lower-case: callers lower-case the candidate label
# before testing membership, so an entry containing upper-case characters
# could never match.  A frozenset is used (like the other membership
# constants in this module) because the only operation needed is ``in``.
#
# Each line below groups one codec's name with its aliases.
encodings = frozenset((
    "ascii", "646", "us-ascii",
    "big5", "big5-tw", "csbig5",
    "big5hkscs", "big5-hkscs", "hkscs",
    "cp874",
    "cp875",
    "cp932", "932", "ms932", "mskanji", "ms-kanji",
    "cp949", "949", "ms949", "uhc",
    "cp950", "950", "ms950",
    "cp1006",
    "cp1250", "windows-1250",
    "cp1251", "windows-1251",
    "cp1252", "windows-1252",
    "cp1253", "windows-1253",
    "cp1254", "windows-1254",
    "cp1255", "windows-1255",
    # "windows1256" (no hyphen) is kept for backward compatibility; the
    # hyphenated spelling matches every other windows-125x entry.
    "cp1256", "windows1256", "windows-1256",
    "cp1257", "windows-1257",
    "cp1258", "windows-1258",
    "euc_jp", "eucjp", "ujis", "u-jis",
    "euc_jis_2004", "jisx0213", "eucjis2004",
    "euc_jisx0213", "eucjisx0213",
    "euc_kr", "euckr", "korean", "ksc5601", "ks_c-5601", "ks_c-5601-1987",
    "ksx1001", "ks_x-1001",
    "gb2312", "chinese", "csiso58gb231280", "euc-cn", "euccn",
    "eucgb2312-cn", "gb2312-1980", "gb2312-80", "iso-ir-58",
    "gbk", "936", "cp936", "ms936",
    "gb18030", "gb18030-2000",
    "hz", "hzgb", "hz-gb", "hz-gb-2312",
    "iso2022_jp", "csiso2022jp", "iso2022jp", "iso-2022-jp",
    "iso2022_jp_1", "iso2022jp-1", "iso-2022-jp-1",
    "iso2022_jp_2", "iso2022jp-2", "iso-2022-jp-2",
    "iso2022_jp_2004", "iso2022jp-2004", "iso-2022-jp-2004",
    "iso2022_jp_3", "iso2022jp-3", "iso-2022-jp-3",
    "iso2022_jp_ext", "iso2022jp-ext", "iso-2022-jp-ext",
    "iso2022_kr", "csiso2022kr", "iso2022kr", "iso-2022-kr",
    "latin_1", "iso-8859-1", "iso8859-1", "8859", "cp819", "latin",
    "latin1", "l1",
    "iso8859_2", "iso-8859-2", "latin2", "l2",
    "iso8859_3", "iso-8859-3", "latin3", "l3",
    "iso8859_4", "iso-8859-4", "latin4", "l4",
    "iso8859_5", "iso-8859-5", "cyrillic",
    "iso8859_6", "iso-8859-6", "arabic",
    "iso8859_7", "iso-8859-7", "greek", "greek8",
    "iso8859_8", "iso-8859-8", "hebrew",
    "iso8859_9", "iso-8859-9", "latin5", "l5",
    "iso8859_10", "iso-8859-10", "latin6", "l6",
    "iso8859_13", "iso-8859-13",
    "iso8859_14", "iso-8859-14", "latin8", "l8",
    "iso8859_15", "iso-8859-15",
    "johab", "cp1361", "ms1361",
    "koi8_r",
    "koi8_u",
    "mac_cyrillic", "maccyrillic",
    "mac_greek", "macgreek",
    "mac_iceland", "maciceland",
    "mac_latin2", "maclatin2", "maccentraleurope",
    "mac_roman", "macroman",
    "mac_turkish", "macturkish",
    "ptcp154", "csptcp154", "pt154", "cp154", "cyrillic-asian",
    "shift_jis", "csshiftjis", "shiftjis", "sjis", "s_jis",
    "shift_jis_2004", "shiftjis2004", "sjis_2004", "sjis2004",
    "shift_jisx0213", "shiftjisx0213", "sjisx0213", "s_jisx0213",
    # The hyphenated "utf-16" / "utf-8" spellings are the labels most
    # commonly declared by documents, so they are listed explicitly.
    "utf_16", "utf-16", "u16", "utf16",
    "utf_16_be", "utf-16be",
    "utf_16_le", "utf-16le",
    "utf_8", "utf-8", "u8", "utf", "utf8",
    "utf_8_sig",
))

src/inputstream.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import re
33

44
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
5+
from constants import encodings
56
from utils import MethodDispatcher
67

78
class HTMLInputStream(object):
@@ -356,14 +357,8 @@ def getAttribute(self):
356357
return attr
357358

358359
def isValidEncoding(self, encoding):
    """Report whether ``encoding`` names a supported encoding.

    ``encoding`` may be None, in which case the answer is False.
    Otherwise the label is lower-cased and looked up in the
    ``encodings`` collection imported from ``constants``.
    """
    if encoding is None:
        return False
    # The comparison is case-insensitive on the candidate side only.
    return encoding.lower() in encodings
367362

368363
class FragmentParser(object):
369364
"""Helper object for parsing document fragments e.g. attributes and content

tests/test_encoding.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@
44
import StringIO
55
import unittest
66
import new
7+
import codecs
78

89
# XXX Allow us to import the sibling module
910
os.chdir(os.path.split(os.path.abspath(__file__))[0])
1011
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
1112

1213
import inputstream
13-
import codecs
14+
1415

1516
def parseTestcase(testString):
1617
testString = testString.split("\n")

0 commit comments

Comments
 (0)