Skip to content

Commit d9a9980

Browse files
committed
Util for parsing iana encodings list
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40510
1 parent e80aa0d commit d9a9980

File tree

3 files changed

+441
-0
lines changed

3 files changed

+441
-0
lines changed

utils/encodings.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
# Lowercased character-set names and aliases accepted as valid encoding
# labels.  Membership tests must lowercase the candidate label first, since
# every entry here is lowercase.
# NOTE(review): the layout of the original literal matches what
# utils/iana_parse.py prints, so this list was presumably generated by that
# script -- confirm before hand-editing.
encodings = frozenset([
    "ansi_x3.4-1968", "iso-ir-6", "ansi_x3.4-1986", "iso_646.irv:1991",
    "ascii", "iso646-us", "us-ascii", "us", "ibm367", "cp367", "csascii",

    "ks_c_5601-1987", "korean", "iso-2022-kr", "csiso2022kr", "euc-kr",
    "iso-2022-jp", "csiso2022jp", "iso-2022-jp-2",
    "iso-ir-58", "chinese", "csiso58gb231280",

    "iso_8859-1:1987", "iso-ir-100", "iso_8859-1", "iso-8859-1",
    "latin1", "l1", "ibm819", "cp819", "csisolatin1",
    "iso_8859-2:1987", "iso-ir-101", "iso_8859-2", "iso-8859-2",
    "latin2", "l2", "csisolatin2",
    "iso_8859-3:1988", "iso-ir-109", "iso_8859-3", "iso-8859-3",
    "latin3", "l3", "csisolatin3",
    "iso_8859-4:1988", "iso-ir-110", "iso_8859-4", "iso-8859-4",
    "latin4", "l4", "csisolatin4",
    "iso_8859-6:1987", "iso-ir-127", "iso_8859-6", "iso-8859-6",
    "ecma-114", "asmo-708", "arabic", "csisolatinarabic",
    "iso_8859-7:1987", "iso-ir-126", "iso_8859-7", "iso-8859-7",
    "elot_928", "ecma-118", "greek", "greek8", "csisolatingreek",
    "iso_8859-8:1988", "iso-ir-138", "iso_8859-8", "iso-8859-8",
    "hebrew", "csisolatinhebrew",
    "iso_8859-5:1988", "iso-ir-144", "iso_8859-5", "iso-8859-5",
    "cyrillic", "csisolatincyrillic",
    "iso_8859-9:1989", "iso-ir-148", "iso_8859-9", "iso-8859-9",
    "latin5", "l5", "csisolatin5",
    "iso-8859-10", "iso-ir-157", "l6", "iso_8859-10:1992",
    "csisolatin6", "latin6",

    "hp-roman8", "roman8", "r8",

    "ibm037", "cp037", "ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt",
    "ebcdic-cp-nl", "csibm037",
    "ibm424", "cp424", "ebcdic-cp-he", "csibm424",
    "ibm437", "cp437", "437", "cspc8codepage437",
    "ibm500", "cp500", "ebcdic-cp-be", "ebcdic-cp-ch", "csibm500",
    "ibm775", "cp775", "cspc775baltic",
    "ibm850", "cp850", "850", "cspc850multilingual",
    "ibm852", "cp852", "852", "cspcp852",
    "ibm855", "cp855", "855", "csibm855",
    "ibm857", "cp857", "857", "csibm857",
    "ibm860", "cp860", "860", "csibm860",
    "ibm861", "cp861", "861", "cp-is", "csibm861",
    "ibm862", "cp862", "862", "cspc862latinhebrew",
    "ibm863", "cp863", "863", "csibm863",
    "ibm864", "cp864", "csibm864",
    "ibm865", "cp865", "865", "csibm865",
    "ibm866", "cp866", "866", "csibm866",
    "ibm869", "cp869", "869", "cp-gr", "csibm869",
    "ibm1026", "cp1026", "csibm1026",

    "koi8-r", "cskoi8r", "koi8-u",
    "big5-hkscs",
    "ptcp154", "csptcp154", "pt154", "cp154",

    "utf-7", "utf-16be", "utf-16le", "utf-16", "utf-8",

    "iso-8859-13",
    "iso-8859-14", "iso-ir-199", "iso_8859-14:1998", "iso_8859-14",
    "latin8", "iso-celtic", "l8",
    "iso-8859-15", "iso_8859-15",
    "iso-8859-16", "iso-ir-226", "iso_8859-16:2001", "iso_8859-16",
    "latin10", "l10",

    "gbk", "cp936", "ms936", "gb18030",
    "shift_jis", "ms_kanji", "csshiftjis", "euc-jp",
    "gb2312", "big5", "csbig5",

    "windows-1250", "windows-1251", "windows-1252", "windows-1253",
    "windows-1254", "windows-1255", "windows-1256", "windows-1257",
    "windows-1258",

    "tis-620", "hz-gb-2312",
])

utils/iana_parse.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import urllib2
4+
import codecs
5+
6+
def _open_url(url):
    """Open *url* and return a file-like object that yields its lines.

    urllib2 (Python 2) is preferred; urllib.request is used when urllib2 is
    unavailable.  The import is local so this helper does not depend on the
    module-level ``import urllib2``.
    NOTE(review): that module-level import still has to be made conditional
    before the script as a whole can run on Python 3.
    """
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen
    return urlopen(url)


def _collect_encodings(lines):
    """Return the codec names found in *lines*, lowercased and de-duplicated.

    Only lines starting with ``Name: `` or ``Alias: `` are considered; the
    second whitespace-separated field is taken as the candidate name.  A
    candidate is kept only if ``codecs.lookup`` recognises it.  Names are
    returned in the order they are first seen.
    """
    found = []
    seen = set()
    for line in lines:
        if not isinstance(line, str):
            # Python 3 HTTP/file responses yield bytes; on Python 2 lines are
            # already ``str`` and are left untouched.
            line = line.decode("ascii", "replace")
        if not (line.startswith("Name: ") or line.startswith("Alias: ")):
            continue
        fields = line.split()
        if len(fields) < 2:
            # A bare "Name: " / "Alias: " line carries no value to look up.
            continue
        enc = fields[1]
        try:
            codecs.lookup(enc)
        except LookupError:
            # Python has no codec under this name; leave it out.
            continue
        name = enc.lower()
        # The original compared the bound method ``enc.lower`` (never a
        # member of the list), so duplicates were always appended.
        if name not in seen:
            seen.add(name)
            found.append(name)
    return found


def main():
    """Print a ``frozenset`` literal of the encodings Python supports.

    Reads the document at the URL given in ``sys.argv[1]``, extracts the
    ``Name:``/``Alias:`` entries that ``codecs.lookup`` accepts, and writes
    an ``encodings = frozenset((...))`` assignment to standard output.

    Raises:
        IndexError: if no URL argument was supplied on the command line.
    """
    f = _open_url(sys.argv[1])
    try:
        encodings = _collect_encodings(f)
    finally:
        # Release the connection/file handle even if parsing fails.
        f.close()

    sys.stdout.write("encodings = frozenset((\n")
    for enc in encodings:
        sys.stdout.write(' "%s",\n' % enc)
    sys.stdout.write(' ))')


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)