Skip to content

Commit 1c27075

Browse files
committed
Fix html5lib#124: Move to webencodings for decoding the input byte stream.
1 parent c36197d commit 1c27075

File tree

6 files changed: 16 additions and 266 deletions

6 files changed

+16
-266
lines changed

CHANGES.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ Change Log
66

77
Released on XXX, 2014
88

9-
* XXX
9+
* Fix #124: move to webencodings for decoding the input byte stream;
10+
this makes html5lib compliant with the Encoding Standard, and
11+
introduces a required dependency on webencodings.
1012

1113

1214
0.999

html5lib/constants.py

Lines changed: 0 additions & 229 deletions
Original file line numberDiff line numberDiff line change
@@ -2848,235 +2848,6 @@
28482848
0x9F: "\u0178",
28492849
}
28502850

2851-
encodings = {
2852-
'437': 'cp437',
2853-
'850': 'cp850',
2854-
'852': 'cp852',
2855-
'855': 'cp855',
2856-
'857': 'cp857',
2857-
'860': 'cp860',
2858-
'861': 'cp861',
2859-
'862': 'cp862',
2860-
'863': 'cp863',
2861-
'865': 'cp865',
2862-
'866': 'cp866',
2863-
'869': 'cp869',
2864-
'ansix341968': 'ascii',
2865-
'ansix341986': 'ascii',
2866-
'arabic': 'iso8859-6',
2867-
'ascii': 'ascii',
2868-
'asmo708': 'iso8859-6',
2869-
'big5': 'big5',
2870-
'big5hkscs': 'big5hkscs',
2871-
'chinese': 'gbk',
2872-
'cp037': 'cp037',
2873-
'cp1026': 'cp1026',
2874-
'cp154': 'ptcp154',
2875-
'cp367': 'ascii',
2876-
'cp424': 'cp424',
2877-
'cp437': 'cp437',
2878-
'cp500': 'cp500',
2879-
'cp775': 'cp775',
2880-
'cp819': 'windows-1252',
2881-
'cp850': 'cp850',
2882-
'cp852': 'cp852',
2883-
'cp855': 'cp855',
2884-
'cp857': 'cp857',
2885-
'cp860': 'cp860',
2886-
'cp861': 'cp861',
2887-
'cp862': 'cp862',
2888-
'cp863': 'cp863',
2889-
'cp864': 'cp864',
2890-
'cp865': 'cp865',
2891-
'cp866': 'cp866',
2892-
'cp869': 'cp869',
2893-
'cp936': 'gbk',
2894-
'cpgr': 'cp869',
2895-
'cpis': 'cp861',
2896-
'csascii': 'ascii',
2897-
'csbig5': 'big5',
2898-
'cseuckr': 'cp949',
2899-
'cseucpkdfmtjapanese': 'euc_jp',
2900-
'csgb2312': 'gbk',
2901-
'cshproman8': 'hp-roman8',
2902-
'csibm037': 'cp037',
2903-
'csibm1026': 'cp1026',
2904-
'csibm424': 'cp424',
2905-
'csibm500': 'cp500',
2906-
'csibm855': 'cp855',
2907-
'csibm857': 'cp857',
2908-
'csibm860': 'cp860',
2909-
'csibm861': 'cp861',
2910-
'csibm863': 'cp863',
2911-
'csibm864': 'cp864',
2912-
'csibm865': 'cp865',
2913-
'csibm866': 'cp866',
2914-
'csibm869': 'cp869',
2915-
'csiso2022jp': 'iso2022_jp',
2916-
'csiso2022jp2': 'iso2022_jp_2',
2917-
'csiso2022kr': 'iso2022_kr',
2918-
'csiso58gb231280': 'gbk',
2919-
'csisolatin1': 'windows-1252',
2920-
'csisolatin2': 'iso8859-2',
2921-
'csisolatin3': 'iso8859-3',
2922-
'csisolatin4': 'iso8859-4',
2923-
'csisolatin5': 'windows-1254',
2924-
'csisolatin6': 'iso8859-10',
2925-
'csisolatinarabic': 'iso8859-6',
2926-
'csisolatincyrillic': 'iso8859-5',
2927-
'csisolatingreek': 'iso8859-7',
2928-
'csisolatinhebrew': 'iso8859-8',
2929-
'cskoi8r': 'koi8-r',
2930-
'csksc56011987': 'cp949',
2931-
'cspc775baltic': 'cp775',
2932-
'cspc850multilingual': 'cp850',
2933-
'cspc862latinhebrew': 'cp862',
2934-
'cspc8codepage437': 'cp437',
2935-
'cspcp852': 'cp852',
2936-
'csptcp154': 'ptcp154',
2937-
'csshiftjis': 'shift_jis',
2938-
'csunicode11utf7': 'utf-7',
2939-
'cyrillic': 'iso8859-5',
2940-
'cyrillicasian': 'ptcp154',
2941-
'ebcdiccpbe': 'cp500',
2942-
'ebcdiccpca': 'cp037',
2943-
'ebcdiccpch': 'cp500',
2944-
'ebcdiccphe': 'cp424',
2945-
'ebcdiccpnl': 'cp037',
2946-
'ebcdiccpus': 'cp037',
2947-
'ebcdiccpwt': 'cp037',
2948-
'ecma114': 'iso8859-6',
2949-
'ecma118': 'iso8859-7',
2950-
'elot928': 'iso8859-7',
2951-
'eucjp': 'euc_jp',
2952-
'euckr': 'cp949',
2953-
'extendedunixcodepackedformatforjapanese': 'euc_jp',
2954-
'gb18030': 'gb18030',
2955-
'gb2312': 'gbk',
2956-
'gb231280': 'gbk',
2957-
'gbk': 'gbk',
2958-
'greek': 'iso8859-7',
2959-
'greek8': 'iso8859-7',
2960-
'hebrew': 'iso8859-8',
2961-
'hproman8': 'hp-roman8',
2962-
'hzgb2312': 'hz',
2963-
'ibm037': 'cp037',
2964-
'ibm1026': 'cp1026',
2965-
'ibm367': 'ascii',
2966-
'ibm424': 'cp424',
2967-
'ibm437': 'cp437',
2968-
'ibm500': 'cp500',
2969-
'ibm775': 'cp775',
2970-
'ibm819': 'windows-1252',
2971-
'ibm850': 'cp850',
2972-
'ibm852': 'cp852',
2973-
'ibm855': 'cp855',
2974-
'ibm857': 'cp857',
2975-
'ibm860': 'cp860',
2976-
'ibm861': 'cp861',
2977-
'ibm862': 'cp862',
2978-
'ibm863': 'cp863',
2979-
'ibm864': 'cp864',
2980-
'ibm865': 'cp865',
2981-
'ibm866': 'cp866',
2982-
'ibm869': 'cp869',
2983-
'iso2022jp': 'iso2022_jp',
2984-
'iso2022jp2': 'iso2022_jp_2',
2985-
'iso2022kr': 'iso2022_kr',
2986-
'iso646irv1991': 'ascii',
2987-
'iso646us': 'ascii',
2988-
'iso88591': 'windows-1252',
2989-
'iso885910': 'iso8859-10',
2990-
'iso8859101992': 'iso8859-10',
2991-
'iso885911987': 'windows-1252',
2992-
'iso885913': 'iso8859-13',
2993-
'iso885914': 'iso8859-14',
2994-
'iso8859141998': 'iso8859-14',
2995-
'iso885915': 'iso8859-15',
2996-
'iso885916': 'iso8859-16',
2997-
'iso8859162001': 'iso8859-16',
2998-
'iso88592': 'iso8859-2',
2999-
'iso885921987': 'iso8859-2',
3000-
'iso88593': 'iso8859-3',
3001-
'iso885931988': 'iso8859-3',
3002-
'iso88594': 'iso8859-4',
3003-
'iso885941988': 'iso8859-4',
3004-
'iso88595': 'iso8859-5',
3005-
'iso885951988': 'iso8859-5',
3006-
'iso88596': 'iso8859-6',
3007-
'iso885961987': 'iso8859-6',
3008-
'iso88597': 'iso8859-7',
3009-
'iso885971987': 'iso8859-7',
3010-
'iso88598': 'iso8859-8',
3011-
'iso885981988': 'iso8859-8',
3012-
'iso88599': 'windows-1254',
3013-
'iso885991989': 'windows-1254',
3014-
'isoceltic': 'iso8859-14',
3015-
'isoir100': 'windows-1252',
3016-
'isoir101': 'iso8859-2',
3017-
'isoir109': 'iso8859-3',
3018-
'isoir110': 'iso8859-4',
3019-
'isoir126': 'iso8859-7',
3020-
'isoir127': 'iso8859-6',
3021-
'isoir138': 'iso8859-8',
3022-
'isoir144': 'iso8859-5',
3023-
'isoir148': 'windows-1254',
3024-
'isoir149': 'cp949',
3025-
'isoir157': 'iso8859-10',
3026-
'isoir199': 'iso8859-14',
3027-
'isoir226': 'iso8859-16',
3028-
'isoir58': 'gbk',
3029-
'isoir6': 'ascii',
3030-
'koi8r': 'koi8-r',
3031-
'koi8u': 'koi8-u',
3032-
'korean': 'cp949',
3033-
'ksc5601': 'cp949',
3034-
'ksc56011987': 'cp949',
3035-
'ksc56011989': 'cp949',
3036-
'l1': 'windows-1252',
3037-
'l10': 'iso8859-16',
3038-
'l2': 'iso8859-2',
3039-
'l3': 'iso8859-3',
3040-
'l4': 'iso8859-4',
3041-
'l5': 'windows-1254',
3042-
'l6': 'iso8859-10',
3043-
'l8': 'iso8859-14',
3044-
'latin1': 'windows-1252',
3045-
'latin10': 'iso8859-16',
3046-
'latin2': 'iso8859-2',
3047-
'latin3': 'iso8859-3',
3048-
'latin4': 'iso8859-4',
3049-
'latin5': 'windows-1254',
3050-
'latin6': 'iso8859-10',
3051-
'latin8': 'iso8859-14',
3052-
'latin9': 'iso8859-15',
3053-
'ms936': 'gbk',
3054-
'mskanji': 'shift_jis',
3055-
'pt154': 'ptcp154',
3056-
'ptcp154': 'ptcp154',
3057-
'r8': 'hp-roman8',
3058-
'roman8': 'hp-roman8',
3059-
'shiftjis': 'shift_jis',
3060-
'tis620': 'cp874',
3061-
'unicode11utf7': 'utf-7',
3062-
'us': 'ascii',
3063-
'usascii': 'ascii',
3064-
'utf16': 'utf-16',
3065-
'utf16be': 'utf-16-be',
3066-
'utf16le': 'utf-16-le',
3067-
'utf8': 'utf-8',
3068-
'windows1250': 'cp1250',
3069-
'windows1251': 'cp1251',
3070-
'windows1252': 'cp1252',
3071-
'windows1253': 'cp1253',
3072-
'windows1254': 'cp1254',
3073-
'windows1255': 'cp1255',
3074-
'windows1256': 'cp1256',
3075-
'windows1257': 'cp1257',
3076-
'windows1258': 'cp1258',
3077-
'windows936': 'gbk',
3078-
'x-x-big5': 'big5'}
3079-
30802851
tokenTypes = {
30812852
"Doctype": 0,
30822853
"Characters": 1,

html5lib/inputstream.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
from __future__ import absolute_import, division, unicode_literals
2-
from six import text_type
2+
from six import text_type, binary_type
33
from six.moves import http_client
44

55
import codecs
66
import re
77

8+
import webencodings
9+
810
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
9-
from .constants import encodings, ReparseException
11+
from .constants import ReparseException
1012
from . import utils
1113

1214
from io import StringIO
@@ -479,12 +481,6 @@ def detectEncoding(self, parseMeta=True, chardet=True):
479481
confidence = "tentative"
480482
encoding = self.defaultEncoding
481483

482-
# Substitute for equivalent encodings:
483-
encodingSub = {"iso-8859-1": "windows-1252"}
484-
485-
if encoding.lower() in encodingSub:
486-
encoding = encodingSub[encoding.lower()]
487-
488484
return encoding, confidence
489485

490486
def changeEncoding(self, newEncoding):
@@ -874,13 +870,16 @@ def parse(self):
874870
def codecName(encoding):
875871
"""Return the python codec name corresponding to an encoding or None if the
876872
string doesn't correspond to a valid encoding."""
877-
if isinstance(encoding, bytes):
873+
if isinstance(encoding, binary_type):
878874
try:
879875
encoding = encoding.decode("ascii")
880876
except UnicodeDecodeError:
881877
return None
882-
if encoding:
883-
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
884-
return encodings.get(canonicalName, None)
878+
879+
if encoding is not None:
880+
try:
881+
return webencodings.lookup(encoding).name
882+
except AttributeError:
883+
return None
885884
else:
886885
return None

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
six
2+
webencodings

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ deps =
1919
Genshi
2020
nose
2121
six
22+
webencodings
2223

2324
[testenv:py26]
2425
basepython = python2.6

utils/iana_parse.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

0 commit comments

Comments
 (0)