Skip to content

Commit f5853f7

Browse files
committed
Patches to address SF bugs 1409538 (Japanese codecs in CODEC_MAP) and 1409455
(.set_payload() gives bad .get_payload() results). Specific changes include: Simplify the default CODEC_MAP in Charset.py to not include the Japanese and Korean codecs. The names of the codecs are different depending on whether you're using Python 2.4 or 2.5, which include the codecs by default, or earlier Pythons, which provide the codecs under different names as a third-party library. Now, we attempt to discover which (if either) is available and populate the CODEC_MAP as appropriate. Message.set_charset(): When the message does not already have a Content-Transfer-Encoding header, instead of just adding the header, we also encode the body as defined by the assigned Charset. As before, if the body_encoding is callable, we just call that. If not, then we add a call to body_encode() before setting the header. This way, we guarantee that a message's text payload is always encoded properly. Remove the payload encoding code from Generator._handle_text(). With the above patch, this would cause the body to be doubly encoded. Doing this in the Message class is better than only doing it in the Generator. Added some new tests to ensure everything works correctly. Also changed the way the test_email_codecs.py tests get added (using the same lookup code that the CODEC_MAP adjustments use). This resolves both issues for email 2.5/Python 2.3. I will patch forward to email 3.0 for both Python 2.4 and 2.5.
1 parent 784fccf commit f5853f7

File tree

5 files changed

+47
-27
lines changed

5 files changed

+47
-27
lines changed

Lib/email/Charset.py

+27-14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
# Copyright (C) 2001,2002 Python Software Foundation
2-
# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
1+
# Copyright (C) 2001-2006 Python Software Foundation
2+
# Author: che@debian.org (Ben Gertzfield), barry@python.org (Barry Warsaw)
33

44
from types import UnicodeType
55
from email.Encoders import encode_7or8bit
@@ -99,20 +99,13 @@ def _isunicode(s):
9999
# of stability and useability.
100100

101101
CODEC_MAP = {
102-
'euc-jp': 'japanese.euc-jp',
103-
'iso-2022-jp': 'japanese.iso-2022-jp',
104-
'shift_jis': 'japanese.shift_jis',
105-
'euc-kr': 'korean.euc-kr',
106-
'ks_c_5601-1987': 'korean.cp949',
107-
'iso-2022-kr': 'korean.iso-2022-kr',
108-
'johab': 'korean.johab',
109-
'gb2132': 'eucgb2312_cn',
110-
'big5': 'big5_tw',
111-
'utf-8': 'utf-8',
102+
'gb2132': 'eucgb2312_cn',
103+
'big5': 'big5_tw',
104+
'utf-8': 'utf-8',
112105
# Hack: We don't want *any* conversion for stuff marked us-ascii, as all
113106
# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
114107
# Let that stuff pass through without conversion to/from Unicode.
115-
'us-ascii': None,
108+
'us-ascii': None,
116109
}
117110

118111

@@ -165,6 +158,26 @@ def add_codec(charset, codecname):
165158
CODEC_MAP[charset] = codecname
166159

167160

161+
def _find_asian_codec(charset, language):
162+
try:
163+
unicode('foo', charset)
164+
return charset
165+
except LookupError:
166+
try:
167+
codec = language + '.' + charset
168+
unicode('foo', codec)
169+
return codec
170+
except LookupError:
171+
return None
172+
173+
174+
for _charset in ('euc-jp', 'iso-2022-jp', 'shift_jis'):
175+
add_codec(_charset, _find_asian_codec(_charset, 'japanese') or _charset)
176+
177+
for _charset in ('euc-kr', 'cp949', 'iso-2022-kr', 'johab'):
178+
add_codec(_charset, _find_asian_codec(_charset, 'korean') or _charset)
179+
180+
168181

169182
class Charset:
170183
"""Map character sets to their email properties.
@@ -229,7 +242,7 @@ def __init__(self, input_charset=DEFAULT_CHARSET):
229242
self.input_codec = CODEC_MAP.get(self.input_charset,
230243
self.input_charset)
231244
self.output_codec = CODEC_MAP.get(self.output_charset,
232-
self.input_codec)
245+
self.input_codec)
233246

234247
def __str__(self):
235248
return self.input_charset.lower()

Lib/email/Generator.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
# Copyright (C) 2001,2002 Python Software Foundation
2-
# Author: barry@zope.com (Barry Warsaw)
1+
# Copyright (C) 2001-2006 Python Software Foundation
2+
# Author: barry@python.org (Barry Warsaw)
33

4-
"""Classes to generate plain text from a message object tree.
5-
"""
4+
"""Classes to generate plain text from a message object tree."""
65

76
import re
87
import sys
@@ -192,9 +191,6 @@ def _handle_text(self, msg):
192191
payload = msg.get_payload()
193192
if payload is None:
194193
return
195-
cset = msg.get_charset()
196-
if cset is not None:
197-
payload = cset.body_encode(payload)
198194
if not _isstring(payload):
199195
raise TypeError, 'string payload expected: %s' % type(payload)
200196
if self._mangle_from_:

Lib/email/Message.py

+3
Original file line numberDiff line numberDiff line change
@@ -272,11 +272,14 @@ def set_charset(self, charset):
272272
charset=charset.get_output_charset())
273273
else:
274274
self.set_param('charset', charset.get_output_charset())
275+
if str(charset) <> charset.get_output_charset():
276+
self._payload = charset.body_encode(self._payload)
275277
if not self.has_key('Content-Transfer-Encoding'):
276278
cte = charset.get_body_encoding()
277279
if callable(cte):
278280
cte(self)
279281
else:
282+
self._payload = charset.body_encode(self._payload)
280283
self.add_header('Content-Transfer-Encoding', cte)
281284

282285
def get_charset(self):

Lib/email/test/test_email.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2073,7 +2073,8 @@ def test_get_body_encoding_with_uppercase_charset(self):
20732073
charset = Charset(charsets[0])
20742074
eq(charset.get_body_encoding(), 'base64')
20752075
msg.set_payload('hello world', charset=charset)
2076-
eq(msg.get_payload(), 'hello world')
2076+
eq(msg.get_payload(), 'aGVsbG8gd29ybGQ=\n')
2077+
eq(msg.get_payload(decode=True), 'hello world')
20772078
eq(msg['content-transfer-encoding'], 'base64')
20782079
# Try another one
20792080
msg = Message()

Lib/email/test/test_email_codecs.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
1-
# Copyright (C) 2002 Python Software Foundation
1+
# Copyright (C) 2002-2006 Python Software Foundation
22
# email package unit tests for (optional) Asian codecs
33

44
import unittest
55
from test.test_support import TestSkipped, run_unittest
66

77
from email.test.test_email import TestEmailBase
8-
from email.Charset import Charset
8+
from email.Charset import Charset, _find_asian_codec
99
from email.Header import Header, decode_header
10+
from email.Message import Message
1011

1112
# See if we have the Japanese codecs package installed
12-
try:
13-
unicode('foo', 'japanese.iso-2022-jp')
14-
except LookupError:
13+
if not _find_asian_codec('iso-2022-jp', 'japanese'):
1514
raise TestSkipped, 'Optional Japanese codecs not installed'
1615

1716

@@ -49,6 +48,14 @@ def test_japanese_codecs(self):
4948
# TK: full decode comparison
5049
eq(h.__unicode__().encode('euc-jp'), long)
5150

51+
def test_payload_encoding(self):
52+
jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
53+
jcode = 'euc-jp'
54+
msg = Message()
55+
msg.set_payload(jhello, jcode)
56+
ustr = unicode(msg.get_payload(), msg.get_content_charset())
57+
self.assertEqual(jhello, ustr.encode(jcode))
58+
5259

5360

5461
def suite():

0 commit comments

Comments
 (0)