Patches to address SF bugs 1409538 (Japanese codecs in CODEC_MAP) and 1409455

warsaw · warsaw · commit f5853f7592fa · 2006-02-08T13:33:20.000Z
(.set_payload() gives bad .get_payload() results).  Specific changes include:

Simplify the default CODEC_MAP in Charset.py to not include the Japanese and
Korean codecs.  The names of the codecs are different depending on whether
you're using Python 2.4 or 2.5, which include the codecs by default, or
earlier Pythons, which provide the codecs under different names as a third
party library.  Now, we attempt to discover which (if either) is available and
populate the CODEC_MAP as appropriate.

Message.set_charset(): When the message does not already have a
Content-Transfer-Encoding header, instead of just adding the header, we also
encode the body as defined by the assigned Charset.  As before, if the
body_encoding is callable, we just call that.  If not, then we add a call to
body_encode() before setting the header.  This way, we guarantee that a
message's text payload is always encoded properly.

Remove the payload encoding code from Generator._handle_text().  With the
above patch, this would cause the body to be doubly encoded.  Doing this in
the Message class is better than only doing it in the Generator.

Added some new tests to ensure everything works correctly.  Also changed the
way the test_email_codecs.py tests get added (using the same lookup code that
the CODEC_MAP adjustments use).

This resolves both issues for email 2.5/Python 2.3.  I will patch forward to
email 3.0 for both Python 2.4 and 2.5.
diff --git a/Lib/email/Charset.py b/Lib/email/Charset.py
@@ -1,5 +1,5 @@
-# Copyright (C) 2001,2002 Python Software Foundation
-# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
+# Copyright (C) 2001-2006 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield), barry@python.org (Barry Warsaw)
 
 from types import UnicodeType
 from email.Encoders import encode_7or8bit
@@ -99,20 +99,13 @@ def _isunicode(s):
 # of stability and useability.
 
 CODEC_MAP = {
-    'euc-jp':      'japanese.euc-jp',
-    'iso-2022-jp': 'japanese.iso-2022-jp',
-    'shift_jis':   'japanese.shift_jis',
-    'euc-kr':      'korean.euc-kr',
-    'ks_c_5601-1987': 'korean.cp949',
-    'iso-2022-kr': 'korean.iso-2022-kr',
-    'johab':       'korean.johab',
-    'gb2132':      'eucgb2312_cn',
-    'big5':        'big5_tw',
-    'utf-8':       'utf-8',
+    'gb2132':   'eucgb2312_cn',
+    'big5':     'big5_tw',
+    'utf-8':    'utf-8',
     # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
     # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
     # Let that stuff pass through without conversion to/from Unicode.
-    'us-ascii':    None,
+    'us-ascii': None,
     }
 
 
@@ -165,6 +158,26 @@ def add_codec(charset, codecname):
     CODEC_MAP[charset] = codecname
 
 
+def _find_asian_codec(charset, language):
+    try:
+        unicode('foo', charset)
+        return charset
+    except LookupError:
+        try:
+            codec = language + '.' + charset
+            unicode('foo', codec)
+            return codec
+        except LookupError:
+            return None
+
+
+for _charset in ('euc-jp', 'iso-2022-jp', 'shift_jis'):
+    add_codec(_charset, _find_asian_codec(_charset, 'japanese') or _charset)
+
+for _charset in ('euc-kr', 'cp949', 'iso-2022-kr', 'johab'):
+    add_codec(_charset, _find_asian_codec(_charset, 'korean') or _charset)
+
+
 
 class Charset:
     """Map character sets to their email properties.
@@ -229,7 +242,7 @@ def __init__(self, input_charset=DEFAULT_CHARSET):
         self.input_codec = CODEC_MAP.get(self.input_charset,
                                          self.input_charset)
         self.output_codec = CODEC_MAP.get(self.output_charset,
-                                            self.input_codec)
+                                          self.input_codec)
 
     def __str__(self):
         return self.input_charset.lower()
diff --git a/Lib/email/Generator.py b/Lib/email/Generator.py
@@ -1,8 +1,7 @@
-# Copyright (C) 2001,2002 Python Software Foundation
-# Author: barry@zope.com (Barry Warsaw)
+# Copyright (C) 2001-2006 Python Software Foundation
+# Author: barry@python.org (Barry Warsaw)
 
-"""Classes to generate plain text from a message object tree.
-"""
+"""Classes to generate plain text from a message object tree."""
 
 import re
 import sys
@@ -192,9 +191,6 @@ def _handle_text(self, msg):
         payload = msg.get_payload()
         if payload is None:
             return
-        cset = msg.get_charset()
-        if cset is not None:
-            payload = cset.body_encode(payload)
         if not _isstring(payload):
             raise TypeError, 'string payload expected: %s' % type(payload)
         if self._mangle_from_:
diff --git a/Lib/email/Message.py b/Lib/email/Message.py
@@ -272,11 +272,14 @@ def set_charset(self, charset):
                             charset=charset.get_output_charset())
         else:
             self.set_param('charset', charset.get_output_charset())
+        if str(charset) <> charset.get_output_charset():
+            self._payload = charset.body_encode(self._payload)
         if not self.has_key('Content-Transfer-Encoding'):
             cte = charset.get_body_encoding()
             if callable(cte):
                 cte(self)
             else:
+                self._payload = charset.body_encode(self._payload)
                 self.add_header('Content-Transfer-Encoding', cte)
 
     def get_charset(self):
diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py
@@ -2073,7 +2073,8 @@ def test_get_body_encoding_with_uppercase_charset(self):
         charset = Charset(charsets[0])
         eq(charset.get_body_encoding(), 'base64')
         msg.set_payload('hello world', charset=charset)
-        eq(msg.get_payload(), 'hello world')
+        eq(msg.get_payload(), 'aGVsbG8gd29ybGQ=\n')
+        eq(msg.get_payload(decode=True), 'hello world')
         eq(msg['content-transfer-encoding'], 'base64')
         # Try another one
         msg = Message()
diff --git a/Lib/email/test/test_email_codecs.py b/Lib/email/test/test_email_codecs.py
@@ -1,17 +1,16 @@
-# Copyright (C) 2002 Python Software Foundation
+# Copyright (C) 2002-2006 Python Software Foundation
 # email package unit tests for (optional) Asian codecs
 
 import unittest
 from test.test_support import TestSkipped, run_unittest
 
 from email.test.test_email import TestEmailBase
-from email.Charset import Charset
+from email.Charset import Charset, _find_asian_codec
 from email.Header import Header, decode_header
+from email.Message import Message
 
 # See if we have the Japanese codecs package installed
-try:
-    unicode('foo', 'japanese.iso-2022-jp')
-except LookupError:
+if not _find_asian_codec('iso-2022-jp', 'japanese'):
     raise TestSkipped, 'Optional Japanese codecs not installed'
 
 
@@ -49,6 +48,14 @@ def test_japanese_codecs(self):
         # TK: full decode comparison
         eq(h.__unicode__().encode('euc-jp'), long)
 
+    def test_payload_encoding(self):
+        jhello = '\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc\xa5\xeb\xa5\xc9\xa1\xaa'
+        jcode  = 'euc-jp'
+        msg = Message()
+        msg.set_payload(jhello, jcode)
+        ustr = unicode(msg.get_payload(), msg.get_content_charset())
+        self.assertEqual(jhello, ustr.encode(jcode))
+
 
 
 def suite():