Skip to content

Commit 2bfcf5d

Browse files
committed
Back port r50693 and r50754 from the trunk (and 2.4 branch):
decode_rfc2231(): Be more robust against buggy RFC 2231 encodings. Specifically, instead of raising a ValueError when there is a single tick in the parameter, simply return the entire string unquoted, with None for both the charset and the language. Also, if there are more than 2 ticks in the parameter, interpret the first three parts as the standard RFC 2231 parts, then the rest of the parts as the encoded string. More RFC 2231 improvements for the email 4.0 package. As Mark Sapiro rightly points out there are really two types of continued headers defined in this RFC (i.e. "encoded" parameters with the form "name*0*=" and unencoded parameters with the form "name*0="), but we were handling them both the same way and that isn't correct. This patch should be much more RFC compliant in that only encoded params are %-decoded and the charset/language information is only extracted if there are any encoded params in the segments. If there are no encoded params then the RFC says that there will be no charset/language parts. Note however that this will change the return value for Message.get_param() in some cases. For example, whereas before if you had all unencoded param continuations you would have still gotten a 3-tuple back from this method (with charset and language == None), you will now get just a string. I don't believe this is a backward incompatible change though because the documentation for this method already indicates that either return value is possible and that you must do an isinstance(val, tuple) check to discriminate between the two. (Yeah that API kind of sucks but we can't change /that/ without breaking code.) Test cases, some documentation updates, and a NEWS item accompany this patch. Original fewer-than-3-parts fix by Tokio Kikuchi. Resolves SF bug # 1218081. Also, bump the package version number to 2.5.8 for release.
1 parent e3e7851 commit 2bfcf5d

File tree

4 files changed

+195
-37
lines changed

4 files changed

+195
-37
lines changed

Lib/email/Utils.py

+44-23
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
# Copyright (C) 2001,2002 Python Software Foundation
2-
# Author: barry@zope.com (Barry Warsaw)
1+
# Copyright (C) 2001-2006 Python Software Foundation
2+
# Author: Barry Warsaw
3+
# Contact: email-sig@python.org
34

4-
"""Miscellaneous utilities.
5-
"""
5+
"""Miscellaneous utilities."""
66

77
import time
88
import socket
99
import re
1010
import random
1111
import os
12+
import urllib
1213
import warnings
1314
from cStringIO import StringIO
1415
from types import ListType
@@ -53,6 +54,7 @@ def _qdecode(s):
5354
EMPTYSTRING = ''
5455
UEMPTYSTRING = u''
5556
CRLF = '\r\n'
57+
TICK = "'"
5658

5759
specialsre = re.compile(r'[][\\()<>@,:;".]')
5860
escapesre = re.compile(r'[][\\()"]')
@@ -277,12 +279,14 @@ def unquote(str):
277279
# RFC2231-related functions - parameter encoding and decoding
278280
def decode_rfc2231(s):
279281
"""Decode string according to RFC 2231"""
280-
import urllib
281-
parts = s.split("'", 2)
282-
if len(parts) == 1:
282+
parts = s.split(TICK, 2)
283+
if len(parts) <= 2:
283284
return None, None, urllib.unquote(s)
284-
charset, language, s = parts
285-
return charset, language, urllib.unquote(s)
285+
if len(parts) > 3:
286+
charset, language = pars[:2]
287+
s = TICK.join(parts[2:])
288+
return charset, language, s
289+
return parts
286290

287291

288292
def encode_rfc2231(s, charset=None, language=None):
@@ -306,35 +310,52 @@ def encode_rfc2231(s, charset=None, language=None):
306310
def decode_params(params):
307311
"""Decode parameters list according to RFC 2231.
308312
309-
params is a sequence of 2-tuples containing (content type, string value).
313+
params is a sequence of 2-tuples containing (param name, string value).
310314
"""
315+
# Copy params so we don't mess with the original
316+
params = params[:]
311317
new_params = []
312-
# maps parameter's name to a list of continuations
318+
# Map parameter's name to a list of continuations. The values are a
319+
# 3-tuple of the continuation number, the string value, and a flag
320+
# specifying whether a particular segment is %-encoded.
313321
rfc2231_params = {}
314-
# params is a sequence of 2-tuples containing (content_type, string value)
315-
name, value = params[0]
322+
name, value = params.pop(0)
316323
new_params.append((name, value))
317-
# Cycle through each of the rest of the parameters.
318-
for name, value in params[1:]:
324+
while params:
325+
name, value = params.pop(0)
326+
if name.endswith('*'):
327+
encoded = True
328+
else:
329+
encoded = False
319330
value = unquote(value)
320331
mo = rfc2231_continuation.match(name)
321332
if mo:
322333
name, num = mo.group('name', 'num')
323334
if num is not None:
324335
num = int(num)
325-
rfc2231_param1 = rfc2231_params.setdefault(name, [])
326-
rfc2231_param1.append((num, value))
336+
rfc2231_params.setdefault(name, []).append((num, value, encoded))
327337
else:
328338
new_params.append((name, '"%s"' % quote(value)))
329339
if rfc2231_params:
330340
for name, continuations in rfc2231_params.items():
331341
value = []
342+
extended = False
332343
# Sort by number
333344
continuations.sort()
334-
# And now append all values in num order
335-
for num, continuation in continuations:
336-
value.append(continuation)
337-
charset, language, value = decode_rfc2231(EMPTYSTRING.join(value))
338-
new_params.append(
339-
(name, (charset, language, '"%s"' % quote(value))))
345+
# And now append all values in numerical order, converting
346+
# %-encodings for the encoded segments. If any of the
347+
# continuation names ends in a *, then the entire string, after
348+
# decoding segments and concatenating, must have the charset and
349+
# language specifiers at the beginning of the string.
350+
for num, s, encoded in continuations:
351+
if encoded:
352+
s = urllib.unquote(s)
353+
extended = True
354+
value.append(s)
355+
value = quote(EMPTYSTRING.join(value))
356+
if extended:
357+
charset, language, value = decode_rfc2231(value)
358+
new_params.append((name, (charset, language, '"%s"' % value)))
359+
else:
360+
new_params.append((name, '"%s"' % value))
340361
return new_params

Lib/email/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
"""A package for parsing, handling, and generating email messages."""
55

6-
__version__ = '2.5.7'
6+
__version__ = '2.5.8'
77

88
__all__ = [
99
'base64MIME',

Lib/email/test/test_email.py

+138-13
Original file line numberDiff line numberDiff line change
@@ -2756,14 +2756,17 @@ def test_rfc2231_no_language_or_charset(self):
27562756
27572757
'''
27582758
msg = email.message_from_string(m)
2759-
self.assertEqual(msg.get_param('NAME'),
2760-
(None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm'))
2759+
param = msg.get_param('NAME')
2760+
self.failIf(isinstance(param, tuple))
2761+
self.assertEqual(
2762+
param,
2763+
'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')
27612764

27622765
def test_rfc2231_no_language_or_charset_in_filename(self):
27632766
m = '''\
27642767
Content-Disposition: inline;
2765-
\tfilename*0="This%20is%20even%20more%20";
2766-
\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
2768+
\tfilename*0*="This%20is%20even%20more%20";
2769+
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
27672770
\tfilename*2="is it not.pdf"
27682771
27692772
'''
@@ -2774,21 +2777,47 @@ def test_rfc2231_no_language_or_charset_in_filename(self):
27742777
def test_rfc2231_no_language_or_charset_in_boundary(self):
27752778
m = '''\
27762779
Content-Type: multipart/alternative;
2777-
\tboundary*0="This%20is%20even%20more%20";
2778-
\tboundary*1="%2A%2A%2Afun%2A%2A%2A%20";
2780+
\tboundary*0*="This%20is%20even%20more%20";
2781+
\tboundary*1*="%2A%2A%2Afun%2A%2A%2A%20";
27792782
\tboundary*2="is it not.pdf"
27802783
27812784
'''
27822785
msg = email.message_from_string(m)
27832786
self.assertEqual(msg.get_boundary(),
27842787
'This is even more ***fun*** is it not.pdf')
27852788

2789+
def test_rfc2231_partly_encoded(self):
2790+
m = '''\
2791+
Content-Disposition: inline;
2792+
\tfilename*0="''This%20is%20even%20more%20";
2793+
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
2794+
\tfilename*2="is it not.pdf"
2795+
2796+
'''
2797+
msg = email.message_from_string(m)
2798+
self.assertEqual(
2799+
msg.get_filename(),
2800+
'This%20is%20even%20more%20***fun*** is it not.pdf')
2801+
2802+
def test_rfc2231_partly_nonencoded(self):
2803+
m = '''\
2804+
Content-Disposition: inline;
2805+
\tfilename*0="This%20is%20even%20more%20";
2806+
\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
2807+
\tfilename*2="is it not.pdf"
2808+
2809+
'''
2810+
msg = email.message_from_string(m)
2811+
self.assertEqual(
2812+
msg.get_filename(),
2813+
'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20is it not.pdf')
2814+
27862815
def test_rfc2231_no_language_or_charset_in_charset(self):
27872816
# This is a nonsensical charset value, but tests the code anyway
27882817
m = '''\
27892818
Content-Type: text/plain;
2790-
\tcharset*0="This%20is%20even%20more%20";
2791-
\tcharset*1="%2A%2A%2Afun%2A%2A%2A%20";
2819+
\tcharset*0*="This%20is%20even%20more%20";
2820+
\tcharset*1*="%2A%2A%2Afun%2A%2A%2A%20";
27922821
\tcharset*2="is it not.pdf"
27932822
27942823
'''
@@ -2799,8 +2828,8 @@ def test_rfc2231_no_language_or_charset_in_charset(self):
27992828
def test_rfc2231_bad_encoding_in_filename(self):
28002829
m = '''\
28012830
Content-Disposition: inline;
2802-
\tfilename*0="bogus'xx'This%20is%20even%20more%20";
2803-
\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
2831+
\tfilename*0*="bogus'xx'This%20is%20even%20more%20";
2832+
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
28042833
\tfilename*2="is it not.pdf"
28052834
28062835
'''
@@ -2831,16 +2860,112 @@ def test_rfc2231_bad_character_in_charset(self):
28312860
def test_rfc2231_bad_character_in_filename(self):
28322861
m = '''\
28332862
Content-Disposition: inline;
2834-
\tfilename*0="ascii'xx'This%20is%20even%20more%20";
2835-
\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
2836-
\tfilename*2="is it not.pdf%E2"
2863+
\tfilename*0*="ascii'xx'This%20is%20even%20more%20";
2864+
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
2865+
\tfilename*2*="is it not.pdf%E2"
28372866
28382867
'''
28392868
msg = email.message_from_string(m)
28402869
self.assertEqual(msg.get_filename(),
28412870
'This is even more ***fun*** is it not.pdf\xe2')
28422871

28432872

2873+
def test_rfc2231_unknown_encoding(self):
2874+
m = """\
2875+
Content-Transfer-Encoding: 8bit
2876+
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
2877+
2878+
"""
2879+
msg = email.message_from_string(m)
2880+
self.assertEqual(msg.get_filename(), 'myfile.txt')
2881+
2882+
def test_rfc2231_single_tick_in_filename_extended(self):
2883+
eq = self.assertEqual
2884+
m = """\
2885+
Content-Type: application/x-foo;
2886+
\tname*0*=\"Frank's\"; name*1*=\" Document\"
2887+
2888+
"""
2889+
msg = email.message_from_string(m)
2890+
charset, language, s = msg.get_param('name')
2891+
eq(charset, None)
2892+
eq(language, None)
2893+
eq(s, "Frank's Document")
2894+
2895+
def test_rfc2231_single_tick_in_filename(self):
2896+
m = """\
2897+
Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
2898+
2899+
"""
2900+
msg = email.message_from_string(m)
2901+
param = msg.get_param('name')
2902+
self.failIf(isinstance(param, tuple))
2903+
self.assertEqual(param, "Frank's Document")
2904+
2905+
def test_rfc2231_tick_attack_extended(self):
2906+
eq = self.assertEqual
2907+
m = """\
2908+
Content-Type: application/x-foo;
2909+
\tname*0*=\"us-ascii'en-us'Frank's\"; name*1*=\" Document\"
2910+
2911+
"""
2912+
msg = email.message_from_string(m)
2913+
charset, language, s = msg.get_param('name')
2914+
eq(charset, 'us-ascii')
2915+
eq(language, 'en-us')
2916+
eq(s, "Frank's Document")
2917+
2918+
def test_rfc2231_tick_attack(self):
2919+
m = """\
2920+
Content-Type: application/x-foo;
2921+
\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\"
2922+
2923+
"""
2924+
msg = email.message_from_string(m)
2925+
param = msg.get_param('name')
2926+
self.failIf(isinstance(param, tuple))
2927+
self.assertEqual(param, "us-ascii'en-us'Frank's Document")
2928+
2929+
def test_rfc2231_no_extended_values(self):
2930+
eq = self.assertEqual
2931+
m = """\
2932+
Content-Type: application/x-foo; name=\"Frank's Document\"
2933+
2934+
"""
2935+
msg = email.message_from_string(m)
2936+
eq(msg.get_param('name'), "Frank's Document")
2937+
2938+
def test_rfc2231_encoded_then_unencoded_segments(self):
2939+
eq = self.assertEqual
2940+
m = """\
2941+
Content-Type: application/x-foo;
2942+
\tname*0*=\"us-ascii'en-us'My\";
2943+
\tname*1=\" Document\";
2944+
\tname*2*=\" For You\"
2945+
2946+
"""
2947+
msg = email.message_from_string(m)
2948+
charset, language, s = msg.get_param('name')
2949+
eq(charset, 'us-ascii')
2950+
eq(language, 'en-us')
2951+
eq(s, 'My Document For You')
2952+
2953+
def test_rfc2231_unencoded_then_encoded_segments(self):
2954+
eq = self.assertEqual
2955+
m = """\
2956+
Content-Type: application/x-foo;
2957+
\tname*0=\"us-ascii'en-us'My\";
2958+
\tname*1*=\" Document\";
2959+
\tname*2*=\" For You\"
2960+
2961+
"""
2962+
msg = email.message_from_string(m)
2963+
charset, language, s = msg.get_param('name')
2964+
eq(charset, 'us-ascii')
2965+
eq(language, 'en-us')
2966+
eq(s, 'My Document For You')
2967+
2968+
28442969

28452970
def _testclasses():
28462971
mod = sys.modules[__name__]

Misc/NEWS

+12
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,18 @@ Core and builtins
2828
Library
2929
-------
3030

31+
- The email package has improved RFC 2231 support, specifically for
32+
recognizing the difference between encoded (name*0*=<blah>) and non-encoded
33+
(name*0=<blah>) parameter continuations. This may change the types of
34+
values returned from email.message.Message.get_param() and friends.
35+
Specifically in some cases where non-encoded continuations were used,
36+
get_param() used to return a 3-tuple of (None, None, string) whereas now it
37+
will just return the string (since non-encoded continuations don't have
38+
charset and language parts).
39+
40+
Also, whereas % values were decoded in all parameter continuations, they are
41+
now only decoded in encoded parameter parts.
42+
3143
- Applied a security fix to SimpleXMLRPCserver (PSF-2005-001). This
3244
disables recursive traversal through instance attributes, which can
3345
be exploited in various ways.

0 commit comments

Comments
 (0)