Skip to content

Commit 054f1af

Browse files
[3.13] gh-92081: Fix for email.generator.Generator with whitespace between encoded words. (GH-92281) (#119245)
* Fix for email.generator.Generator with whitespace between encoded words. email.generator.Generator currently does not handle whitespace between encoded words correctly when the encoded words span multiple lines. The current generator will create an encoded word for each line. If the end of the line happens to correspond with the end of a real word in the plaintext, the generator will place an unencoded space at the start of the subsequent lines to represent the whitespace between the plaintext words. A compliant decoder will strip all the whitespace from between two encoded words which leads to missing spaces in the round-tripped output. The fix for this is to make sure that whitespace between two encoded words ends up inside of one or the other of the encoded words. This fix places the space inside of the second encoded word. A second problem happens with continuation lines. A continuation line that starts with whitespace and is followed by a non-encoded word is fine because the newline between such continuation lines is defined as condensing to a single space character. When the continuation line starts with whitespace followed by an encoded word, however, the RFCs specify that the word is run together with the encoded word on the previous line. This is because normal words are folded on syntactic breaks but encoded words are not. The solution to this is to add the whitespace to the start of the encoded word on the continuation line. Test cases are from GH-92081. * Rename a variable so it's not confused with the final variable. (cherry picked from commit a6fdb31) Co-authored-by: Toshio Kuratomi <a.badger@gmail.com>
1 parent d8c562a commit 054f1af

File tree

4 files changed

+79
-8
lines changed

4 files changed

+79
-8
lines changed

Lib/email/_header_value_parser.py

+41-7
Original file line numberDiff line numberDiff line change
@@ -2784,11 +2784,15 @@ def _refold_parse_tree(parse_tree, *, policy):
27842784
# max_line_length 0/None means no limit, ie: infinitely long.
27852785
maxlen = policy.max_line_length or sys.maxsize
27862786
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
2787-
lines = ['']
2788-
last_ew = None
2787+
lines = [''] # Folded lines to be output
2788+
leading_whitespace = '' # When we have whitespace between two encoded
2789+
# words, we may need to encode the whitespace
2790+
# at the beginning of the second word.
2791+
last_ew = None # Points to the last encoded character if there's an ew on
2792+
# the line
27892793
last_charset = None
27902794
wrap_as_ew_blocked = 0
2791-
want_encoding = False
2795+
want_encoding = False # This is set to True if we need to encode this part
27922796
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
27932797
parts = list(parse_tree)
27942798
while parts:
@@ -2812,10 +2816,12 @@ def _refold_parse_tree(parse_tree, *, policy):
28122816
# 'charset' property on the policy.
28132817
charset = 'utf-8'
28142818
want_encoding = True
2819+
28152820
if part.token_type == 'mime-parameters':
28162821
# Mime parameter folding (using RFC2231) is extra special.
28172822
_fold_mime_parameters(part, lines, maxlen, encoding)
28182823
continue
2824+
28192825
if want_encoding and not wrap_as_ew_blocked:
28202826
if not part.as_ew_allowed:
28212827
want_encoding = False
@@ -2847,21 +2853,38 @@ def _refold_parse_tree(parse_tree, *, policy):
28472853
last_charset == 'utf-8' and charset != 'us-ascii')):
28482854
last_ew = None
28492855
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2850-
part.ew_combine_allowed, charset)
2856+
part.ew_combine_allowed, charset, leading_whitespace)
2857+
# This whitespace has been added to the lines in _fold_as_ew()
2858+
# so clear it now.
2859+
leading_whitespace = ''
28512860
last_charset = charset
28522861
want_encoding = False
28532862
continue
2863+
28542864
if len(tstr) <= maxlen - len(lines[-1]):
28552865
lines[-1] += tstr
28562866
continue
2867+
28572868
# This part is too long to fit. The RFC wants us to break at
28582869
# "major syntactic breaks", so unless we don't consider this
28592870
# to be one, check if it will fit on the next line by itself.
2871+
leading_whitespace = ''
28602872
if (part.syntactic_break and
28612873
len(tstr) + 1 <= maxlen):
28622874
newline = _steal_trailing_WSP_if_exists(lines)
28632875
if newline or part.startswith_fws():
2876+
# We're going to fold the data onto a new line here. Due to
2877+
# the way encoded strings handle continuation lines, we need to
2878+
# be prepared to encode any whitespace if the next line turns
2879+
# out to start with an encoded word.
28642880
lines.append(newline + tstr)
2881+
2882+
whitespace_accumulator = []
2883+
for char in lines[-1]:
2884+
if char not in WSP:
2885+
break
2886+
whitespace_accumulator.append(char)
2887+
leading_whitespace = ''.join(whitespace_accumulator)
28652888
last_ew = None
28662889
continue
28672890
if not hasattr(part, 'encode'):
@@ -2885,9 +2908,10 @@ def _refold_parse_tree(parse_tree, *, policy):
28852908
else:
28862909
# We can't fold it onto the next line either...
28872910
lines[-1] += tstr
2911+
28882912
return policy.linesep.join(lines) + policy.linesep
28892913

2890-
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
2914+
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
28912915
"""Fold string to_encode into lines as encoded word, combining if allowed.
28922916
Return the new value for last_ew, or None if ew_combine_allowed is False.
28932917
@@ -2902,14 +2926,15 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
29022926
to_encode = str(
29032927
get_unstructured(lines[-1][last_ew:] + to_encode))
29042928
lines[-1] = lines[-1][:last_ew]
2905-
if to_encode[0] in WSP:
2929+
elif to_encode[0] in WSP:
29062930
# We're joining this to non-encoded text, so don't encode
29072931
# the leading blank.
29082932
leading_wsp = to_encode[0]
29092933
to_encode = to_encode[1:]
29102934
if (len(lines[-1]) == maxlen):
29112935
lines.append(_steal_trailing_WSP_if_exists(lines))
29122936
lines[-1] += leading_wsp
2937+
29132938
trailing_wsp = ''
29142939
if to_encode[-1] in WSP:
29152940
# Likewise for the trailing space.
@@ -2929,11 +2954,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
29292954

29302955
while to_encode:
29312956
remaining_space = maxlen - len(lines[-1])
2932-
text_space = remaining_space - chrome_len
2957+
text_space = remaining_space - chrome_len - len(leading_whitespace)
29332958
if text_space <= 0:
29342959
lines.append(' ')
29352960
continue
29362961

2962+
# If we are at the start of a continuation line, prepend whitespace
2963+
# (we only want to do this when the line starts with an encoded word
2964+
# but if we're folding in this helper function, then we know that we
2965+
# are going to be writing out an encoded word.)
2966+
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
2967+
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
2968+
lines[-1] += encoded_word
2969+
leading_whitespace = ''
2970+
29372971
to_encode_word = to_encode[:text_space]
29382972
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
29392973
excess = len(encoded_word) - remaining_space

Lib/test/test_email/test_generator.py

+35
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
281281
ioclass = io.BytesIO
282282
typ = lambda self, x: x.encode('ascii')
283283

284+
def test_defaults_handle_spaces_between_encoded_words_when_folded(self):
285+
source = ("Уведомление о принятии в работу обращения для"
286+
" подключения услуги")
287+
expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n'
288+
' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n'
289+
' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii')
290+
msg = EmailMessage()
291+
msg['Subject'] = source
292+
s = io.BytesIO()
293+
g = BytesGenerator(s)
294+
g.flatten(msg)
295+
self.assertEqual(s.getvalue(), expected)
296+
297+
def test_defaults_handle_spaces_at_start_of_subject(self):
298+
source = " Уведомление"
299+
expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n"
300+
msg = EmailMessage()
301+
msg['Subject'] = source
302+
s = io.BytesIO()
303+
g = BytesGenerator(s)
304+
g.flatten(msg)
305+
self.assertEqual(s.getvalue(), expected)
306+
307+
def test_defaults_handle_spaces_at_start_of_continuation_line(self):
308+
source = " ф ффффффффффффффффффф ф ф"
309+
expected = (b"Subject: "
310+
b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n"
311+
b" =?utf-8?b?INGEINGE?=\n\n")
312+
msg = EmailMessage()
313+
msg['Subject'] = source
314+
s = io.BytesIO()
315+
g = BytesGenerator(s)
316+
g.flatten(msg)
317+
self.assertEqual(s.getvalue(), expected)
318+
284319
def test_cte_type_7bit_handles_unknown_8bit(self):
285320
source = ("Subject: Maintenant je vous présente mon "
286321
"collègue\n\n").encode('utf-8')

Lib/test/test_email/test_headerregistry.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from test.test_email import TestEmailBase, parameterize
88
from email import headerregistry
99
from email.headerregistry import Address, Group
10+
from email.header import decode_header
1011
from test.support import ALWAYS_EQ
1112

1213

@@ -1648,7 +1649,7 @@ def test_address_display_names(self):
16481649
'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. '
16491650
'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.',
16501651
'=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c'
1651-
'=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse'
1652+
'=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse'
16521653
'_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8'
16531654
'?q?p=C3=B4tenti=2E?=',
16541655
),
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix missing spaces in email headers when the spaces are mixed with encoded 8-bit characters.

0 commit comments

Comments
 (0)