From b265321c0ccecf266b81a35b5f8c213c78f99ea2 Mon Sep 17 00:00:00 2001 From: Abhilash Raj Date: Thu, 28 May 2020 17:04:59 -0700 Subject: [PATCH] bpo-39040: Fix parsing of email mime headers with whitespace between encoded-words. (gh-17620) * bpo-39040: Fix parsing of email headers with encoded-words inside a quoted string. It is fairly common to find malformed mime headers (especially content-disposition headers) where the parameter values, instead of being encoded to RFC standards, are "encoded" by doing RFC 2047 "encoded word" encoding, and then enclosing the whole thing in quotes. The processing of these malformed headers was incorrectly leaving the spaces between encoded words in the decoded text (whitespace between adjacent encoded words is supposed to be stripped on decoding). This changeset fixes the encoded word processing inside quoted strings (bare-quoted-string) to do correct RFC 2047 decoding by stripping that whitespace. (cherry picked from commit 21017ed904f734be9f195ae1274eb81426a9e776) Co-authored-by: Abhilash Raj --- Lib/email/_header_value_parser.py | 9 +++++++++ Lib/test/test_email/test_headerregistry.py | 19 +++++++++++++++++++ .../2019-12-15-18-47-20.bpo-39040.tKa0Qs.rst | 2 ++ 3 files changed, 30 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2019-12-15-18-47-20.bpo-39040.tKa0Qs.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 9c55ef7fb453be..51d355fbb0abc5 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1218,12 +1218,21 @@ def get_bare_quoted_string(value): if value[0] in WSP: token, value = get_fws(value) elif value[:2] == '=?': + valid_ew = False try: token, value = get_encoded_word(value) bare_quoted_string.defects.append(errors.InvalidHeaderDefect( "encoded word inside quoted string")) + valid_ew = True except errors.HeaderParseError: token, value = get_qcontent(value) + # Collapse the whitespace between two encoded words that occur in a + # 
bare-quoted-string. + if valid_ew and len(bare_quoted_string) > 1: + if (bare_quoted_string[-1].token_type == 'fws' and + bare_quoted_string[-2].token_type == 'encoded-word'): + bare_quoted_string[-1] = EWWhiteSpaceTerminal( + bare_quoted_string[-1], 'fws') else: token, value = get_qcontent(value) bare_quoted_string.append(token) diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index a43d51f730ad51..7ade9684465d50 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -872,6 +872,25 @@ def content_disp_as_value(self, {'filename': 'foo'}, [errors.InvalidHeaderDefect]), + 'invalid_parameter_value_with_fws_between_ew': ( + 'attachment; filename="=?UTF-8?Q?Schulbesuchsbest=C3=A4ttigung=2E?=' + ' =?UTF-8?Q?pdf?="', + 'attachment', + {'filename': 'Schulbesuchsbestättigung.pdf'}, + [errors.InvalidHeaderDefect]*3, + ('attachment; filename="Schulbesuchsbestättigung.pdf"'), + ('Content-Disposition: attachment;\n' + ' filename*=utf-8\'\'Schulbesuchsbest%C3%A4ttigung.pdf\n'), + ), + + 'parameter_value_with_fws_between_tokens': ( + 'attachment; filename="File =?utf-8?q?Name?= With Spaces.pdf"', + 'attachment', + {'filename': 'File Name With Spaces.pdf'}, + [errors.InvalidHeaderDefect], + 'attachment; filename="File Name With Spaces.pdf"', + ('Content-Disposition: attachment; filename="File Name With Spaces.pdf"\n'), + ) } diff --git a/Misc/NEWS.d/next/Library/2019-12-15-18-47-20.bpo-39040.tKa0Qs.rst b/Misc/NEWS.d/next/Library/2019-12-15-18-47-20.bpo-39040.tKa0Qs.rst new file mode 100644 index 00000000000000..078bce22be30f0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-12-15-18-47-20.bpo-39040.tKa0Qs.rst @@ -0,0 +1,2 @@ +Fix parsing of invalid MIME header parameters by collapsing whitespace between +encoded words in a bare-quoted-string.