From 3bc8212ad08e59ed5350fa25f1a0f5b648b0c49c Mon Sep 17 00:00:00 2001 From: Gustaf <79180496+GGyll@users.noreply.github.com> Date: Tue, 20 May 2025 20:54:38 +0200 Subject: [PATCH 1/5] email: Correctly decode using policy UTF-8 attribute in BytesParser --- Lib/email/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/email/parser.py b/Lib/email/parser.py index 039f03cba74fa0..3231e16f6de40e 100644 --- a/Lib/email/parser.py +++ b/Lib/email/parser.py @@ -100,7 +100,8 @@ def parse(self, fp, headersonly=False): parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ - fp = TextIOWrapper(fp, encoding='ascii', errors='surrogateescape') + encoding = "utf-8" if getattr(self.parser.policy, "utf8", False) else "ascii" + fp = TextIOWrapper(fp, encoding=encoding, errors='surrogateescape') try: return self.parser.parse(fp, headersonly) finally: From 29f3802d202aa3a2d2b7c8243de554413657a20e Mon Sep 17 00:00:00 2001 From: Gustaf <79180496+GGyll@users.noreply.github.com> Date: Tue, 20 May 2025 20:57:08 +0200 Subject: [PATCH 2/5] Added test for message_from_binary_file using utf-8 --- Lib/test/test_email/test_email.py | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 7b14305f997e5d..17ea125c3c55db 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3989,6 +3989,38 @@ def test_bytes_parser_on_exception_does_not_close_file(self): bytesParser(policy=email.policy.strict).parse, fp) self.assertFalse(fp.closed) + + def test_bytes_parser_uses_policy_utf8_setting(self): + m = """ + From: Nathaniel Nameson + To: Ned Sampleson + Subject: Sample message + MIME-Version: 1.0 + Content-type: multipart/mixed; boundary="i-am-boundary" + + This is the préamble. 
It is to be ignored, though it + is a handy place for mail composers to include an + explanatory note to non-MIME compliant readers. + + --i-am-boundary + Content-type: text/plain; charset=us-ascii + + This is explicitly typed plain ASCII text. + It DOES end with a linebreak. + + --i-am-boundary + Content-type: text/plain; charset=utf-8 + Content-Transfer-Encoding: 8bit + + This should be correctly encapsulated: Un petit café ? + + --i-am-boundary-- + This is the epilogue. It is also to be ignored. + + """.lstrip() + M_BYTES = BytesIO(m.encode()) + msg = email.message_from_binary_file(M_BYTES, policy=email.policy.default.clone(utf8=True)) + self.assertEqual(msg.as_string(), m) def test_parser_does_not_close_file(self): with openfile('msg_02.txt', encoding="utf-8") as fp: From 64d2bd39eaa7dea5e2187ba07c568c7508ed532a Mon Sep 17 00:00:00 2001 From: Gustaf <79180496+GGyll@users.noreply.github.com> Date: Tue, 20 May 2025 21:32:25 +0200 Subject: [PATCH 3/5] Update test_bytes_parser_uses_policy_utf8_setting test Now asserts that the UTF-8 encoded data is present in the final message --- Lib/test/test_email/test_email.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 17ea125c3c55db..c98a53a3be43b8 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4019,8 +4019,14 @@ def test_bytes_parser_uses_policy_utf8_setting(self): """.lstrip() M_BYTES = BytesIO(m.encode()) + msg = email.message_from_binary_file(M_BYTES, policy=email.policy.default.clone(utf8=True)) - self.assertEqual(msg.as_string(), m) + for i, part in enumerate(msg.iter_parts(), 1): + _ = part.as_string() + + msg_string = msg.as_string() + self.assertIn("This is the préamble.", msg_string) + self.assertIn("Un petit café", msg_string) def test_parser_does_not_close_file(self): with openfile('msg_02.txt', encoding="utf-8") as fp: From 4bbea35ab353a46af84f6e204e31fe0115ce9b10 
Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 20 May 2025 19:37:10 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2025-05-20-19-37-09.gh-issue-118718.dyhtAS.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-05-20-19-37-09.gh-issue-118718.dyhtAS.rst diff --git a/Misc/NEWS.d/next/Library/2025-05-20-19-37-09.gh-issue-118718.dyhtAS.rst b/Misc/NEWS.d/next/Library/2025-05-20-19-37-09.gh-issue-118718.dyhtAS.rst new file mode 100644 index 00000000000000..3ce8eda70c7687 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-20-19-37-09.gh-issue-118718.dyhtAS.rst @@ -0,0 +1,2 @@ +Fix incorrect decoding of preamble in BytesParser. +Contributed by Gustaf Gyllensporre. From bfcef667a9196c88bc5c210a8fee03d7d3b418bd Mon Sep 17 00:00:00 2001 From: Gustaf <79180496+GGyll@users.noreply.github.com> Date: Wed, 21 May 2025 18:00:08 +0200 Subject: [PATCH 5/5] removed extra whitespace in test_email.py --- Lib/test/test_email/test_email.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index c98a53a3be43b8..f546b24c7cf8db 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3989,7 +3989,7 @@ def test_bytes_parser_on_exception_does_not_close_file(self): bytesParser(policy=email.policy.strict).parse, fp) self.assertFalse(fp.closed) - + def test_bytes_parser_uses_policy_utf8_setting(self): m = """ From: Nathaniel Nameson