From 63bde17aefd2537ee32919cb6291ad5a7dc3073f Mon Sep 17 00:00:00 2001 From: Xiami Date: Wed, 29 Aug 2018 18:29:12 +0800 Subject: [PATCH 1/2] bpo-34222, Lib/email: Fix infinite loop when folding Currently when folding headers with length > maxlen, _fold_as_ew tries to split the to_encode into multiple parts to fulfill the maxlen limit, in an inappropriate way. If a long header has non-ASCII characters, in some situations (e.g. a Subject: full of CJK chars), it will split the to_encode into ["", to_encode], entering an infinite loop. This commit fixes this by introducing a smarter way to split. Besides, when a header needs to be folded, every non-last line will try its best to reach the maxlen, in O(log N) time. Also, apply missing charset= parameter for _ew.encode. The bug was introduced in commit 85d5c18c9d83a1d54eecc4c2ad4dce63194107c6 --- Lib/email/_header_value_parser.py | 23 ++++++++++++++----- .../test_email/test__header_value_parser.py | 6 +++++ Misc/ACKS | 1 + .../2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst | 1 + 4 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index e805a75fbd93b9..7f06eb25445782 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2726,12 +2726,23 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): continue first_part = to_encode[:text_space] ew = _ew.encode(first_part, charset=encode_as) - excess = len(ew) - remaining_space - if excess > 0: - # encode always chooses the shortest encoding, so this - # is guaranteed to fit at this point. 
- first_part = first_part[:-excess] - ew = _ew.encode(first_part) + if len(ew) > remaining_space: + # Find the longest first_part + # since len(_ew.encode(to_encode[:x])) is a non-linear + # monotonically increasing function, and calculating the + # exactly length requires knowing the internal of _ew.encode + # which seems dirty, use binary search here. + part_len_l = 0 + part_len_r = text_space + while part_len_l + 1 < part_len_r: + part_len_m = (part_len_l + part_len_r) // 2 + ew = _ew.encode(first_part[:part_len_m], charset=encode_as) + if len(ew) <= remaining_space: + part_len_l = part_len_m + else: + part_len_r = part_len_m + first_part = to_encode[:part_len_l] + ew = _ew.encode(first_part, charset=encode_as) lines[-1] += ew to_encode = to_encode[len(first_part):] if to_encode: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 676732bb3d0261..9ae39f83a7d5c1 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2687,6 +2687,12 @@ def test_unstructured_with_unicode_no_folds(self): self._test(parser.get_unstructured("hübsch kleiner beißt"), "=?utf-8?q?h=C3=BCbsch_kleiner_bei=C3=9Ft?=\n") + def test_unstructured_with_long_unicode_folded(self): + self._test(parser.get_unstructured("虾" * 40), + "=?utf-8?b?" + "6Jm+" * 16 + "?=\n" + " =?utf-8?b?" + "6Jm+" * 16 + "?=\n" + " =?utf-8?b?" + "6Jm+" * 8 + "?=\n") + def test_one_ew_on_each_of_two_wrapped_lines(self): self._test(parser.get_unstructured("Mein kleiner Kaktus ist sehr " "hübsch. 
Es hat viele Stacheln " diff --git a/Misc/ACKS b/Misc/ACKS index 82fbc921feaa91..393de9c5df8aea 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1599,6 +1599,7 @@ Anish Tambe Musashi Tamura William Tanksley Christian Tanzer +Pengyu Tao Steven Taschuk Amy Taylor Julian Taylor diff --git a/Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst b/Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst new file mode 100644 index 00000000000000..f11af7e0f42f65 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst @@ -0,0 +1 @@ +Fix infinite loop when folding non-ASCII email headers From 064a5f264e070915d7c4143496155eb232bedddb Mon Sep 17 00:00:00 2001 From: Xiami Date: Wed, 29 Aug 2018 19:32:40 +0800 Subject: [PATCH 2/2] Lib/test/test_email: Fix tests --- Lib/test/test_email/test_headerregistry.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 30ce0ba54e4728..d1007099f666c9 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -1643,10 +1643,10 @@ def test_fold_overlong_words_using_RFC2047(self): self.assertEqual( h.fold(policy=policy.default), 'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E' - 'com/report=5F?=\n' - ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx' - 'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n' - ' =?utf-8?q?xx-xx=3E?=\n') + 'com/report=5Fabuse?=\n' + ' =?utf-8?q?=2Ephp=3Fmid=3Dxxx-xxx-xxxx' + 'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-xx-xx?=\n' + ' =?utf-8?q?=3E?=\n') if __name__ == '__main__':