From 63bde17aefd2537ee32919cb6291ad5a7dc3073f Mon Sep 17 00:00:00 2001
From: Xiami <i@f2light.com>
Date: Wed, 29 Aug 2018 18:29:12 +0800
Subject: [PATCH 1/2] bpo-34222, Lib/email: Fix infinite loop when folding

Currently, when folding a header whose length exceeds maxlen, _fold_as_ew
tries to split to_encode into multiple parts to satisfy the maxlen limit,
but does so in an inappropriate way.

If a long header contains non-ASCII characters, in some situations (e.g.
a Subject: header consisting entirely of CJK characters) to_encode is
split into ["", to_encode], which causes an infinite loop.

This commit fixes this by introducing a smarter way to split.
In addition, when a header needs to be folded, every line except the last
now tries its best to reach maxlen, using a binary search that needs only
O(log N) trial encodings.
Also, pass the previously missing charset= argument to _ew.encode.

The bug was introduced in commit 85d5c18c9d83a1d54eecc4c2ad4dce63194107c6
---
 Lib/email/_header_value_parser.py             | 23 ++++++++++++++-----
 .../test_email/test__header_value_parser.py   |  6 +++++
 Misc/ACKS                                     |  1 +
 .../2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst  |  1 +
 4 files changed, 25 insertions(+), 6 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index e805a75fbd93b9..7f06eb25445782 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -2726,12 +2726,23 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
             continue
         first_part = to_encode[:text_space]
         ew = _ew.encode(first_part, charset=encode_as)
-        excess = len(ew) - remaining_space
-        if excess > 0:
-            # encode always chooses the shortest encoding, so this
-            # is guaranteed to fit at this point.
-            first_part = first_part[:-excess]
-            ew = _ew.encode(first_part)
+        if len(ew) > remaining_space:
+            # Find the longest first_part that fits.  Since
+            # len(_ew.encode(to_encode[:x])) is a non-linear,
+            # monotonically increasing function of x, and computing the
+            # exact length would require knowing the internals of
+            # _ew.encode (which seems dirty), use binary search here.
+            part_len_l = 0
+            part_len_r = text_space
+            while part_len_l + 1 < part_len_r:
+                part_len_m = (part_len_l + part_len_r) // 2
+                ew = _ew.encode(first_part[:part_len_m], charset=encode_as)
+                if len(ew) <= remaining_space:
+                    part_len_l = part_len_m
+                else:
+                    part_len_r = part_len_m
+            first_part = to_encode[:part_len_l]
+            ew = _ew.encode(first_part, charset=encode_as)
         lines[-1] += ew
         to_encode = to_encode[len(first_part):]
         if to_encode:
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 676732bb3d0261..9ae39f83a7d5c1 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -2687,6 +2687,12 @@ def test_unstructured_with_unicode_no_folds(self):
         self._test(parser.get_unstructured("hübsch kleiner beißt"),
                    "=?utf-8?q?h=C3=BCbsch_kleiner_bei=C3=9Ft?=\n")
 
+    def test_unstructured_with_long_unicode_folded(self):
+        self._test(parser.get_unstructured("虾" * 40),
+                   "=?utf-8?b?" + "6Jm+" * 16 + "?=\n"
+                   " =?utf-8?b?" + "6Jm+" * 16 + "?=\n"
+                   " =?utf-8?b?" + "6Jm+" * 8 + "?=\n")
+
     def test_one_ew_on_each_of_two_wrapped_lines(self):
         self._test(parser.get_unstructured("Mein kleiner Kaktus ist sehr "
                                            "hübsch.  Es hat viele Stacheln "
diff --git a/Misc/ACKS b/Misc/ACKS
index 82fbc921feaa91..393de9c5df8aea 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1599,6 +1599,7 @@ Anish Tambe
 Musashi Tamura
 William Tanksley
 Christian Tanzer
+Pengyu Tao
 Steven Taschuk
 Amy Taylor
 Julian Taylor
diff --git a/Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst b/Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst
new file mode 100644
index 00000000000000..f11af7e0f42f65
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-08-30-11-11-25.bpo-34222.yA1Rn7.rst
@@ -0,0 +1 @@
+Fix infinite loop when folding non-ASCII email headers

From 064a5f264e070915d7c4143496155eb232bedddb Mon Sep 17 00:00:00 2001
From: Xiami <i@f2light.com>
Date: Wed, 29 Aug 2018 19:32:40 +0800
Subject: [PATCH 2/2] Lib/test/test_email: Fix tests

---
 Lib/test/test_email/test_headerregistry.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index 30ce0ba54e4728..d1007099f666c9 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1643,10 +1643,10 @@ def test_fold_overlong_words_using_RFC2047(self):
         self.assertEqual(
             h.fold(policy=policy.default),
             'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E'
-                'com/report=5F?=\n'
-            ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
-                'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n'
-            ' =?utf-8?q?xx-xx=3E?=\n')
+                'com/report=5Fabuse?=\n'
+            ' =?utf-8?q?=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
+                'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-xx-xx?=\n'
+            ' =?utf-8?q?=3E?=\n')
 
 
 if __name__ == '__main__':