From 212109443b620cf0f17e81430733a5951f78b964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 30 May 2025 20:06:24 +0200 Subject: [PATCH 1/3] fix quadratic worst-time complexity in `_header_value_parser.py` --- Lib/email/_header_value_parser.py | 95 +++++++++---------- .../test_email/test__header_value_parser.py | 10 ++ ...-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst | 2 + 3 files changed, 59 insertions(+), 48 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index f11fa83d45ed2d..3b01636bc27cbf 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -82,11 +82,13 @@ WSP = set(' \t') CFWS_LEADER = WSP | set('(') +CFWS_LEADER_WITH_DOT = CFWS_LEADER | set('.') SPECIALS = set(r'()<>@,:;.\"[]') ATOM_ENDS = SPECIALS | WSP DOT_ATOM_ENDS = ATOM_ENDS - set('.') # '.', '"', and '(' do not end phrases in order to support obs-phrase PHRASE_ENDS = SPECIALS - set('."(') +PHRASE_ENDS_CHARS = r''.join(PHRASE_ENDS) TSPECIALS = (SPECIALS | set('/?=')) - set('.') TOKEN_ENDS = TSPECIALS | WSP ASPECIALS = TSPECIALS | set("*'%") @@ -1300,6 +1302,12 @@ def get_cfws(value): cfws.append(token) return cfws, value +def get_cfws_digits(value, leader_set): + ind = 0 + while ind < len(value) and value[ind] not in leader_set: + ind += 1 + return value[:ind], value[ind:] + def get_quoted_string(value): """quoted-string = [CFWS] [CFWS] @@ -1443,11 +1451,13 @@ def get_phrase(value): phrase.defects.append(errors.InvalidHeaderDefect( "phrase does not start with word")) while value and value[0] not in PHRASE_ENDS: - if value[0]=='.': - phrase.append(DOT) - phrase.defects.append(errors.ObsoleteHeaderDefect( - "period in 'phrase'")) - value = value[1:] + if value[0] == '.': + tmpvalue = value.lstrip('.') + for _ in range(len(value) - len(tmpvalue)): + 
phrase.append(DOT) + phrase.defects.append(errors.ObsoleteHeaderDefect( + "period in 'phrase'")) + value = tmpvalue else: try: token, value = get_word(value) @@ -1461,6 +1471,20 @@ def get_phrase(value): phrase.append(token) return phrase, value +def _find_phrase(reslist, value, endchars): + # lstrip() should not strip stuff in 'endchars' + phrase_end_chars = ''.join(PHRASE_ENDS - set(endchars)) + while value and value[0] not in endchars: + if value[0] in PHRASE_ENDS: + tmpvalue = value.lstrip(phrase_end_chars) + for i in range(len(value) - len(tmpvalue)): + reslist.append(ValueTerminal(value[i], 'misplaced-special')) + value = tmpvalue + else: + token, value = get_phrase(value) + reslist.append(token) + return value + def get_local_part(value): """ local-part = dot-atom / quoted-string / obs-local-part @@ -1842,14 +1866,7 @@ def get_invalid_mailbox(value, endchars): """ invalid_mailbox = InvalidMailbox() - while value and value[0] not in endchars: - if value[0] in PHRASE_ENDS: - invalid_mailbox.append(ValueTerminal(value[0], - 'misplaced-special')) - value = value[1:] - else: - token, value = get_phrase(value) - invalid_mailbox.append(token) + value = _find_phrase(invalid_mailbox, value, endchars) return invalid_mailbox, value def get_mailbox_list(value): @@ -2196,10 +2213,7 @@ def parse_mime_version(value): if not value: mime_version.defects.append(errors.HeaderMissingRequiredValue( "Expected MIME version number but found only CFWS")) - digits = '' - while value and value[0] != '.' 
and value[0] not in CFWS_LEADER: - digits += value[0] - value = value[1:] + digits, value = get_cfws_digits(value, CFWS_LEADER_WITH_DOT) if not digits.isdigit(): mime_version.defects.append(errors.InvalidHeaderDefect( "Expected MIME major version number but found {!r}".format(digits))) @@ -2227,10 +2241,7 @@ def parse_mime_version(value): mime_version.defects.append(errors.InvalidHeaderDefect( "Incomplete MIME version; found only major number")) return mime_version - digits = '' - while value and value[0] not in CFWS_LEADER: - digits += value[0] - value = value[1:] + digits, value = get_cfws_digits(value, CFWS_LEADER) if not digits.isdigit(): mime_version.defects.append(errors.InvalidHeaderDefect( "Expected MIME minor version number but found {!r}".format(digits))) @@ -2255,14 +2266,7 @@ def get_invalid_parameter(value): """ invalid_parameter = InvalidParameter() - while value and value[0] != ';': - if value[0] in PHRASE_ENDS: - invalid_parameter.append(ValueTerminal(value[0], - 'misplaced-special')) - value = value[1:] - else: - token, value = get_phrase(value) - invalid_parameter.append(token) + value = _find_phrase(invalid_parameter, value, ';') return invalid_parameter, value def get_ttext(value): @@ -2407,10 +2411,8 @@ def get_section(value): if not value or not value[0].isdigit(): raise errors.HeaderParseError("Expected section number but " "found {}".format(value)) - digits = '' - while value and value[0].isdigit(): - digits += value[0] - value = value[1:] + ind = next((i for i, ch in enumerate(value) if not ch.isdigit()), len(value)) + digits, value = value[:ind], value[ind:] if digits[0] == '0' and digits != '0': section.defects.append(errors.InvalidHeaderDefect( "section number has an invalid leading 0")) @@ -2638,17 +2640,10 @@ def _find_mime_parameters(tokenlist, value): """Do our best to find the parameters in an invalid MIME header """ - while value and value[0] != ';': - if value[0] in PHRASE_ENDS: - tokenlist.append(ValueTerminal(value[0], 
'misplaced-special')) - value = value[1:] - else: - token, value = get_phrase(value) - tokenlist.append(token) - if not value: - return - tokenlist.append(ValueTerminal(';', 'parameter-separator')) - tokenlist.append(parse_mime_parameters(value[1:])) + value = _find_phrase(tokenlist, value, ';') + if value: + tokenlist.append(ValueTerminal(';', 'parameter-separator')) + tokenlist.append(parse_mime_parameters(value[1:])) def parse_content_type_header(value): """ maintype "/" subtype *( ";" parameter ) @@ -2757,12 +2752,16 @@ def parse_content_transfer_encoding_header(value): if not value: return cte_header while value: - cte_header.defects.append(errors.InvalidHeaderDefect( - "Extra text after content transfer encoding")) if value[0] in PHRASE_ENDS: - cte_header.append(ValueTerminal(value[0], 'misplaced-special')) - value = value[1:] + tmpvalue = value.lstrip(PHRASE_ENDS_CHARS) + for i in range(len(value) - len(tmpvalue)): + cte_header.defects.append(errors.InvalidHeaderDefect( + "Extra text after content transfer encoding")) + cte_header.append(ValueTerminal(value[i], 'misplaced-special')) + value = tmpvalue else: + cte_header.defects.append(errors.InvalidHeaderDefect( + "Extra text after content transfer encoding")) token, value = get_phrase(value) cte_header.append(token) return cte_header diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index fd4ac2c404ce47..3d91537191c69b 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2676,6 +2676,16 @@ def test_invalid_content_transfer_encoding(self): ";foo", ";foo", ";foo", [errors.InvalidHeaderDefect]*3 ) + def test_invalid_content_transfer_encoding_misplaced_special(self): + cte = parser.parse_content_transfer_encoding_header("foo;;;;;") + self.assertEqual(len(cte), 6) + self.assertEqual(cte[0].value, "foo") + self.assertEqual(cte[0].token_type, "token") + self.assertEqual(cte[0].value, 
"foo") + self.assertEqual(cte[0].token_type, "token") + terminal = parser.ValueTerminal(";", "misplaced-special") + self.assertEqual(cte[1:], [terminal] * 5) + # get_msg_id def test_get_msg_id_empty(self): diff --git a/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst b/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst new file mode 100644 index 00000000000000..f4371932aa04cf --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst @@ -0,0 +1,2 @@ +Fix various HTTP header value parsing routines with worst-time +quadratic-complexity. Patch by Bénédikt Tran. From af32b1bf64506b5406a6efd7541007807a270a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 31 May 2025 11:45:21 +0200 Subject: [PATCH 2/3] fix quadratic worst-time complexity in `_header_value_parser.py` --- Lib/email/_header_value_parser.py | 41 +++++++++++++++++++------------ 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 3b01636bc27cbf..235b180473b3b0 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -89,6 +89,7 @@ # '.', '"', and '(' do not end phrases in order to support obs-phrase PHRASE_ENDS = SPECIALS - set('."(') PHRASE_ENDS_CHARS = r''.join(PHRASE_ENDS) +PHRASE_ENDS_CHARS_NO_SEMICOLON = PHRASE_ENDS_CHARS.replace(';', '') TSPECIALS = (SPECIALS | set('/?=')) - set('.') TOKEN_ENDS = TSPECIALS | WSP ASPECIALS = TSPECIALS | set("*'%") @@ -1453,10 +1454,11 @@ def get_phrase(value): while value and value[0] not in PHRASE_ENDS: if value[0] == '.': tmpvalue = value.lstrip('.') - for _ in range(len(value) - len(tmpvalue)): - phrase.append(DOT) - phrase.defects.append(errors.ObsoleteHeaderDefect( - "period in 'phrase'")) + n = len(value) - len(tmpvalue) + phrase.extend(DOT for _ in range(n)) + phrase.defects.extend( + 
errors.ObsoleteHeaderDefect("period in 'phrase'") + for _ in range(n)) value = tmpvalue else: try: @@ -1471,14 +1473,13 @@ def get_phrase(value): phrase.append(token) return phrase, value -def _find_phrase(reslist, value, endchars): - # lstrip() should not strip stuff in 'endchars' - phrase_end_chars = ''.join(PHRASE_ENDS - set(endchars)) +def _find_phrase(reslist, value, phrase_ends, phrase_end_chars, endchars): while value and value[0] not in endchars: - if value[0] in PHRASE_ENDS: + if value[0] in phrase_ends: tmpvalue = value.lstrip(phrase_end_chars) - for i in range(len(value) - len(tmpvalue)): - reslist.append(ValueTerminal(value[i], 'misplaced-special')) + reslist.extend( + ValueTerminal(value[i], 'misplaced-special') + for i in range(len(value) - len(tmpvalue))) value = tmpvalue else: token, value = get_phrase(value) @@ -1866,7 +1867,10 @@ def get_invalid_mailbox(value, endchars): """ invalid_mailbox = InvalidMailbox() - value = _find_phrase(invalid_mailbox, value, endchars) + # lstrip() should not strip stuff in 'endchars' + phrase_end_chars = ''.join(PHRASE_ENDS - set(endchars)) + value = _find_phrase(invalid_mailbox, value, + PHRASE_ENDS, phrase_end_chars, endchars) return invalid_mailbox, value def get_mailbox_list(value): @@ -2266,7 +2270,8 @@ def get_invalid_parameter(value): """ invalid_parameter = InvalidParameter() - value = _find_phrase(invalid_parameter, value, ';') + value = _find_phrase(invalid_parameter, value, + PHRASE_ENDS, PHRASE_ENDS_CHARS_NO_SEMICOLON, ';') return invalid_parameter, value def get_ttext(value): @@ -2569,12 +2574,15 @@ def get_parameter(value): while value: if value[0] in WSP: token, value = get_fws(value) + v.append(token) elif value[0] == '"': - token = ValueTerminal('"', 'DQUOTE') - value = value[1:] + tmpvalue = value.lstrip('"') + n = len(value) - len(tmpvalue) + v.extend((ValueTerminal('"', 'DQUOTE') for _ in range(n))) + value = tmpvalue else: token, value = get_qcontent(value) - v.append(token) + v.append(token) 
token = v else: token, value = get_value(value) @@ -2640,7 +2648,8 @@ def _find_mime_parameters(tokenlist, value): """Do our best to find the parameters in an invalid MIME header """ - value = _find_phrase(tokenlist, value, ';') + value = _find_phrase(tokenlist, value, + PHRASE_ENDS, PHRASE_ENDS_CHARS_NO_SEMICOLON, ';') if value: tokenlist.append(ValueTerminal(';', 'parameter-separator')) tokenlist.append(parse_mime_parameters(value[1:])) From 7bf0e7a175a097560f2b4265075ffc29e54a11bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 2 Jun 2025 10:27:16 +0200 Subject: [PATCH 3/3] fixup --- .../Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst b/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst index f4371932aa04cf..93389a64ee2ead 100644 --- a/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst +++ b/Misc/NEWS.d/next/Security/2025-05-30-20-08-38.gh-issue-134873.6Z5xUC.rst @@ -1,2 +1,2 @@ -Fix various HTTP header value parsing routines with worst-time -quadratic-complexity. Patch by Bénédikt Tran. +Fix various email header value parsing routines with worst-case +quadratic time complexity. Patch by Bénédikt Tran.