Skip to content

bpo-35805: Add parser for Message-ID email header. #13397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions Doc/library/email.headerregistry.rst
Original file line number Diff line number Diff line change
Expand Up @@ -321,19 +321,26 @@ variant, :attr:`~.BaseHeader.max_count` is set to 1.

The default mappings are:

:subject: UniqueUnstructuredHeader
:date: UniqueDateHeader
:resent-date: DateHeader
:orig-date: UniqueDateHeader
:sender: UniqueSingleAddressHeader
:resent-sender: SingleAddressHeader
:to: UniqueAddressHeader
:resent-to: AddressHeader
:cc: UniqueAddressHeader
:resent-cc: AddressHeader
:from: UniqueAddressHeader
:resent-from: AddressHeader
:reply-to: UniqueAddressHeader
:subject: UniqueUnstructuredHeader
:date: UniqueDateHeader
:resent-date: DateHeader
:orig-date: UniqueDateHeader
:sender: UniqueSingleAddressHeader
:resent-sender: SingleAddressHeader
:to: UniqueAddressHeader
:resent-to: AddressHeader
:cc: UniqueAddressHeader
:resent-cc: AddressHeader
:bcc: UniqueAddressHeader
:resent-bcc: AddressHeader
:from: UniqueAddressHeader
:resent-from: AddressHeader
:reply-to: UniqueAddressHeader
:mime-version: MIMEVersionHeader
:content-type: ContentTypeHeader
:content-disposition: ContentDispositionHeader
:content-transfer-encoding: ContentTransferEncodingHeader
:message-id: MessageIDHeader

``HeaderRegistry`` has the following methods:

Expand Down
137 changes: 122 additions & 15 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,37 +179,30 @@ def comments(self):


class UnstructuredTokenList(TokenList):

token_type = 'unstructured'


class Phrase(TokenList):

token_type = 'phrase'

class Word(TokenList):

token_type = 'word'


class CFWSList(WhiteSpaceTokenList):

token_type = 'cfws'


class Atom(TokenList):

token_type = 'atom'


class Token(TokenList):

token_type = 'token'
encode_as_ew = False


class EncodedWord(TokenList):

token_type = 'encoded-word'
cte = None
charset = None
Expand Down Expand Up @@ -496,16 +489,19 @@ def domain(self):


class DotAtom(TokenList):

token_type = 'dot-atom'


class DotAtomText(TokenList):

token_type = 'dot-atom-text'
as_ew_allowed = True


class NoFoldLiteral(TokenList):
token_type = 'no-fold-literal'
as_ew_allowed = False


class AddrSpec(TokenList):

token_type = 'addr-spec'
Expand Down Expand Up @@ -809,35 +805,42 @@ def params(self):


class ContentType(ParameterizedHeaderValue):

token_type = 'content-type'
as_ew_allowed = False
maintype = 'text'
subtype = 'plain'


class ContentDisposition(ParameterizedHeaderValue):

token_type = 'content-disposition'
as_ew_allowed = False
content_disposition = None


class ContentTransferEncoding(TokenList):

token_type = 'content-transfer-encoding'
as_ew_allowed = False
cte = '7bit'


class HeaderLabel(TokenList):

token_type = 'header-label'
as_ew_allowed = False


class Header(TokenList):
class MsgID(TokenList):
token_type = 'msg-id'
as_ew_allowed = False

def fold(self, policy):
# message-id tokens may not be folded.
return str(self) + policy.linesep

class MessageID(MsgID):
token_type = 'message-id'


class Header(TokenList):
token_type = 'header'


Expand Down Expand Up @@ -1583,7 +1586,7 @@ def get_addr_spec(value):
addr_spec.append(token)
if not value or value[0] != '@':
addr_spec.defects.append(errors.InvalidHeaderDefect(
"add-spec local part with no domain"))
"addr-spec local part with no domain"))
return addr_spec, value
addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
token, value = get_domain(value[1:])
Expand Down Expand Up @@ -1968,6 +1971,110 @@ def get_address_list(value):
value = value[1:]
return address_list, value


def get_no_fold_literal(value):
""" no-fold-literal = "[" *dtext "]"
"""
no_fold_literal = NoFoldLiteral()
if not value:
raise errors.HeaderParseError(
"expected no-fold-literal but found '{}'".format(value))
if value[0] != '[':
raise errors.HeaderParseError(
"expected '[' at the start of no-fold-literal "
"but found '{}'".format(value))
no_fold_literal.append(ValueTerminal('[', 'no-fold-literal-start'))
value = value[1:]
token, value = get_dtext(value)
no_fold_literal.append(token)
if not value or value[0] != ']':
raise errors.HeaderParseError(
"expected ']' at the end of no-fold-literal "
"but found '{}'".format(value))
no_fold_literal.append(ValueTerminal(']', 'no-fold-literal-end'))
return no_fold_literal, value[1:]

def get_msg_id(value):
"""msg-id = [CFWS] "<" id-left '@' id-right ">" [CFWS]
id-left = dot-atom-text / obs-id-left
id-right = dot-atom-text / no-fold-literal / obs-id-right
no-fold-literal = "[" *dtext "]"
"""
msg_id = MsgID()
if value[0] in CFWS_LEADER:
token, value = get_cfws(value)
msg_id.append(token)
if not value or value[0] != '<':
raise errors.HeaderParseError(
"expected msg-id but found '{}'".format(value))
msg_id.append(ValueTerminal('<', 'msg-id-start'))
value = value[1:]
# Parse id-left.
try:
token, value = get_dot_atom_text(value)
except errors.HeaderParseError:
try:
# obs-id-left is same as local-part of add-spec.
token, value = get_obs_local_part(value)
msg_id.defects.append(errors.ObsoleteHeaderDefect(
"obsolete id-left in msg-id"))
except errors.HeaderParseError:
raise errors.HeaderParseError(
"expected dot-atom-text or obs-id-left"
" but found '{}'".format(value))
msg_id.append(token)
if not value or value[0] != '@':
msg_id.defects.append(errors.InvalidHeaderDefect(
"msg-id with no id-right"))
# Even though there is no id-right, if the local part
# ends with `>` let's just parse it too and return
# along with the defect.
if value and value[0] == '>':
msg_id.append(ValueTerminal('>', 'msg-id-end'))
value = value[1:]
return msg_id, value
msg_id.append(ValueTerminal('@', 'address-at-symbol'))
value = value[1:]
# Parse id-right.
try:
token, value = get_dot_atom_text(value)
except errors.HeaderParseError:
try:
token, value = get_no_fold_literal(value)
except errors.HeaderParseError as e:
try:
token, value = get_domain(value)
msg_id.defects.append(errors.ObsoleteHeaderDefect(
"obsolete id-right in msg-id"))
except errors.HeaderParseError:
raise errors.HeaderParseError(
"expected dot-atom-text, no-fold-literal or obs-id-right"
" but found '{}'".format(value))
msg_id.append(token)
if value and value[0] == '>':
value = value[1:]
else:
msg_id.defects.append(errors.InvalidHeaderDefect(
"missing trailing '>' on msg-id"))
msg_id.append(ValueTerminal('>', 'msg-id-end'))
if value and value[0] in CFWS_LEADER:
token, value = get_cfws(value)
msg_id.append(token)
return msg_id, value


def parse_message_id(value):
"""message-id = "Message-ID:" msg-id CRLF
"""
message_id = MessageID()
try:
token, value = get_msg_id(value)
except errors.HeaderParseError:
message_id.defects.append(errors.InvalidHeaderDefect(
"Expected msg-id but found {!r}".format(value)))
message_id.append(token)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to bomb out when building hyper kitty, because token is referenced before assignment, when try block fails.

======================================================================
ERROR: test_long_message_id (hyperkitty.tests.lib.test_incoming.TestAddToList)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "./hyperkitty/tests/lib/test_incoming.py", line 295, in test_long_message_id
    msg["Message-ID"] = "X" * 260
  File "/usr/lib/python3.8/email/message.py", line 409, in __setitem__
    self._headers.append(self.policy.header_store_parse(name, val))
  File "/usr/lib/python3.8/email/policy.py", line 148, in header_store_parse
    return (name, self.header_factory(name, value))
  File "/usr/lib/python3.8/email/headerregistry.py", line 602, in __call__
    return self[name](name, value)
  File "/usr/lib/python3.8/email/headerregistry.py", line 197, in __new__
    cls.parse(value, kwds)
  File "/usr/lib/python3.8/email/headerregistry.py", line 530, in parse
    kwds['parse_tree'] = parse_tree = cls.value_parser(value)
  File "/usr/lib/python3.8/email/_header_value_parser.py", line 2116, in parse_message_id
    message_id.append(token)
UnboundLocalError: local variable 'token' referenced before assignment

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, same for me. Also, accessing value[0] before checking if value is there…

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@surkova Thank you! I think this should be fixed up properly. Opened BPO https://bugs.python.org/issue38708

return message_id

#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two level of parser routines: the get_XXX methods that get a token in
Expand Down
13 changes: 13 additions & 0 deletions Lib/email/headerregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,18 @@ def cte(self):
return self._cte


class MessageIDHeader:

max_count = 1
value_parser = staticmethod(parser.parse_message_id)

@classmethod
def parse(cls, value, kwds):
kwds['parse_tree'] = parse_tree = cls.value_parser(value)
kwds['decoded'] = str(parse_tree)
kwds['defects'].extend(parse_tree.all_defects)


# The header factory #

_default_header_map = {
Expand All @@ -542,6 +554,7 @@ def cte(self):
'content-type': ContentTypeHeader,
'content-disposition': ContentDispositionHeader,
'content-transfer-encoding': ContentTransferEncodingHeader,
'message-id': MessageIDHeader,
}

class HeaderRegistry:
Expand Down
72 changes: 72 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2494,6 +2494,78 @@ def test_invalid_content_transfer_encoding(self):
";foo", ";foo", ";foo", [errors.InvalidHeaderDefect]*3
)

# get_msg_id

def test_get_msg_id_valid(self):
msg_id = self._test_get_x(
parser.get_msg_id,
"<simeple.local@example.something.com>",
"<simeple.local@example.something.com>",
"<simeple.local@example.something.com>",
[],
'',
)
self.assertEqual(msg_id.token_type, 'msg-id')

def test_get_msg_id_obsolete_local(self):
msg_id = self._test_get_x(
parser.get_msg_id,
'<"simeple.local"@example.com>',
'<"simeple.local"@example.com>',
'<simeple.local@example.com>',
[errors.ObsoleteHeaderDefect],
'',
)
self.assertEqual(msg_id.token_type, 'msg-id')

def test_get_msg_id_non_folding_literal_domain(self):
msg_id = self._test_get_x(
parser.get_msg_id,
"<simple.local@[someexamplecom.domain]>",
"<simple.local@[someexamplecom.domain]>",
"<simple.local@[someexamplecom.domain]>",
[],
"",
)
self.assertEqual(msg_id.token_type, 'msg-id')


def test_get_msg_id_obsolete_domain_part(self):
msg_id = self._test_get_x(
parser.get_msg_id,
"<simplelocal@(old)example.com>",
"<simplelocal@(old)example.com>",
"<simplelocal@ example.com>",
[errors.ObsoleteHeaderDefect],
""
)

def test_get_msg_id_no_id_right_part(self):
msg_id = self._test_get_x(
parser.get_msg_id,
"<simplelocal>",
"<simplelocal>",
"<simplelocal>",
[errors.InvalidHeaderDefect],
""
)
self.assertEqual(msg_id.token_type, 'msg-id')

def test_get_msg_id_no_angle_start(self):
with self.assertRaises(errors.HeaderParseError):
parser.get_msg_id("msgwithnoankle")

def test_get_msg_id_no_angle_end(self):
msg_id = self._test_get_x(
parser.get_msg_id,
"<simplelocal@domain",
"<simplelocal@domain>",
"<simplelocal@domain>",
[errors.InvalidHeaderDefect],
""
)
self.assertEqual(msg_id.token_type, 'msg-id')


@parameterize
class Test_parse_mime_parameters(TestParserMixin, TestEmailBase):
Expand Down
Loading