Skip to content

bpo-44660: Updated email.feedparser with support for message/global emails with quoted-printable and base64 Content-Transfer-Encodings. #27208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
61 changes: 61 additions & 0 deletions Lib/email/base64mime.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@


from base64 import b64encode
from typing import ByteString, Callable

from binascii import b2a_base64, a2b_base64

CRLF = '\r\n'
Expand Down Expand Up @@ -114,6 +116,65 @@ def decode(string):
return a2b_base64(string)


class Base64FeedDecoder:
    """Incremental base-64 decoder (RFC 2045, s. 6.8) for the FeedParser API.

    Encoded input may be supplied in arbitrarily sized pieces. Complete
    4-character base-64 quanta are decoded as soon as they are available and
    the result is handed to the ``feed`` callable given to the constructor;
    an incomplete trailing quantum is buffered until more input arrives.

    Note that there is no parsing-related functionality in this class.
    Therefore, this class could be generalized, by making the _feed variable
    optional, a new _decode_buffer variable that is returned by close(),
    and _decode a constructor kwarg, for example; and refactored/moved to the
    top-level, base64 package.
    """

    # Every byte value that is neither part of the base-64 alphabet nor the
    # '=' pad character. The underlying decoder skips these bytes, so they
    # must not be counted when the input is split into 4-character quanta.
    _IGNORED_BYTES = bytes(range(256)).translate(
        None,
        b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=',
    )

    def __init__(self, feed: Callable[[ByteString], None]):
        """
        :param feed: function that consumes the decoded data.
        """
        self._decode = a2b_base64  # Underlying decoder implementation.
        self._feed = feed  # Consumes the decoded data.
        # This buffers an incomplete base-64 block that can't be decoded or
        # parsed yet:
        self._encoded_buffer = bytearray()

    def feed(self, data: ByteString):
        """
        Feed the decoder some more base-64-encoded data. data should be a
        bytes-like object holding encoded characters. A 4-character block may
        be split across calls; the decoder stitches such partial blocks
        together before decoding them.

        :param data: bytes-like object of arbitrary length.
        """
        # Drop line breaks, whitespace and any other byte outside the base-64
        # alphabet so that the length calculation below only counts
        # characters the decoder will actually consume:
        significant = bytes(data).translate(None, self._IGNORED_BYTES)
        # Update buffer and decode any complete base-64 blocks:
        self._encoded_buffer.extend(significant)
        decodable_length = len(self._encoded_buffer) // 4 * 4
        if decodable_length:
            decodable_bytes = self._encoded_buffer[:decodable_length]
            # The buffer is trimmed before decoding so that a block that
            # fails to decode is not retried on the next call.
            del self._encoded_buffer[:decodable_length]
            decoded_bytes = self._decode(decodable_bytes)
            # If _feed were made optional, then the decoded bytes could be
            # appended to a new self._decoded_buffer variable when _feed is
            # None:
            self._feed(decoded_bytes)

    def close(self):
        """
        Validate that all previously fed data formed complete base-64
        blocks. It is undefined what happens if feed() is called after this
        method has been called.

        :raises: ValueError if the input fails length validation.
        """
        if self._encoded_buffer:
            raise ValueError('The base-64 input has invalid length.')
        # If _feed were made optional, then a new self._decoded_buffer
        # variable could be returned when _feed is None.


# Aliases of decode(), kept for convenience and for backwards compatibility
# with the naming used by the standard base64 module.
body_decode = decode
decodestring = decode
137 changes: 137 additions & 0 deletions Lib/email/feedparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@

__all__ = ['FeedParser', 'BytesFeedParser']

import abc
import base64
import quopri
import re

from email import errors
from email._policybase import compat32
from collections import deque
from io import StringIO
from email.base64mime import Base64FeedDecoder

NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
Expand Down Expand Up @@ -292,6 +296,34 @@ def _parsegen(self):
# Not at EOF so this is a line we're going to need.
self._input.unreadline(line)
return
if self._cur.get_content_type() == 'message/global':
# Support for message/global parts that can have non-identity
# content-transfer-encodings as outlined in RFC 6532
# (s. 1, p. 3; s 3.5; "Encoding considerations," s. 3.7).
decoding_parser_factory = _decoding_parser_factory_map.get(
self._cur['Content-Transfer-Encoding']
)
if decoding_parser_factory is not None:
# This block only executes if the subpart needs to be decoded as
# it's parsed. Unspecified and identity
# Content-Transfer-Encodings are implicitly handled in a
# subsequent block.
decoding_parser = decoding_parser_factory(
policy=self.policy,
_factory=self._factory
)
# Decode current part's body and parse as another part:
for line in self._input:
if line is NeedMoreData:
yield NeedMoreData
continue
if line == '':
break
decoding_parser.feed(line)
# Retrieve new part and attach (i.e. make a subpart):
subpart = decoding_parser.close()
self._cur.attach(subpart)
return
if self._cur.get_content_maintype() == 'message':
# The message claims to be a message/* type, then what follows is
# another RFC 2822 message.
Expand Down Expand Up @@ -534,3 +566,108 @@ class BytesFeedParser(FeedParser):

def feed(self, data):
super().feed(data.decode('ascii', 'surrogateescape'))


class EncodedFeedParser(abc.ABC):
    """Abstract feed parser that decodes its input before parsing it.

    This is an abstract base class; only its subclasses should be
    instantiated directly.

    Instances of this class work like FeedParser except that the concrete
    implementations of feed(), prior to parsing the input, transparently
    decode the input consistent with RFC 2045, s. 6.2. Each subclass reverses
    one of the non-identity Content-Transfer-Encoding transformations
    described there.
    """

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded unchanged to the wrapped parser, which
        # receives the decoded bytes and builds the message object.
        self._bytes_feed_parser = BytesFeedParser(*args, **kwargs)

    @property
    def policy(self):
        """The policy in use by the wrapped BytesFeedParser."""
        return self._bytes_feed_parser.policy

    @abc.abstractmethod
    def feed(self, text):
        """Decode text and pass the result on to the wrapped parser.

        Concrete subclasses must override this method.
        """

    def close(self):
        """Finish parsing and return the wrapped parser's result."""
        return self._bytes_feed_parser.close()


class Base64EncodedFeedParser(EncodedFeedParser):
    """Feed parser for base64-encoded message parts.

    Supports the combination of RFC 2045, s. 6.8; and RFC 6532, particularly
    s. 3.5. Errors raised while decoding are collected and reported as
    defects on the parsed message when close() is called.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Incremental decoder; whatever it decodes is pushed straight into
        # the wrapped parser's feed() method:
        self._decoder = Base64FeedDecoder(self._bytes_feed_parser.feed)
        # Decoding-related errors, reported as defects by close():
        self._errors = []

    def feed(self, text):
        """Decode a piece of base64 text and parse the decoded bytes.

        Value errors raised while encoding, decoding or parsing the text are
        recorded rather than propagated.
        """
        try:
            # 'surrogateescape' round-trips raw bytes that an enclosing
            # BytesFeedParser smuggled into the text as surrogates; the
            # encode is inside the try so that text which still cannot be
            # encoded is recorded instead of aborting the parse.
            encoded_bytes = text.encode('ascii', 'surrogateescape')
            self._decoder.feed(encoded_bytes)
        except ValueError as e:
            self._errors.append(e)

    def close(self):
        """Finish parsing and return the message, with any defects attached.

        Every error collected by feed(), plus any error raised when closing
        the decoder, is passed to the wrapped parser's policy via
        handle_defect().
        """
        message_part = self._bytes_feed_parser.close()
        # Attempt to close the decoder in case any further errors occur:
        try:
            self._decoder.close()
        except ValueError as e:
            self._errors.append(e)
        # Include the decoding-related errors in the message. The policy
        # lives on the wrapped parser; this class sets no policy attribute
        # of its own.
        policy = self._bytes_feed_parser.policy
        for error in self._errors:
            policy.handle_defect(message_part, error)
        return message_part


class QuotedPrintableFeedParser(EncodedFeedParser):
    """Feed parser for quoted-printable message parts.

    Supports the combination of RFC 2045, s. 6.7; and RFC 6532, particularly
    s. 3.5. Input is decoded as it arrives; a trailing '=' that may be the
    start of an escape or soft line break is held back until enough
    characters follow it.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Encoded bytes that cannot be decoded yet because they may be the
        # beginning of an '=XX' octet or a soft line break:
        self._encoded_buffer = bytearray()

    def feed(self, text):
        """Decode a piece of quoted-printable text and parse the result."""
        if not text:
            # Nothing new to decode; whatever is buffered still waits for
            # more input.
            return
        # 'surrogateescape' round-trips raw bytes that an enclosing
        # BytesFeedParser smuggled into the text as surrogates.
        self._encoded_buffer.extend(text.encode('ascii', 'surrogateescape'))
        buffered_length = len(self._encoded_buffer)
        index_of_last_equal_sign = self._encoded_buffer.rfind(b'=')
        if (index_of_last_equal_sign < 0
                or index_of_last_equal_sign < buffered_length - 2):
            # The buffer either contains no '=' at all, or at least two
            # characters follow its last '=', so the whole buffer can be
            # decoded and parsed.
            decodable_length = buffered_length
        else:
            # Fewer than two characters follow the last '=', so only the
            # characters leading up to it can be decoded.
            decodable_length = index_of_last_equal_sign
        if decodable_length:
            encoded_bytes = bytes(self._encoded_buffer[:decodable_length])
            del self._encoded_buffer[:decodable_length]
            decoded_bytes = quopri.decodestring(encoded_bytes)
            self._bytes_feed_parser.feed(decoded_bytes)

    def close(self):
        """Decode any held-back input, finish parsing, return the message."""
        if self._encoded_buffer:
            # No more input will arrive, so decode what was held back
            # instead of silently discarding it.
            # TODO: Add a defect to the message object when the leftover is
            # a truncated escape sequence.
            leftover = bytes(self._encoded_buffer)
            self._encoded_buffer.clear()
            self._bytes_feed_parser.feed(quopri.decodestring(leftover))
        return self._bytes_feed_parser.close()


# Content-Transfer-Encoding value -> EncodedFeedParser subclass able to
# reverse that encoding. Identity encodings need no decoding and are therefore
# deliberately absent from this table.
_decoding_parser_factory_map = {
    'quoted-printable': QuotedPrintableFeedParser,
    'base64': Base64EncodedFeedParser,
}
Loading