Python 3: deal with multipart streams as bytes.

djc · djc · commit aa9f393c9a35 · 2014-03-30T19:09:54.000+02:00
diff --git a/couchdb/multipart.py b/couchdb/multipart.py
@@ -23,7 +23,7 @@
 __docformat__ = 'restructuredtext en'
 
 
-CRLF = '\r\n'
+CRLF = b'\r\n'
 
 
 def read_multipart(fileobj, boundary=None):
@@ -50,16 +50,16 @@ def read_multipart(fileobj, boundary=None):
     buf = []
     outer = in_headers = boundary is None
 
-    next_boundary = boundary and '--' + boundary + '\n' or None
-    last_boundary = boundary and '--' + boundary + '--\n' or None
+    next_boundary = boundary and ('--' + boundary + '\n').encode('ascii') or None
+    last_boundary = boundary and ('--' + boundary + '--\n').encode('ascii') or None
 
     def _current_part():
-        payload = ''.join(buf)
-        if payload.endswith('\r\n'):
+        payload = b''.join(buf)
+        if payload.endswith(b'\r\n'):
             payload = payload[:-2]
-        elif payload.endswith('\n'):
+        elif payload.endswith(b'\n'):
             payload = payload[:-1]
-        content_md5 = headers.get('content-md5')
+        content_md5 = headers.get(b'content-md5')
         if content_md5:
             h = b64encode(md5(payload).digest())
             if content_md5 != h:
@@ -68,11 +68,11 @@ def _current_part():
 
     for line in fileobj:
         if in_headers:
-            line = line.replace(CRLF, '\n')
-            if line != '\n':
-                name, value = [item.strip() for item in line.split(':', 1)]
-                name = name.lower()
-                value, charset = header.decode_header(value)[0]
+            line = line.replace(CRLF, b'\n')
+            if line != b'\n':
+                name, value = [item.strip() for item in line.split(b':', 1)]
+                name = name.lower().decode('ascii')
+                value, charset = header.decode_header(value.decode('utf-8'))[0]
                 if charset is None:
                     headers[name] = value
                 else:
@@ -92,7 +92,7 @@ def _current_part():
                             yield part
                         return
 
-        elif line.replace(CRLF, '\n') == next_boundary:
+        elif line.replace(CRLF, b'\n') == next_boundary:
             # We've reached the start of a new part, as indicated by the
             # boundary
             if headers:
@@ -104,7 +104,7 @@ def _current_part():
                 del buf[:]
             in_headers = True
 
-        elif line.replace(CRLF, '\n') == last_boundary:
+        elif line.replace(CRLF, b'\n') == last_boundary:
             # We're done with this multipart envelope
             break
 
@@ -130,39 +130,47 @@ def __init__(self, fileobj, headers=None, subtype='mixed', boundary=None):
         self._write_headers(headers)
 
     def open(self, headers=None, subtype='mixed', boundary=None):
-        self.fileobj.write('--')
-        self.fileobj.write(self.boundary)
+        self.fileobj.write(b'--')
+        self.fileobj.write(self.boundary.encode('utf-8'))
         self.fileobj.write(CRLF)
         return MultipartWriter(self.fileobj, headers=headers, subtype=subtype,
                                boundary=boundary)
 
     def add(self, mimetype, content, headers=None):
-        self.fileobj.write('--')
-        self.fileobj.write(self.boundary)
+        self.fileobj.write(b'--')
+        self.fileobj.write(self.boundary.encode('utf-8'))
         self.fileobj.write(CRLF)
         if headers is None:
             headers = {}
+
+        ctype, params = parse_header(mimetype)
         if isinstance(content, util.utype):
-            ctype, params = parse_header(mimetype)
             if 'charset' in params:
                 content = content.encode(params['charset'])
             else:
                 content = content.encode('utf-8')
                 mimetype = mimetype + ';charset=utf-8'
+        elif 'charset' not in params:
+            try:
+                content.decode('utf-8')
+            finally:
+                mimetype = mimetype + ';charset=utf-8'
+
         headers['Content-Type'] = mimetype
         if content:
             headers['Content-Length'] = str(len(content))
-            headers['Content-MD5'] = b64encode(md5(content).digest())
+            hash = b64encode(md5(content).digest()).decode('ascii')
+            headers['Content-MD5'] = hash
         self._write_headers(headers)
         if content:
             # XXX: throw an exception if a boundary appears in the content??
             self.fileobj.write(content)
             self.fileobj.write(CRLF)
 
     def close(self):
-        self.fileobj.write('--')
-        self.fileobj.write(self.boundary)
-        self.fileobj.write('--')
+        self.fileobj.write(b'--')
+        self.fileobj.write(self.boundary.encode('ascii'))
+        self.fileobj.write(b'--')
         self.fileobj.write(CRLF)
 
     def _make_boundary(self):
@@ -171,19 +179,18 @@ def _make_boundary(self):
             return '==' + uuid4().hex + '=='
         except ImportError:
             from random import randrange
-            token = randrange(sys.maxsize)
-            format = '%%0%dd' % len(repr(sys.maxsize - 1))
-            return '===============' + (format % token) + '=='
+            nonce = ('%%0%dd' % len(repr(sys.maxsize - 1))) % token
+            return '===============' + nonce + '=='
 
     def _write_headers(self, headers):
         if headers:
             for name in sorted(headers.keys()):
                 value = headers[name]
-                if isinstance(value, util.utype):
-                    value = str(header.make_header([(value, 'utf-8')]))
-                self.fileobj.write(name)
-                self.fileobj.write(': ')
-                self.fileobj.write(value)
+                if value.encode('ascii', 'ignore') != value.encode('utf-8'):
+                    value = header.make_header([(value, 'utf-8')]).encode()
+                self.fileobj.write(name.encode('utf-8'))
+                self.fileobj.write(b': ')
+                self.fileobj.write(value.encode('utf-8'))
                 self.fileobj.write(CRLF)
         self.fileobj.write(CRLF)
 
@@ -202,19 +209,19 @@ def write_multipart(fileobj, subtype='mixed', boundary=None):
     envelope you call the ``add(mimetype, content, [headers])`` method for
     every part, and finally call the ``close()`` method.
 
-    >>> from StringIO import StringIO
+    >>> from couchdb.util import StringIO
 
     >>> buf = StringIO()
     >>> envelope = write_multipart(buf, boundary='==123456789==')
     >>> envelope.add('text/plain', 'Just testing')
     >>> envelope.close()
-    >>> print(buf.getvalue().replace('\r\n', '\n'))
+    >>> print(buf.getvalue().replace(b'\r\n', b'\n'))
     Content-Type: multipart/mixed; boundary="==123456789=="
     <BLANKLINE>
     --==123456789==
     Content-Length: 12
     Content-MD5: nHmX4a6el41B06x2uCpglQ==
-    Content-Type: text/plain
+    Content-Type: text/plain;charset=utf-8
     <BLANKLINE>
     Just testing
     --==123456789==--
@@ -233,7 +240,7 @@ def write_multipart(fileobj, subtype='mixed', boundary=None):
     >>> part.add('text/plain', 'Just testing')
     >>> part.close()
     >>> envelope.close()
-    >>> print(buf.getvalue().replace('\r\n', '\n')) #:doctest +ELLIPSIS
+    >>> print(buf.getvalue().replace(b'\r\n', b'\n')) #:doctest +ELLIPSIS
     Content-Type: multipart/mixed; boundary="==123456789=="
     <BLANKLINE>
     --==123456789==
@@ -242,7 +249,7 @@ def write_multipart(fileobj, subtype='mixed', boundary=None):
     --==abcdefghi==
     Content-Length: 12
     Content-MD5: nHmX4a6el41B06x2uCpglQ==
-    Content-Type: text/plain
+    Content-Type: text/plain;charset=utf-8
     <BLANKLINE>
     Just testing
     --==abcdefghi==--
diff --git a/couchdb/tests/multipart.py b/couchdb/tests/multipart.py
@@ -203,7 +203,7 @@ def test_unicode_headers(self):
 Content-ID: =?utf-8?b?5paH5qGj?=
 Content-Length: 63
 Content-MD5: Cpw3iC3xPua8YzKeWLzwvw==
-Content-Type: application/json
+Content-Type: application/json;charset=utf-8
 
 {"_rev": "3-bc27b6930ca514527d8954c7c43e6a09", "_id": "文档"}
 ''', buf.getvalue().replace('\r\n', '\n'))