Skip to content

Commit 8cc48dc

Browse files
xvillaneau authored and untitaker committed
Fixed the serialization of byte-string objects in Python 3 (getsentry#551)
* fix: Make safe_repr not decode non-printable characters
  Note: fixed in Python 3 only
* fix: bytes now correctly serialized as strings
1 parent b784d63 commit 8cc48dc

File tree

4 files changed

+91
-26
lines changed

4 files changed

+91
-26
lines changed

sentry_sdk/serializer.py

+7-1
Original file line number | Diff line number | Diff line change
@@ -36,11 +36,17 @@
3636
# Importing ABCs from collections is deprecated, and will stop working in 3.8
3737
# https://github.com/python/cpython/blob/master/Lib/collections/__init__.py#L49
3838
from collections import Mapping, Sequence
39+
40+
serializable_str_types = string_types
41+
3942
else:
4043
# New in 3.3
4144
# https://docs.python.org/3/library/collections.abc.html
4245
from collections.abc import Mapping, Sequence
4346

47+
# Bytes are technically not strings in Python 3, but we can serialize them
48+
serializable_str_types = (str, bytes)
49+
4450
MAX_DATABAG_DEPTH = 5
4551
MAX_DATABAG_BREADTH = 10
4652
CYCLE_MARKER = u"<cyclic>"
@@ -285,7 +291,7 @@ def _serialize_node_impl(
285291

286292
return rv_dict
287293

288-
elif not isinstance(obj, string_types) and isinstance(obj, Sequence):
294+
elif not isinstance(obj, serializable_str_types) and isinstance(obj, Sequence):
289295
rv_list = []
290296

291297
for i, v in enumerate(obj):

sentry_sdk/utils.py

+34-24
Original file line number | Diff line number | Diff line change
@@ -348,32 +348,42 @@ def safe_str(value):
348348
return safe_repr(value)
349349

350350

351-
def safe_repr(value):
352-
# type: (Any) -> str
353-
try:
354-
rv = repr(value)
355-
if isinstance(rv, bytes):
356-
rv = rv.decode("utf-8", "replace")
357-
358-
# At this point `rv` contains a bunch of literal escape codes, like
359-
# this (exaggerated example):
360-
#
361-
# u"\\x2f"
362-
#
363-
# But we want to show this string as:
364-
#
365-
# u"/"
351+
if PY2:
352+
353+
def safe_repr(value):
354+
# type: (Any) -> str
366355
try:
367-
# unicode-escape does this job, but can only decode latin1. So we
368-
# attempt to encode in latin1.
369-
return rv.encode("latin1").decode("unicode-escape")
356+
rv = repr(value).decode("utf-8", "replace")
357+
358+
# At this point `rv` contains a bunch of literal escape codes, like
359+
# this (exaggerated example):
360+
#
361+
# u"\\x2f"
362+
#
363+
# But we want to show this string as:
364+
#
365+
# u"/"
366+
try:
367+
# unicode-escape does this job, but can only decode latin1. So we
368+
# attempt to encode in latin1.
369+
return rv.encode("latin1").decode("unicode-escape")
370+
except Exception:
371+
# Since usually strings aren't latin1 this can break. In those
372+
# cases we just give up.
373+
return rv
370374
except Exception:
371-
# Since usually strings aren't latin1 this can break. In those
372-
# cases we just give up.
373-
return rv
374-
except Exception:
375-
# If e.g. the call to `repr` already fails
376-
return u"<broken repr>"
375+
# If e.g. the call to `repr` already fails
376+
return u"<broken repr>"
377+
378+
379+
else:
380+
381+
def safe_repr(value):
382+
# type: (Any) -> str
383+
try:
384+
return repr(value)
385+
except Exception:
386+
return "<broken repr>"
377387

378388

379389
def filename_for_module(module, abs_path):

tests/test_serializer.py

+37-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
from datetime import datetime
2-
2+
import sys
33

44
import pytest
55

@@ -30,3 +30,39 @@ def test_datetime_precision(dt, semaphore_normalize):
3030
# Float glitches can happen, and more glitches can happen
3131
# because we try to work around some float glitches in semaphore
3232
assert (dt - dt2).total_seconds() < 1.0
33+
34+
@given(binary=st.binary(min_size=1))
35+
def test_bytes_serialization_decode_many(binary, message_normalizer):
36+
result = message_normalizer(binary, should_repr_strings=False)
37+
assert result == binary.decode("utf-8", "replace")
38+
39+
@given(binary=st.binary(min_size=1))
40+
def test_bytes_serialization_repr_many(binary, message_normalizer):
41+
result = message_normalizer(binary, should_repr_strings=True)
42+
assert result == repr(binary)
43+
44+
45+
@pytest.fixture
46+
def message_normalizer(semaphore_normalize):
47+
if semaphore_normalize({"test": "test"}) is None:
48+
pytest.skip("no semaphore available")
49+
50+
def inner(message, **kwargs):
51+
event = serialize({"logentry": {"message": message}}, **kwargs)
52+
normalized = semaphore_normalize(event)
53+
return normalized["logentry"]["message"]
54+
55+
return inner
56+
57+
58+
def test_bytes_serialization_decode(message_normalizer):
59+
binary = b"abc123\x80\xf0\x9f\x8d\x95"
60+
result = message_normalizer(binary, should_repr_strings=False)
61+
assert result == u"abc123\ufffd\U0001f355"
62+
63+
64+
@pytest.mark.xfail(sys.version_info < (3,), reason="Known safe_repr bugs in Py2.7")
65+
def test_bytes_serialization_repr(message_normalizer):
66+
binary = b"abc123\x80\xf0\x9f\x8d\x95"
67+
result = message_normalizer(binary, should_repr_strings=True)
68+
assert result == r"b'abc123\x80\xf0\x9f\x8d\x95'"

tests/utils/test_general.py

+13
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,19 @@ def test_safe_repr_regressions():
3636
assert u"лошадь" in safe_repr(u"лошадь")
3737

3838

39+
@pytest.mark.xfail(
40+
sys.version_info < (3,),
41+
reason="Fixing this in Python 2 would break other behaviors",
42+
)
43+
@pytest.mark.parametrize("prefix", (u"", u"abcd", u"лошадь"))
44+
@pytest.mark.parametrize("character", u"\x00\x07\x1b\n")
45+
def test_safe_repr_non_printable(prefix, character):
46+
"""Check that non-printable characters are escaped"""
47+
string = prefix + character
48+
assert character not in safe_repr(string)
49+
assert character not in safe_repr(string.encode("utf-8"))
50+
51+
3952
def test_abs_path():
4053
"""Check if abs_path is actually an absolute path. This can happen either
4154
with eval/exec like here, or when the file in the frame is relative to

0 commit comments

Comments
 (0)