Skip to content

Commit b6aacbf

Browse files
authored
Merge pull request RustPython#5629 from coolreader18/surrogate-literals
Parse surrogates in string literals properly
2 parents cd89aa5 + dd467f6 commit b6aacbf

37 files changed

+631
-203
lines changed

Cargo.lock

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_codeccallbacks.py

-6
Original file line numberDiff line numberDiff line change
@@ -536,8 +536,6 @@ def test_badandgoodxmlcharrefreplaceexceptions(self):
536536
("".join("&#%d;" % c for c in cs), 1 + len(s))
537537
)
538538

539-
# TODO: RUSTPYTHON
540-
@unittest.expectedFailure
541539
def test_badandgoodbackslashreplaceexceptions(self):
542540
# "backslashreplace" complains about a non-exception passed in
543541
self.assertRaises(
@@ -596,8 +594,6 @@ def test_badandgoodbackslashreplaceexceptions(self):
596594
(r, 2)
597595
)
598596

599-
# TODO: RUSTPYTHON
600-
@unittest.expectedFailure
601597
def test_badandgoodnamereplaceexceptions(self):
602598
# "namereplace" complains about a non-exception passed in
603599
self.assertRaises(
@@ -644,8 +640,6 @@ def test_badandgoodnamereplaceexceptions(self):
644640
(r, 1 + len(s))
645641
)
646642

647-
# TODO: RUSTPYTHON
648-
@unittest.expectedFailure
649643
def test_badandgoodsurrogateescapeexceptions(self):
650644
surrogateescape_errors = codecs.lookup_error('surrogateescape')
651645
# "surrogateescape" complains about a non-exception passed in

Lib/test/test_codecs.py

+15-16
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,11 @@ def test_bug691291(self):
869869
with reader:
870870
self.assertEqual(reader.read(), s1)
871871

872+
# TODO: RUSTPYTHON
873+
@unittest.expectedFailure
874+
def test_incremental_surrogatepass(self):
875+
super().test_incremental_surrogatepass()
876+
872877
class UTF16LETest(ReadTest, unittest.TestCase):
873878
encoding = "utf-16-le"
874879
ill_formed_sequence = b"\x80\xdc"
@@ -917,6 +922,11 @@ def test_nonbmp(self):
917922
self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
918923
"\U00010203")
919924

925+
# TODO: RUSTPYTHON
926+
@unittest.expectedFailure
927+
def test_incremental_surrogatepass(self):
928+
super().test_incremental_surrogatepass()
929+
920930
class UTF16BETest(ReadTest, unittest.TestCase):
921931
encoding = "utf-16-be"
922932
ill_formed_sequence = b"\xdc\x80"
@@ -965,6 +975,11 @@ def test_nonbmp(self):
965975
self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
966976
"\U00010203")
967977

978+
# TODO: RUSTPYTHON
979+
@unittest.expectedFailure
980+
def test_incremental_surrogatepass(self):
981+
super().test_incremental_surrogatepass()
982+
968983
class UTF8Test(ReadTest, unittest.TestCase):
969984
encoding = "utf-8"
970985
ill_formed_sequence = b"\xed\xb2\x80"
@@ -998,8 +1013,6 @@ def test_decoder_state(self):
9981013
self.check_state_handling_decode(self.encoding,
9991014
u, u.encode(self.encoding))
10001015

1001-
# TODO: RUSTPYTHON
1002-
@unittest.expectedFailure
10031016
def test_decode_error(self):
10041017
for data, error_handler, expected in (
10051018
(b'[\x80\xff]', 'ignore', '[]'),
@@ -1026,8 +1039,6 @@ def test_lone_surrogates(self):
10261039
exc = cm.exception
10271040
self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')
10281041

1029-
# TODO: RUSTPYTHON
1030-
@unittest.expectedFailure
10311042
def test_surrogatepass_handler(self):
10321043
self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
10331044
self.BOM + b"abc\xed\xa0\x80def")
@@ -2884,8 +2895,6 @@ def test_escape_encode(self):
28842895

28852896
class SurrogateEscapeTest(unittest.TestCase):
28862897

2887-
# TODO: RUSTPYTHON
2888-
@unittest.expectedFailure
28892898
def test_utf8(self):
28902899
# Bad byte
28912900
self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
@@ -2898,8 +2907,6 @@ def test_utf8(self):
28982907
self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
28992908
b"\xed\xb0\x80")
29002909

2901-
# TODO: RUSTPYTHON
2902-
@unittest.expectedFailure
29032910
def test_ascii(self):
29042911
# bad byte
29052912
self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
@@ -2916,8 +2923,6 @@ def test_charmap(self):
29162923
self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
29172924
b"foo\xa5bar")
29182925

2919-
# TODO: RUSTPYTHON
2920-
@unittest.expectedFailure
29212926
def test_latin1(self):
29222927
# Issue6373
29232928
self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
@@ -3561,8 +3566,6 @@ class ASCIITest(unittest.TestCase):
35613566
def test_encode(self):
35623567
self.assertEqual('abc123'.encode('ascii'), b'abc123')
35633568

3564-
# TODO: RUSTPYTHON
3565-
@unittest.expectedFailure
35663569
def test_encode_error(self):
35673570
for data, error_handler, expected in (
35683571
('[\x80\xff\u20ac]', 'ignore', b'[]'),
@@ -3585,8 +3588,6 @@ def test_encode_surrogateescape_error(self):
35853588
def test_decode(self):
35863589
self.assertEqual(b'abc'.decode('ascii'), 'abc')
35873590

3588-
# TODO: RUSTPYTHON
3589-
@unittest.expectedFailure
35903591
def test_decode_error(self):
35913592
for data, error_handler, expected in (
35923593
(b'[\x80\xff]', 'ignore', '[]'),
@@ -3609,8 +3610,6 @@ def test_encode(self):
36093610
with self.subTest(data=data, expected=expected):
36103611
self.assertEqual(data.encode('latin1'), expected)
36113612

3612-
# TODO: RUSTPYTHON
3613-
@unittest.expectedFailure
36143613
def test_encode_errors(self):
36153614
for data, error_handler, expected in (
36163615
('[\u20ac\udc80]', 'ignore', b'[]'),

Lib/test/test_json/test_scanstring.py

-2
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ def test_scanstring(self):
8686
scanstring('["Bad value", truth]', 2, True),
8787
('Bad value', 12))
8888

89-
# TODO: RUSTPYTHON
90-
@unittest.expectedFailure
9189
def test_surrogates(self):
9290
scanstring = self.json.decoder.scanstring
9391
def assertScan(given, expect):

Lib/test/test_regrtest.py

-2
Original file line numberDiff line numberDiff line change
@@ -945,15 +945,13 @@ def test_leak(self):
945945
""")
946946
self.check_leak(code, 'file descriptors')
947947

948-
@unittest.expectedFailureIfWindows('TODO: RUSTPYTHON Windows')
949948
def test_list_tests(self):
950949
# test --list-tests
951950
tests = [self.create_test() for i in range(5)]
952951
output = self.run_tests('--list-tests', *tests)
953952
self.assertEqual(output.rstrip().splitlines(),
954953
tests)
955954

956-
@unittest.expectedFailureIfWindows('TODO: RUSTPYTHON Windows')
957955
def test_list_cases(self):
958956
# test --list-cases
959957
code = textwrap.dedent("""

Lib/test/test_stringprep.py

-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
from stringprep import *
77

88
class StringprepTests(unittest.TestCase):
9-
# TODO: RUSTPYTHON
10-
@unittest.expectedFailure
119
def test(self):
1210
self.assertTrue(in_table_a1("\u0221"))
1311
self.assertFalse(in_table_a1("\u0222"))

Lib/test/test_subprocess.py

-2
Original file line numberDiff line numberDiff line change
@@ -1198,8 +1198,6 @@ def test_universal_newlines_communicate_encodings(self):
11981198
stdout, stderr = popen.communicate(input='')
11991199
self.assertEqual(stdout, '1\n2\n3\n4')
12001200

1201-
# TODO: RUSTPYTHON
1202-
@unittest.expectedFailure
12031201
def test_communicate_errors(self):
12041202
for errors, expected in [
12051203
('ignore', ''),

Lib/test/test_tarfile.py

-14
Original file line numberDiff line numberDiff line change
@@ -2086,11 +2086,6 @@ class UstarUnicodeTest(UnicodeTest, unittest.TestCase):
20862086

20872087
format = tarfile.USTAR_FORMAT
20882088

2089-
# TODO: RUSTPYTHON
2090-
@unittest.expectedFailure
2091-
def test_uname_unicode(self):
2092-
super().test_uname_unicode()
2093-
20942089
# Test whether the utf-8 encoded version of a filename exceeds the 100
20952090
# bytes name field limit (every occurrence of '\xff' will be expanded to 2
20962091
# bytes).
@@ -2170,13 +2165,6 @@ class GNUUnicodeTest(UnicodeTest, unittest.TestCase):
21702165

21712166
format = tarfile.GNU_FORMAT
21722167

2173-
# TODO: RUSTPYTHON
2174-
@unittest.expectedFailure
2175-
def test_uname_unicode(self):
2176-
super().test_uname_unicode()
2177-
2178-
# TODO: RUSTPYTHON
2179-
@unittest.expectedFailure
21802168
def test_bad_pax_header(self):
21812169
# Test for issue #8633. GNU tar <= 1.23 creates raw binary fields
21822170
# without a hdrcharset=BINARY header.
@@ -2198,8 +2186,6 @@ class PAXUnicodeTest(UnicodeTest, unittest.TestCase):
21982186
# PAX_FORMAT ignores encoding in write mode.
21992187
test_unicode_filename_error = None
22002188

2201-
# TODO: RUSTPYTHON
2202-
@unittest.expectedFailure
22032189
def test_binary_header(self):
22042190
# Test a POSIX.1-2008 compatible header with a hdrcharset=BINARY field.
22052191
for encoding, name in (

Lib/test/test_unicode.py

-8
Original file line numberDiff line numberDiff line change
@@ -608,8 +608,6 @@ def test_bytes_comparison(self):
608608
self.assertEqual('abc' == bytearray(b'abc'), False)
609609
self.assertEqual('abc' != bytearray(b'abc'), True)
610610

611-
# TODO: RUSTPYTHON
612-
@unittest.expectedFailure
613611
def test_comparison(self):
614612
# Comparisons:
615613
self.assertEqual('abc', 'abc')
@@ -830,8 +828,6 @@ def test_isidentifier_legacy(self):
830828
warnings.simplefilter('ignore', DeprecationWarning)
831829
self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
832830

833-
# TODO: RUSTPYTHON
834-
@unittest.expectedFailure
835831
def test_isprintable(self):
836832
self.assertTrue("".isprintable())
837833
self.assertTrue(" ".isprintable())
@@ -847,8 +843,6 @@ def test_isprintable(self):
847843
self.assertTrue('\U0001F46F'.isprintable())
848844
self.assertFalse('\U000E0020'.isprintable())
849845

850-
# TODO: RUSTPYTHON
851-
@unittest.expectedFailure
852846
def test_surrogates(self):
853847
for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
854848
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
@@ -1827,8 +1821,6 @@ def test_codecs_utf7(self):
18271821
'ill-formed sequence'):
18281822
b'+@'.decode('utf-7')
18291823

1830-
# TODO: RUSTPYTHON
1831-
@unittest.expectedFailure
18321824
def test_codecs_utf8(self):
18331825
self.assertEqual(''.encode('utf-8'), b'')
18341826
self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')

Lib/test/test_userstring.py

-4
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,13 @@ def __rmod__(self, other):
5353
str3 = ustr3('TEST')
5454
self.assertEqual(fmt2 % str3, 'value is TEST')
5555

56-
# TODO: RUSTPYTHON
57-
@unittest.expectedFailure
5856
def test_encode_default_args(self):
5957
self.checkequal(b'hello', 'hello', 'encode')
6058
# Check that encoding defaults to utf-8
6159
self.checkequal(b'\xf0\xa3\x91\x96', '\U00023456', 'encode')
6260
# Check that errors defaults to 'strict'
6361
self.checkraises(UnicodeError, '\ud800', 'encode')
6462

65-
# TODO: RUSTPYTHON
66-
@unittest.expectedFailure
6763
def test_encode_explicit_none_args(self):
6864
self.checkequal(b'hello', 'hello', 'encode', None, None)
6965
# Check that encoding defaults to utf-8

Lib/test/test_zipimport.py

+1
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,7 @@ def testTraceback(self):
730730

731731
@unittest.skipIf(os_helper.TESTFN_UNENCODABLE is None,
732732
"need an unencodable filename")
733+
@unittest.expectedFailureIfWindows("TODO: RUSTPYTHON")
733734
def testUnencodable(self):
734735
filename = os_helper.TESTFN_UNENCODABLE + ".zip"
735736
self.addCleanup(os_helper.unlink, filename)

common/src/encodings.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ pub mod errors {
401401
let mut out = String::with_capacity(num_chars * 4);
402402
for c in err_str.code_points() {
403403
let c_u32 = c.to_u32();
404-
if let Some(c_name) = unicode_names2::name(c.to_char_lossy()) {
404+
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
405405
write!(out, "\\N{{{c_name}}}").unwrap();
406406
} else if c_u32 >= 0x10000 {
407407
write!(out, "\\U{c_u32:08x}").unwrap();

0 commit comments

Comments
 (0)