sheeeng
diff --git a/‎Lib/test/test_codecs.py
+15-16 b/‎Lib/test/test_codecs.py
+15-16
diff --git a/‎Lib/test/test_json/test_scanstring.py
-2 b/‎Lib/test/test_json/test_scanstring.py
-2
diff --git a/‎Lib/test/test_regrtest.py
-2 b/‎Lib/test/test_regrtest.py
-2
diff --git a/‎Lib/test/test_stringprep.py
-2 b/‎Lib/test/test_stringprep.py
-2
diff --git a/‎Lib/test/test_subprocess.py
-2 b/‎Lib/test/test_subprocess.py
-2
diff --git a/‎Lib/test/test_tarfile.py
-14 b/‎Lib/test/test_tarfile.py
-14
diff --git a/‎Lib/test/test_unicode.py
-8 b/‎Lib/test/test_unicode.py
-8
diff --git a/‎Lib/test/test_userstring.py
-4 b/‎Lib/test/test_userstring.py
-4
diff --git a/‎Lib/test/test_zipimport.py
+1 b/‎Lib/test/test_zipimport.py
+1
diff --git a/‎common/src/wtf8/mod.rs
+34-21 b/‎common/src/wtf8/mod.rs
+34-21
diff --git a/‎stdlib/src/json.rs
+3-2 b/‎stdlib/src/json.rs
+3-2
@@ -869,6 +869,11 @@ def test_bug691291(self):
         with reader:
             self.assertEqual(reader.read(), s1)
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_incremental_surrogatepass(self):
+        super().test_incremental_surrogatepass()
+
 class UTF16LETest(ReadTest, unittest.TestCase):
     encoding = "utf-16-le"
     ill_formed_sequence = b"\x80\xdc"
@@ -917,6 +922,11 @@ def test_nonbmp(self):
         self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                          "\U00010203")
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_incremental_surrogatepass(self):
+        super().test_incremental_surrogatepass()
+
 class UTF16BETest(ReadTest, unittest.TestCase):
     encoding = "utf-16-be"
     ill_formed_sequence = b"\xdc\x80"
@@ -965,6 +975,11 @@ def test_nonbmp(self):
         self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                          "\U00010203")
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_incremental_surrogatepass(self):
+        super().test_incremental_surrogatepass()
+
 class UTF8Test(ReadTest, unittest.TestCase):
     encoding = "utf-8"
     ill_formed_sequence = b"\xed\xb2\x80"
@@ -998,8 +1013,6 @@ def test_decoder_state(self):
         self.check_state_handling_decode(self.encoding,
                                          u, u.encode(self.encoding))
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_decode_error(self):
         for data, error_handler, expected in (
             (b'[\x80\xff]', 'ignore', '[]'),
@@ -1026,8 +1039,6 @@ def test_lone_surrogates(self):
         exc = cm.exception
         self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_surrogatepass_handler(self):
         self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                          self.BOM + b"abc\xed\xa0\x80def")
@@ -2884,8 +2895,6 @@ def test_escape_encode(self):
 
 class SurrogateEscapeTest(unittest.TestCase):
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_utf8(self):
         # Bad byte
         self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
@@ -2898,8 +2907,6 @@ def test_utf8(self):
         self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                          b"\xed\xb0\x80")
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_ascii(self):
         # bad byte
         self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
@@ -2916,8 +2923,6 @@ def test_charmap(self):
         self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                          b"foo\xa5bar")
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_latin1(self):
         # Issue6373
         self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
@@ -3561,8 +3566,6 @@ class ASCIITest(unittest.TestCase):
     def test_encode(self):
         self.assertEqual('abc123'.encode('ascii'), b'abc123')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_encode_error(self):
         for data, error_handler, expected in (
             ('[\x80\xff\u20ac]', 'ignore', b'[]'),
@@ -3585,8 +3588,6 @@ def test_encode_surrogateescape_error(self):
     def test_decode(self):
         self.assertEqual(b'abc'.decode('ascii'), 'abc')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_decode_error(self):
         for data, error_handler, expected in (
             (b'[\x80\xff]', 'ignore', '[]'),
@@ -3609,8 +3610,6 @@ def test_encode(self):
             with self.subTest(data=data, expected=expected):
                 self.assertEqual(data.encode('latin1'), expected)
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_encode_errors(self):
         for data, error_handler, expected in (
             ('[\u20ac\udc80]', 'ignore', b'[]'),
 
@@ -86,8 +86,6 @@ def test_scanstring(self):
             scanstring('["Bad value", truth]', 2, True),
             ('Bad value', 12))
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_surrogates(self):
         scanstring = self.json.decoder.scanstring
         def assertScan(given, expect):
 
@@ -945,15 +945,13 @@ def test_leak(self):
         """)
         self.check_leak(code, 'file descriptors')
 
-    @unittest.expectedFailureIfWindows('TODO: RUSTPYTHON Windows')
     def test_list_tests(self):
         # test --list-tests
         tests = [self.create_test() for i in range(5)]
         output = self.run_tests('--list-tests', *tests)
         self.assertEqual(output.rstrip().splitlines(),
                          tests)
 
-    @unittest.expectedFailureIfWindows('TODO: RUSTPYTHON Windows')
     def test_list_cases(self):
         # test --list-cases
         code = textwrap.dedent("""
 
@@ -6,8 +6,6 @@
 from stringprep import *
 
 class StringprepTests(unittest.TestCase):
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test(self):
         self.assertTrue(in_table_a1("\u0221"))
         self.assertFalse(in_table_a1("\u0222"))
 
@@ -1198,8 +1198,6 @@ def test_universal_newlines_communicate_encodings(self):
             stdout, stderr = popen.communicate(input='')
             self.assertEqual(stdout, '1\n2\n3\n4')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_communicate_errors(self):
         for errors, expected in [
             ('ignore', ''),
 
@@ -2086,11 +2086,6 @@ class UstarUnicodeTest(UnicodeTest, unittest.TestCase):
 
     format = tarfile.USTAR_FORMAT
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
-    def test_uname_unicode(self):
-        super().test_uname_unicode()
-
     # Test whether the utf-8 encoded version of a filename exceeds the 100
     # bytes name field limit (every occurrence of '\xff' will be expanded to 2
     # bytes).
@@ -2170,13 +2165,6 @@ class GNUUnicodeTest(UnicodeTest, unittest.TestCase):
 
     format = tarfile.GNU_FORMAT
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
-    def test_uname_unicode(self):
-        super().test_uname_unicode()
-
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_bad_pax_header(self):
         # Test for issue #8633. GNU tar <= 1.23 creates raw binary fields
         # without a hdrcharset=BINARY header.
@@ -2198,8 +2186,6 @@ class PAXUnicodeTest(UnicodeTest, unittest.TestCase):
     # PAX_FORMAT ignores encoding in write mode.
     test_unicode_filename_error = None
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_binary_header(self):
         # Test a POSIX.1-2008 compatible header with a hdrcharset=BINARY field.
         for encoding, name in (
 
@@ -608,8 +608,6 @@ def test_bytes_comparison(self):
             self.assertEqual('abc' == bytearray(b'abc'), False)
             self.assertEqual('abc' != bytearray(b'abc'), True)
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_comparison(self):
         # Comparisons:
         self.assertEqual('abc', 'abc')
@@ -830,8 +828,6 @@ def test_isidentifier_legacy(self):
             warnings.simplefilter('ignore', DeprecationWarning)
             self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_isprintable(self):
         self.assertTrue("".isprintable())
         self.assertTrue(" ".isprintable())
@@ -847,8 +843,6 @@ def test_isprintable(self):
         self.assertTrue('\U0001F46F'.isprintable())
         self.assertFalse('\U000E0020'.isprintable())
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_surrogates(self):
         for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
                   'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
@@ -1827,8 +1821,6 @@ def test_codecs_utf7(self):
                                     'ill-formed sequence'):
             b'+@'.decode('utf-7')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_codecs_utf8(self):
         self.assertEqual(''.encode('utf-8'), b'')
         self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
 
@@ -53,17 +53,13 @@ def __rmod__(self, other):
         str3 = ustr3('TEST')
         self.assertEqual(fmt2 % str3, 'value is TEST')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_encode_default_args(self):
         self.checkequal(b'hello', 'hello', 'encode')
         # Check that encoding defaults to utf-8
         self.checkequal(b'\xf0\xa3\x91\x96', '\U00023456', 'encode')
         # Check that errors defaults to 'strict'
         self.checkraises(UnicodeError, '\ud800', 'encode')
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_encode_explicit_none_args(self):
         self.checkequal(b'hello', 'hello', 'encode', None, None)
         # Check that encoding defaults to utf-8
 
@@ -730,6 +730,7 @@ def testTraceback(self):
 
     @unittest.skipIf(os_helper.TESTFN_UNENCODABLE is None,
                      "need an unencodable filename")
+    @unittest.expectedFailureIfWindows("TODO: RUSTPYTHON")
     def testUnencodable(self):
         filename = os_helper.TESTFN_UNENCODABLE + ".zip"
         self.addCleanup(os_helper.unlink, filename)
 
@@ -122,18 +122,18 @@ impl CodePoint {
 
     /// Returns the numeric value of the code point if it is a leading surrogate.
     #[inline]
-    pub fn to_lead_surrogate(self) -> Option<u16> {
+    pub fn to_lead_surrogate(self) -> Option<LeadSurrogate> {
         match self.value {
-            lead @ 0xD800..=0xDBFF => Some(lead as u16),
+            lead @ 0xD800..=0xDBFF => Some(LeadSurrogate(lead as u16)),
             _ => None,
         }
     }
 
     /// Returns the numeric value of the code point if it is a trailing surrogate.
     #[inline]
-    pub fn to_trail_surrogate(self) -> Option<u16> {
+    pub fn to_trail_surrogate(self) -> Option<TrailSurrogate> {
         match self.value {
-            trail @ 0xDC00..=0xDFFF => Some(trail as u16),
+            trail @ 0xDC00..=0xDFFF => Some(TrailSurrogate(trail as u16)),
             _ => None,
         }
     }
@@ -216,6 +216,18 @@ impl PartialEq<CodePoint> for char {
     }
 }
 
+#[derive(Clone, Copy)]
+pub struct LeadSurrogate(u16);
+
+#[derive(Clone, Copy)]
+pub struct TrailSurrogate(u16);
+
+impl LeadSurrogate {
+    pub fn merge(self, trail: TrailSurrogate) -> char {
+        decode_surrogate_pair(self.0, trail.0)
+    }
+}
+
 /// An owned, growable string of well-formed WTF-8 data.
 ///
 /// Similar to `String`, but can additionally contain surrogate code points
@@ -291,6 +303,14 @@ impl Wtf8Buf {
         Wtf8Buf { bytes: value }
     }
 
+    /// Create a WTF-8 string from a WTF-8 byte vec.
+    pub fn from_bytes(value: Vec<u8>) -> Result<Self, Vec<u8>> {
+        match Wtf8::from_bytes(&value) {
+            Some(_) => Ok(unsafe { Self::from_bytes_unchecked(value) }),
+            None => Err(value),
+        }
+    }
+
     /// Creates a WTF-8 string from a UTF-8 `String`.
     ///
     /// This takes ownership of the `String` and does not copy.
@@ -750,15 +770,10 @@ impl Wtf8 {
     }
 
     fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
-        let [a, b, c, ..] = *b else { return None };
-        if (a & 0xf0) == 0xe0 && (b & 0xc0) == 0x80 && (c & 0xc0) == 0x80 {
-            // it's a three-byte code
-            let c = ((a as u32 & 0x0f) << 12) + ((b as u32 & 0x3f) << 6) + (c as u32 & 0x3f);
-            let 0xD800..=0xDFFF = c else { return None };
-            Some(CodePoint { value: c })
-        } else {
-            None
-        }
+        let [0xed, b2 @ (0xa0..), b3, ..] = *b else {
+            return None;
+        };
+        Some(decode_surrogate(b2, b3).into())
     }
 
     /// Returns the length, in WTF-8 bytes.
@@ -914,14 +929,6 @@ impl Wtf8 {
         }
     }
 
-    #[inline]
-    fn final_lead_surrogate(&self) -> Option<u16> {
-        match self.bytes {
-            [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
-            _ => None,
-        }
-    }
-
     pub fn is_code_point_boundary(&self, index: usize) -> bool {
         is_code_point_boundary(self, index)
     }
@@ -1222,6 +1229,12 @@ fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
     0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
 }
 
+#[inline]
+fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
+    let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
+    unsafe { char::from_u32_unchecked(code_point) }
+}
+
 /// Copied from str::is_char_boundary
 #[inline]
 fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
 
@@ -13,6 +13,7 @@ mod _json {
         types::{Callable, Constructor},
     };
     use malachite_bigint::BigInt;
+    use rustpython_common::wtf8::Wtf8Buf;
     use std::str::FromStr;
 
     #[pyattr(name = "make_scanner")]
@@ -253,8 +254,8 @@ mod _json {
         end: usize,
         strict: OptionalArg<bool>,
         vm: &VirtualMachine,
-    ) -> PyResult<(String, usize)> {
-        machinery::scanstring(s.as_str(), end, strict.unwrap_or(true))
+    ) -> PyResult<(Wtf8Buf, usize)> {
+        machinery::scanstring(s.as_wtf8(), end, strict.unwrap_or(true))
             .map_err(|e| py_decode_error(e, s, vm))
     }
 }