sheeeng
diff --git a/‎Lib/test/test_cmd_line_script.py
Lines changed: 2 additions & 0 deletions b/‎Lib/test/test_cmd_line_script.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎common/src/encodings.rs
Lines changed: 74 additions & 62 deletions b/‎common/src/encodings.rs
Lines changed: 74 additions & 62 deletions
diff --git a/‎common/src/str.rs
Lines changed: 10 additions & 1 deletion b/‎common/src/str.rs
Lines changed: 10 additions & 1 deletion
diff --git a/‎common/src/wtf8/mod.rs
Lines changed: 70 additions & 1 deletion b/‎common/src/wtf8/mod.rs
Lines changed: 70 additions & 1 deletion
@@ -574,6 +574,8 @@ def test_pep_409_verbiage(self):
             self.assertTrue(text[1].startswith('  File '))
             self.assertTrue(text[3].startswith('NameError'))
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
     def test_non_ascii(self):
         # Mac OS X denies the creation of a file with an invalid UTF-8 name.
         # Windows allows creating a name with an arbitrary bytes name, but
 
@@ -1,14 +1,22 @@
 use std::ops::Range;
 
+use num_traits::ToPrimitive;
+
+use crate::str::StrKind;
 use crate::wtf8::{Wtf8, Wtf8Buf};
 
 pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
 
 pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
 
 pub trait StrBuffer: AsRef<Wtf8> {
-    fn is_ascii(&self) -> bool {
-        self.as_ref().is_ascii()
+    fn is_compatible_with(&self, kind: StrKind) -> bool { // reports whether this buffer's contents satisfy the given string kind
+        let s = self.as_ref(); // view the buffer as a `Wtf8` slice
+        match kind {
+            StrKind::Ascii => s.is_ascii(),
+            StrKind::Utf8 => s.is_utf8(),
+            StrKind::Wtf8 => true, // no check is made for the Wtf8 kind: any buffer is accepted
+        }
     }
 }
 
@@ -18,7 +26,7 @@ pub trait ErrorHandler {
     type BytesBuf: AsRef<[u8]>;
     fn handle_encode_error(
         &self,
-        data: &str,
+        data: &Wtf8,
         char_range: Range<usize>,
         reason: &str,
     ) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
@@ -29,7 +37,7 @@ pub trait ErrorHandler {
         reason: &str,
     ) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
     fn error_oob_restart(&self, i: usize) -> Self::Error;
-    fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
+    fn error_encoding(&self, data: &Wtf8, char_range: Range<usize>, reason: &str) -> Self::Error;
 }
 pub enum EncodeReplace<S, B> {
     Str(S),
@@ -118,14 +126,61 @@ where
     Ok((out, remaining_index))
 }
 
+#[inline]
+fn encode_utf8_compatible<E: ErrorHandler>( // shared encoder: copies encodable bytes of `s` verbatim and routes unencodable runs through `errors`
+    s: &Wtf8, // text to encode
+    errors: &E, // error handler consulted for every run `target_kind` cannot encode
+    err_reason: &str, // reason string forwarded unchanged to the error handler
+    target_kind: StrKind, // decides, via `can_encode`, which code points may be copied as-is
+) -> Result<Vec<u8>, E::Error> {
+    let full_data = s; // the whole input; this is what the error handler receives
+    let mut data = s; // the not-yet-processed tail of the input
+    let mut char_data_index = 0; // code point index at which `data` starts within `full_data`
+    let mut out = Vec::<u8>::new();
+    while let Some((char_i, (byte_i, _))) = data // find the first code point in `data` that cannot be encoded
+        .code_point_indices()
+        .enumerate()
+        .find(|(_, (_, c))| !target_kind.can_encode(*c))
+    {
+        out.extend_from_slice(&data.as_bytes()[..byte_i]); // everything before it is copied unchanged
+        let char_start = char_data_index + char_i; // position of the offending code point in `full_data`
+
+        // number of non-compatible chars between the first non-compatible char and the next compatible char
+        let non_compat_run_length = data[byte_i..]
+            .code_points()
+            .take_while(|c| !target_kind.can_encode(*c))
+            .count();
+        let char_range = char_start..char_start + non_compat_run_length;
+        let (replace, char_restart) = // handler yields a replacement plus the code point index to resume from
+            errors.handle_encode_error(full_data, char_range.clone(), err_reason)?;
+        match replace {
+            EncodeReplace::Str(s) => {
+                if s.is_compatible_with(target_kind) { // a string replacement must itself fit `target_kind`
+                    out.extend_from_slice(s.as_ref().as_bytes());
+                } else {
+                    return Err(errors.error_encoding(full_data, char_range, err_reason));
+                }
+            }
+            EncodeReplace::Bytes(b) => {
+                out.extend_from_slice(b.as_ref()); // byte replacements are appended without any check
+            }
+        }
+        data = crate::str::try_get_codepoints(full_data, char_restart..) // continue from the handler-chosen index
+            .ok_or_else(|| errors.error_oob_restart(char_restart))?; // a `None` lookup becomes an out-of-bounds-restart error
+        char_data_index = char_restart;
+    }
+    out.extend_from_slice(data.as_bytes()); // the loop ended, so the remaining tail holds only encodable code points
+    Ok(out)
+}
+
 pub mod utf8 {
     use super::*;
 
     pub const ENCODING_NAME: &str = "utf-8";
 
     #[inline]
-    pub fn encode<E: ErrorHandler>(s: &str, _errors: &E) -> Result<Vec<u8>, E::Error> {
-        Ok(s.as_bytes().to_vec())
+    pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> { // encodes `s` to UTF-8 bytes; code points rejected by `StrKind::Utf8` are handed to `errors`
+        encode_utf8_compatible(s, errors, "surrogates not allowed", StrKind::Utf8) // delegates to the shared helper with the Utf8 target kind
     }
 
     pub fn decode<E: ErrorHandler>(
@@ -175,21 +230,22 @@ pub mod utf8 {
 }
 
 pub mod latin_1 {
+
     use super::*;
 
     pub const ENCODING_NAME: &str = "latin-1";
 
     const ERR_REASON: &str = "ordinal not in range(256)";
 
     #[inline]
-    pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
+    pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
         let full_data = s;
         let mut data = s;
         let mut char_data_index = 0;
         let mut out = Vec::<u8>::new();
         loop {
             match data
-                .char_indices()
+                .code_point_indices()
                 .enumerate()
                 .find(|(_, (_, c))| !c.is_ascii())
             {
@@ -200,17 +256,16 @@ pub mod latin_1 {
                 Some((char_i, (byte_i, ch))) => {
                     out.extend_from_slice(&data.as_bytes()[..byte_i]);
                     let char_start = char_data_index + char_i;
-                    if (ch as u32) <= 255 {
-                        out.push(ch as u8);
-                        let char_restart = char_start + 1;
-                        data = crate::str::try_get_chars(full_data, char_restart..)
-                            .ok_or_else(|| errors.error_oob_restart(char_restart))?;
-                        char_data_index = char_restart;
+                    if let Some(byte) = ch.to_u32().to_u8() {
+                        out.push(byte);
+                        // if the codepoint is between 128..=255, its UTF-8 length is 2
+                        data = &data[byte_i + 2..];
+                        char_data_index = char_start + 1;
                     } else {
                         // number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
                         let non_latin_1_run_length = data[byte_i..]
-                            .chars()
-                            .take_while(|c| (*c as u32) > 255)
+                            .code_points()
+                            .take_while(|c| c.to_u32() > 255)
                             .count();
                         let char_range = char_start..char_start + non_latin_1_run_length;
                         let (replace, char_restart) = errors.handle_encode_error(
@@ -231,7 +286,7 @@ pub mod latin_1 {
                                 out.extend_from_slice(b.as_ref());
                             }
                         }
-                        data = crate::str::try_get_chars(full_data, char_restart..)
+                        data = crate::str::try_get_codepoints(full_data, char_restart..)
                             .ok_or_else(|| errors.error_oob_restart(char_restart))?;
                         char_data_index = char_restart;
                     }
@@ -258,51 +313,8 @@ pub mod ascii {
     const ERR_REASON: &str = "ordinal not in range(128)";
 
     #[inline]
-    pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
-        let full_data = s;
-        let mut data = s;
-        let mut char_data_index = 0;
-        let mut out = Vec::<u8>::new();
-        loop {
-            match data
-                .char_indices()
-                .enumerate()
-                .find(|(_, (_, c))| !c.is_ascii())
-            {
-                None => {
-                    out.extend_from_slice(data.as_bytes());
-                    break;
-                }
-                Some((char_i, (byte_i, _))) => {
-                    out.extend_from_slice(&data.as_bytes()[..byte_i]);
-                    let char_start = char_data_index + char_i;
-                    // number of non-ascii chars between the first non-ascii char and the next ascii char
-                    let non_ascii_run_length =
-                        data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
-                    let char_range = char_start..char_start + non_ascii_run_length;
-                    let (replace, char_restart) =
-                        errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
-                    match replace {
-                        EncodeReplace::Str(s) => {
-                            if !s.is_ascii() {
-                                return Err(
-                                    errors.error_encoding(full_data, char_range, ERR_REASON)
-                                );
-                            }
-                            out.extend_from_slice(s.as_ref().as_bytes());
-                        }
-                        EncodeReplace::Bytes(b) => {
-                            out.extend_from_slice(b.as_ref());
-                        }
-                    }
-                    data = crate::str::try_get_chars(full_data, char_restart..)
-                        .ok_or_else(|| errors.error_oob_restart(char_restart))?;
-                    char_data_index = char_restart;
-                    continue;
-                }
-            }
-        }
-        Ok(out)
+    pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> { // encodes `s` to ASCII bytes; non-ASCII runs are handed to `errors`
+        encode_utf8_compatible(s, errors, ERR_REASON, StrKind::Ascii) // the removed hand-written loop now lives in the shared helper
     }
 
     pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(Wtf8Buf, usize), E::Error> {
 
@@ -14,7 +14,7 @@ pub type wchar_t = libc::wchar_t;
 pub type wchar_t = u32;
 
 /// Utf8 + state.ascii (+ PyUnicode_Kind in future)
-#[derive(Debug, Copy, Clone, PartialEq)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub enum StrKind {
     Ascii,
     Utf8,
@@ -41,6 +41,15 @@ impl StrKind {
     pub fn is_utf8(&self) -> bool {
         matches!(self, Self::Ascii | Self::Utf8)
     }
+
+    #[inline(always)]
+    pub fn can_encode(&self, code: CodePoint) -> bool { // reports whether a single code point is allowed in a string of this kind
+        match self {
+            StrKind::Ascii => code.is_ascii(),
+            StrKind::Utf8 => code.to_char().is_some(), // allowed only when the code point converts to a `char`; presumably this excludes surrogates - confirm in `CodePoint::to_char`
+            StrKind::Wtf8 => true, // the Wtf8 kind accepts every code point
+        }
+    }
 }
 
 pub trait DeduceStrKind {
 
@@ -49,7 +49,7 @@ use std::collections::TryReserveError;
 use std::string::String;
 use std::vec::Vec;
 
-use bstr::ByteSlice;
+use bstr::{ByteSlice, ByteVec};
 
 mod core_char;
 mod core_str;
@@ -168,6 +168,10 @@ impl CodePoint {
     pub fn len_wtf8(&self) -> usize {
         len_utf8(self.value)
     }
+
+    pub fn is_ascii(&self) -> bool { // ASCII test for a code point, evaluated through `is_char_and`
+        self.is_char_and(|c| c.is_ascii()) // presumably false when the code point is not a `char` - `is_char_and` is defined outside this view
+    }
 }
 
 impl From<u16> for CodePoint {
@@ -436,6 +440,13 @@ impl Wtf8Buf {
         self.push_wtf8(code_point.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
     }
 
+    pub fn pop(&mut self) -> Option<CodePoint> { // removes the last code point and returns it; `None` when there is no code point to take
+        let ch = self.code_points().next_back()?; // last code point, or early return with `None`
+        let newlen = self.len() - ch.len_wtf8(); // length once that code point's encoded bytes are removed
+        self.bytes.truncate(newlen);
+        Some(ch)
+    }
+
     /// Shortens a string to the specified length.
     ///
     /// # Panics
@@ -448,6 +459,20 @@ impl Wtf8Buf {
         self.bytes.truncate(new_len)
     }
 
+    /// Inserts a codepoint into this `Wtf8Buf` at a byte position.
+    #[inline]
+    pub fn insert(&mut self, idx: usize, c: CodePoint) {
+        self.insert_wtf8(idx, c.encode_wtf8(&mut [0; MAX_LEN_UTF8])) // encodes `c` into a temporary buffer, then delegates to `insert_wtf8`
+    }
+
+    /// Inserts a WTF-8 slice into this `Wtf8Buf` at a byte position.
+    #[inline]
+    pub fn insert_wtf8(&mut self, idx: usize, w: &Wtf8) {
+        assert!(is_code_point_boundary(self, idx)); // panics when `idx` is not a code point boundary of this buffer
+
+        self.bytes.insert_str(idx, w) // splices the bytes of `w` in at byte offset `idx`
+    }
+
     /// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
     #[inline]
     pub fn into_bytes(self) -> Vec<u8> {
@@ -914,6 +939,21 @@ impl Wtf8 {
             .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
     }
 
+    pub fn trim(&self) -> &Self { // returns the sub-slice produced by trimming the underlying bytes at both ends
+        let w = self.bytes.trim(); // presumably bstr's `ByteSlice::trim` (whitespace trimming) - confirm against the bstr docs
+        unsafe { Wtf8::from_bytes_unchecked(w) } // NOTE(review): relies on the trimmed bytes still being well-formed WTF-8 - confirm trimming only cuts at character boundaries
+    }
+
+    pub fn trim_start(&self) -> &Self { // returns the sub-slice produced by trimming the start of the underlying bytes
+        let w = self.bytes.trim_start(); // presumably bstr's `ByteSlice::trim_start` - confirm against the bstr docs
+        unsafe { Wtf8::from_bytes_unchecked(w) } // NOTE(review): relies on the trimmed bytes still being well-formed WTF-8 - confirm trimming only cuts at character boundaries
+    }
+
+    pub fn trim_end(&self) -> &Self { // returns the sub-slice produced by trimming the end of the underlying bytes
+        let w = self.bytes.trim_end(); // presumably bstr's `ByteSlice::trim_end` - confirm against the bstr docs
+        unsafe { Wtf8::from_bytes_unchecked(w) } // NOTE(review): relies on the trimmed bytes still being well-formed WTF-8 - confirm trimming only cuts at character boundaries
+    }
+
     pub fn trim_start_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
         let mut iter = self.code_points();
         loop {
@@ -958,6 +998,15 @@ impl Wtf8 {
         memchr::memmem::rfind(self.as_bytes(), pat.as_bytes())
     }
 
+    pub fn contains(&self, pat: &Wtf8) -> bool { // substring test carried out on the underlying bytes
+        self.bytes.contains_str(pat)
+    }
+
+    pub fn contains_code_point(&self, pat: CodePoint) -> bool { // tests whether the encoded form of `pat` occurs in the underlying bytes
+        self.bytes
+            .contains_str(pat.encode_wtf8(&mut [0; MAX_LEN_UTF8])) // `pat` is encoded into a temporary buffer and searched for as a substring
+    }
+
     pub fn get(&self, range: impl ops::RangeBounds<usize>) -> Option<&Self> {
         let start = match range.start_bound() {
             ops::Bound::Included(&i) => i,
@@ -977,6 +1026,26 @@ impl Wtf8 {
             None
         }
     }
+
+    pub fn ends_with(&self, w: &Wtf8) -> bool { // suffix test carried out on the underlying bytes
+        self.bytes.ends_with_str(w)
+    }
+
+    pub fn starts_with(&self, w: &Wtf8) -> bool { // prefix test carried out on the underlying bytes
+        self.bytes.starts_with_str(w)
+    }
+
+    pub fn strip_prefix(&self, w: &Wtf8) -> Option<&Self> { // returns the rest of the slice after the prefix `w`, or `None` when the bytes do not start with `w`
+        self.bytes
+            .strip_prefix(w.as_bytes())
+            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) }) // NOTE(review): assumes removing a well-formed WTF-8 prefix leaves a well-formed remainder - confirm
+    }
+
+    pub fn strip_suffix(&self, w: &Wtf8) -> Option<&Self> { // returns the slice before the suffix `w`, or `None` when the bytes do not end with `w`
+        self.bytes
+            .strip_suffix(w.as_bytes())
+            .map(|w| unsafe { Wtf8::from_bytes_unchecked(w) }) // NOTE(review): assumes removing a well-formed WTF-8 suffix leaves a well-formed remainder - confirm
+    }
 }
 
 impl AsRef<Wtf8> for str {