RustPython · coolreader18 · Sep 23, 2021 · Sep 22, 2021 · Sep 23, 2021
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/common/Cargo.toml b/common/Cargo.toml
@@ -22,3 +22,4 @@ rand = "0.8"
 volatile = "0.3"
 radium = "0.6"
 libc = "0.2.101"
+ascii = "1.0"
diff --git a/common/src/encodings.rs b/common/src/encodings.rs
@@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
 
 pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
 
+pub trait StrBuffer: AsRef<str> { // bound required of `ErrorHandler::StrBuf`
+    fn is_ascii(&self) -> bool { // default goes through `as_ref()`; implementors may override it
+        self.as_ref().is_ascii()
+    }
+}
+
 pub trait ErrorHandler {
     type Error;
-    type StrBuf: AsRef<str>;
+    type StrBuf: StrBuffer;
     type BytesBuf: AsRef<[u8]>;
     fn handle_encode_error(
         &self,
-        byte_range: Range<usize>,
+        data: &str,
+        char_range: Range<usize>,
         reason: &str,
     ) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
     fn handle_decode_error(
@@ -20,12 +27,95 @@ pub trait ErrorHandler {
         reason: &str,
     ) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
     fn error_oob_restart(&self, i: usize) -> Self::Error;
+    fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
 }
 pub enum EncodeReplace<S, B> {
     Str(S),
     Bytes(B),
 }
 
+struct DecodeError<'a> { // failure value of the `decode` closure of `decode_utf8_compatible`
+    valid_prefix: &'a str, // part of the input in front of the error
+    rest: &'a [u8], // input from the first undecodable byte onward
+    err_len: Option<usize>, // length of the bad sequence; `None` = runs to the end of the data
+}
+/// # Safety
+/// `valid_up_to <= v.len()` must hold and `v[..valid_up_to]` must be valid utf8
+unsafe fn make_decode_err(v: &[u8], valid_up_to: usize, err_len: Option<usize>) -> DecodeError<'_> {
+    let valid_prefix = core::str::from_utf8_unchecked(v.get_unchecked(..valid_up_to));
+    let rest = v.get_unchecked(valid_up_to..); // neither half is bounds- or utf8-checked here
+    DecodeError { // `v` split at `valid_up_to`, with `err_len` passed through untouched
+        valid_prefix,
+        rest,
+        err_len,
+    }
+}
+
+enum HandleResult<'a> { // verdict of the `handle_error` closure of `decode_utf8_compatible`
+    Done, // stop without reporting; the returned index ends at the error position
+    Error { // report the failure through `ErrorHandler::handle_decode_error`
+        err_len: Option<usize>, // `None` extends the reported range to the end of the data
+        reason: &'a str, // message handed to the error handler
+    },
+}
+fn decode_utf8_compatible<E: ErrorHandler, DecodeF, ErrF>(
+    data: &[u8], // bytes to decode
+    errors: &E, // consulted whenever `handle_error` reports an error
+    decode: DecodeF, // decodes a slice in full, or returns its valid prefix plus the rest
+    handle_error: ErrF, // decides per failure: stop quietly (`Done`) or report (`Error`)
+) -> Result<(String, usize), E::Error> // (decoded text, index at which decoding stopped)
+where
+    DecodeF: Fn(&[u8]) -> Result<&str, DecodeError<'_>>,
+    ErrF: Fn(&[u8], Option<usize>) -> HandleResult<'_>,
+{ // decode loop shared by the `utf8` and `ascii` modules
+    if data.is_empty() {
+        return Ok((String::new(), 0));
+    }
+    // we need to coerce the lifetime to that of the function body rather than the
+    // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
+    let mut data = &*data;
+    let mut data_from_err: E::BytesBuf;
+    let mut out = String::with_capacity(data.len());
+    let mut remaining_index = 0; // offset in `data` at which `remaining_data` starts
+    let mut remaining_data = data;
+    loop {
+        match decode(remaining_data) {
+            Ok(decoded) => { // everything that was left decoded cleanly
+                out.push_str(decoded);
+                remaining_index += decoded.len();
+                break;
+            }
+            Err(e) => {
+                out.push_str(e.valid_prefix);
+                match handle_error(e.rest, e.err_len) {
+                    HandleResult::Done => { // stop here; only the valid prefix counts as decoded
+                        remaining_index += e.valid_prefix.len();
+                        break;
+                    }
+                    HandleResult::Error { err_len, reason } => {
+                        let err_idx = remaining_index + e.valid_prefix.len(); // offset in `data`
+                        let err_range = // `None` length: the range runs to the end of `data`
+                            err_idx..err_len.map_or_else(|| data.len(), |len| err_idx + len);
+                        let (replace, new_data, restart) =
+                            errors.handle_decode_error(data, err_range, reason)?;
+                        out.push_str(replace.as_ref());
+                        if let Some(new_data) = new_data { // the handler may swap in new input
+                            data_from_err = new_data;
+                            data = data_from_err.as_ref();
+                        }
+                        remaining_data = data // resume at the offset chosen by the handler
+                            .get(restart..)
+                            .ok_or_else(|| errors.error_oob_restart(restart))?;
+                        remaining_index = restart;
+                        continue;
+                    }
+                }
+            }
+        }
+    }
+    Ok((out, remaining_index))
+}
+
 pub mod utf8 {
     use super::*;
 
@@ -41,75 +131,118 @@ pub mod utf8 {
         errors: &E,
         final_decode: bool,
     ) -> Result<(String, usize), E::Error> {
-        if data.is_empty() {
-            return Ok((String::new(), 0));
-        }
-        // we need to coerce the lifetime to that of the function body rather than the
-        // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
-        let mut data = &*data;
-        let mut data_from_err: E::BytesBuf;
-        let mut out = String::with_capacity(data.len());
-        let mut remaining_index = 0;
-        let mut remaining_data = data;
-        macro_rules! handle_error {
-            ($range:expr, $reason:expr) => {{
-                let (replace, new_data, restart) =
-                    errors.handle_decode_error(data, $range, $reason)?;
-                out.push_str(replace.as_ref());
-                if let Some(new_data) = new_data {
-                    data_from_err = new_data;
-                    data = data_from_err.as_ref();
+        decode_utf8_compatible(
+            data,
+            errors,
+            |v| {
+                core::str::from_utf8(v).map_err(|e| {
+                    // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
+                    //         is valid utf8
+                    unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
+                })
+            },
+            |rest, err_len| {
+                let first_err = rest[0];
+                if matches!(first_err, 0x80..=0xc1 | 0xf5..=0xff) {
+                    HandleResult::Error {
+                        err_len: Some(1),
+                        reason: "invalid start byte",
+                    }
+                } else if err_len.is_none() {
+                    // error_len() == None means unexpected eof
+                    if final_decode {
+                        HandleResult::Error {
+                            err_len,
+                            reason: "unexpected end of data",
+                        }
+                    } else {
+                        HandleResult::Done
+                    }
+                } else if !final_decode && matches!(rest, [0xed, 0xa0..=0xbf]) {
+                    // truncated surrogate
+                    HandleResult::Done
+                } else {
+                    HandleResult::Error {
+                        err_len,
+                        reason: "invalid continuation byte",
+                    }
                 }
-                remaining_data = data
-                    .get(restart..)
-                    .ok_or_else(|| errors.error_oob_restart(restart))?;
-                remaining_index = restart;
-                continue;
-            }};
-        }
+            },
+        )
+    }
+}
+
+pub mod ascii {
+    use super::*;
+    use ::ascii::AsciiStr;
+
+    pub const ENCODING_NAME: &str = "ascii";
+
+    const ERR_REASON: &str = "ordinal not in range(128)";
+
+    #[inline]
+    pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
+        let full_data = s; // whole input; error ranges are char indices into it
+        let mut data = s; // tail of `full_data` that still has to be encoded
+        let mut char_data_index = 0; // char index in `full_data` at which `data` starts
+        let mut out = Vec::<u8>::new();
         loop {
-            match core::str::from_utf8(remaining_data) {
-                Ok(decoded) => {
-                    out.push_str(decoded);
-                    remaining_index += decoded.len();
+            match data
+                .char_indices()
+                .enumerate()
+                .find(|(_, (_, c))| !c.is_ascii()) // (char offset, (byte offset, char))
+            {
+                None => {
+                    out.extend_from_slice(data.as_bytes());
                     break;
                 }
-                Err(e) => {
-                    let (valid_prefix, rest, first_err) = unsafe {
-                        let index = e.valid_up_to();
-                        // SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
-                        let valid =
-                            std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
-                        let rest = remaining_data.get_unchecked(index..);
-                        // SAFETY: if index didn't have something at it, this wouldn't be an error
-                        let first_err = *remaining_data.get_unchecked(index);
-                        (valid, rest, first_err)
-                    };
-                    out.push_str(valid_prefix);
-                    let err_idx = remaining_index + e.valid_up_to();
-                    remaining_data = rest;
-                    remaining_index += valid_prefix.len();
-                    if (0x80..0xc2).contains(&first_err) || (0xf5..=0xff).contains(&first_err) {
-                        handle_error!(err_idx..err_idx + 1, "invalid start byte");
-                    }
-                    let err_len = match e.error_len() {
-                        Some(l) => l,
-                        // error_len() == None means unexpected eof
-                        None => {
-                            if !final_decode {
-                                break;
+                Some((char_i, (byte_i, _))) => {
+                    out.extend_from_slice(&data.as_bytes()[..byte_i]);
+                    let char_start = char_data_index + char_i;
+                    // length, in chars, of the run of consecutive non-ascii chars starting at this one
+                    let non_ascii_run_length =
+                        data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
+                    let char_range = char_start..char_start + non_ascii_run_length;
+                    let (replace, char_restart) =
+                        errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
+                    match replace {
+                        EncodeReplace::Str(s) => {
+                            if !s.is_ascii() { // a str replacement must itself be ascii
+                                return Err(
+                                    errors.error_encoding(full_data, char_range, ERR_REASON)
+                                );
                             }
-                            handle_error!(err_idx..data.len(), "unexpected end of data");
+                            out.extend_from_slice(s.as_ref().as_bytes());
+                        }
+                        EncodeReplace::Bytes(b) => { // bytes are emitted unchanged
+                            out.extend_from_slice(b.as_ref());
                         }
-                    };
-                    if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
-                        // truncated surrogate
-                        break;
                     }
-                    handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
+                    data = crate::str::try_get_chars(full_data, char_restart..)
+                        .ok_or_else(|| errors.error_oob_restart(char_restart))?;
+                    char_data_index = char_restart; // `data` now starts at this char index
+                    continue;
                 }
             }
         }
-        Ok((out, remaining_index))
+        Ok(out)
+    }
+
+    pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
+        decode_utf8_compatible(
+            data,
+            errors,
+            |v| { // valid ascii is valid utf8: the `AsciiStr` is returned as `&str`
+                AsciiStr::from_ascii(v).map(|s| s.as_str()).map_err(|e| {
+                    // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
+                    //         is valid ascii & therefore valid utf8
+                    unsafe { make_decode_err(v, e.valid_up_to(), Some(1)) }
+                })
+            },
+            |_rest, err_len| HandleResult::Error { // every failure is reported, never deferred
+                err_len,
+                reason: ERR_REASON,
+            },
+        )
     }
 }