Skip to content

Implement ascii codec in Rust #3118

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ rand = "0.8"
volatile = "0.3"
radium = "0.6"
libc = "0.2.101"
ascii = "1.0"
257 changes: 195 additions & 62 deletions common/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;

pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;

/// A string-like buffer that can be viewed as `&str`.
///
/// Implementors get a default ASCII check; they may override it when they
/// have a cheaper way to answer the question.
pub trait StrBuffer: AsRef<str> {
    /// Returns `true` when every character of the underlying string is ASCII.
    fn is_ascii(&self) -> bool {
        let text: &str = self.as_ref();
        text.is_ascii()
    }
}

pub trait ErrorHandler {
type Error;
type StrBuf: AsRef<str>;
type StrBuf: StrBuffer;
type BytesBuf: AsRef<[u8]>;
fn handle_encode_error(
&self,
byte_range: Range<usize>,
data: &str,
char_range: Range<usize>,
reason: &str,
) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
fn handle_decode_error(
Expand All @@ -20,12 +27,95 @@ pub trait ErrorHandler {
reason: &str,
) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
fn error_oob_restart(&self, i: usize) -> Self::Error;
fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
}
/// Replacement value carried in an `EncodeErrorResult`, i.e. what an encode
/// error handler hands back in place of the characters that failed to encode.
pub enum EncodeReplace<S, B> {
/// The replacement is given as a string of type `S`.
Str(S),
/// The replacement is given as a byte buffer of type `B`.
Bytes(B),
}

/// Outcome of a failed decode attempt: the part that decoded cleanly plus the
/// bytes that remain, starting at the point of failure.
struct DecodeError<'a> {
/// Leading part of the input that was successfully decoded.
valid_prefix: &'a str,
/// The input from the first offending byte onwards.
rest: &'a [u8],
/// Length in bytes of the invalid sequence, if known. When this is `None`,
/// `decode_utf8_compatible` extends the reported error range to the end of the data.
err_len: Option<usize>,
}
/// Splits `v` at `valid_up_to` into a decoded prefix and the undecoded
/// remainder, and bundles both with `err_len` into a [`DecodeError`].
///
/// # Safety
/// `v[..valid_up_to]` must be valid utf8 (which also implies
/// `valid_up_to <= v.len()`); neither condition is checked here.
unsafe fn make_decode_err(v: &[u8], valid_up_to: usize, err_len: Option<usize>) -> DecodeError<'_> {
    DecodeError {
        // Caller guarantees the prefix is well-formed utf8, so no validation is done.
        valid_prefix: core::str::from_utf8_unchecked(v.get_unchecked(..valid_up_to)),
        // Everything from the first offending byte to the end of the input.
        rest: v.get_unchecked(valid_up_to..),
        err_len,
    }
}

/// Verdict returned by the codec-specific `handle_error` callback of
/// `decode_utf8_compatible` after a decode attempt has failed.
enum HandleResult<'a> {
/// Stop decoding here without reporting an error; the bytes after the valid
/// prefix are left unconsumed.
Done,
/// Report the failure to the `ErrorHandler`.
Error {
/// Length in bytes of the offending sequence; `None` makes the reported
/// range run to the end of the data.
err_len: Option<usize>,
/// Reason string forwarded to `ErrorHandler::handle_decode_error`.
reason: &'a str,
},
}
/// Shared decode loop for codecs whose valid output can be borrowed directly
/// from the input bytes as `&str`.
///
/// * `data` - the bytes to decode.
/// * `errors` - the error handler consulted whenever `handle_error` reports an error.
/// * `decode` - tries to decode a byte slice in one go; on failure it returns a
///   `DecodeError` holding the valid prefix, the remaining bytes and an optional
///   error length.
/// * `handle_error` - receives the remaining bytes and the error length, and decides
///   whether decoding simply stops (`HandleResult::Done`) or the failure is reported
///   (`HandleResult::Error`).
///
/// Returns the decoded string together with the index into the data at which
/// decoding stopped. Errors produced by `errors` are propagated, including
/// `error_oob_restart` when the handler's restart index lies outside the data.
fn decode_utf8_compatible<E: ErrorHandler, DecodeF, ErrF>(
data: &[u8],
errors: &E,
decode: DecodeF,
handle_error: ErrF,
) -> Result<(String, usize), E::Error>
where
DecodeF: Fn(&[u8]) -> Result<&str, DecodeError<'_>>,
ErrF: Fn(&[u8], Option<usize>) -> HandleResult<'_>,
{
// Nothing to decode: empty output, zero bytes consumed.
if data.is_empty() {
return Ok((String::new(), 0));
}
// we need to coerce the lifetime to that of the function body rather than the
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
let mut data = &*data;
// Owns replacement input returned by the error handler so that `data` can borrow from it.
let mut data_from_err: E::BytesBuf;
let mut out = String::with_capacity(data.len());
// Index into `data` at which `remaining_data` starts.
let mut remaining_index = 0;
let mut remaining_data = data;
loop {
match decode(remaining_data) {
Ok(decoded) => {
// The whole remainder decoded cleanly: append it and finish.
out.push_str(decoded);
remaining_index += decoded.len();
break;
}
Err(e) => {
// Keep whatever decoded successfully before the failure point.
out.push_str(e.valid_prefix);
match handle_error(e.rest, e.err_len) {
HandleResult::Done => {
// Stop without an error; only the valid prefix counts as consumed.
remaining_index += e.valid_prefix.len();
break;
}
HandleResult::Error { err_len, reason } => {
// Position of the failure relative to the start of `data`.
let err_idx = remaining_index + e.valid_prefix.len();
// With no known error length the range extends to the end of the data.
let err_range =
err_idx..err_len.map_or_else(|| data.len(), |len| err_idx + len);
let (replace, new_data, restart) =
errors.handle_decode_error(data, err_range, reason)?;
out.push_str(replace.as_ref());
// The handler may substitute the input; continue from its buffer if so.
if let Some(new_data) = new_data {
data_from_err = new_data;
data = data_from_err.as_ref();
}
// Resume at the handler-chosen index; an index past the end is an error.
remaining_data = data
.get(restart..)
.ok_or_else(|| errors.error_oob_restart(restart))?;
remaining_index = restart;
continue;
}
}
}
}
}
Ok((out, remaining_index))
}

pub mod utf8 {
use super::*;

Expand All @@ -41,75 +131,118 @@ pub mod utf8 {
errors: &E,
final_decode: bool,
) -> Result<(String, usize), E::Error> {
if data.is_empty() {
return Ok((String::new(), 0));
}
// we need to coerce the lifetime to that of the function body rather than the
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
let mut data = &*data;
let mut data_from_err: E::BytesBuf;
let mut out = String::with_capacity(data.len());
let mut remaining_index = 0;
let mut remaining_data = data;
macro_rules! handle_error {
($range:expr, $reason:expr) => {{
let (replace, new_data, restart) =
errors.handle_decode_error(data, $range, $reason)?;
out.push_str(replace.as_ref());
if let Some(new_data) = new_data {
data_from_err = new_data;
data = data_from_err.as_ref();
decode_utf8_compatible(
data,
errors,
|v| {
core::str::from_utf8(v).map_err(|e| {
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
// is valid utf8
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
})
},
|rest, err_len| {
let first_err = rest[0];
if matches!(first_err, 0x80..=0xc1 | 0xf5..=0xff) {
HandleResult::Error {
err_len: Some(1),
reason: "invalid start byte",
}
} else if err_len.is_none() {
// error_len() == None means unexpected eof
if final_decode {
HandleResult::Error {
err_len,
reason: "unexpected end of data",
}
} else {
HandleResult::Done
}
} else if !final_decode && matches!(rest, [0xed, 0xa0..=0xbf]) {
// truncated surrogate
HandleResult::Done
} else {
HandleResult::Error {
err_len,
reason: "invalid continuation byte",
}
}
remaining_data = data
.get(restart..)
.ok_or_else(|| errors.error_oob_restart(restart))?;
remaining_index = restart;
continue;
}};
}
},
)
}
}

pub mod ascii {
use super::*;
use ::ascii::AsciiStr;

/// Name of the codec implemented by this module.
pub const ENCODING_NAME: &str = "ascii";

/// Reason string passed to the error handler whenever a non-ASCII character
/// (when encoding) or byte (when decoding) is encountered.
const ERR_REASON: &str = "ordinal not in range(128)";

#[inline]
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
let full_data = s;
let mut data = s;
let mut char_data_index = 0;
let mut out = Vec::<u8>::new();
loop {
match core::str::from_utf8(remaining_data) {
Ok(decoded) => {
out.push_str(decoded);
remaining_index += decoded.len();
match data
.char_indices()
.enumerate()
.find(|(_, (_, c))| !c.is_ascii())
{
None => {
out.extend_from_slice(data.as_bytes());
break;
}
Err(e) => {
let (valid_prefix, rest, first_err) = unsafe {
let index = e.valid_up_to();
// SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
let valid =
std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
let rest = remaining_data.get_unchecked(index..);
// SAFETY: if index didn't have something at it, this wouldn't be an error
let first_err = *remaining_data.get_unchecked(index);
(valid, rest, first_err)
};
out.push_str(valid_prefix);
let err_idx = remaining_index + e.valid_up_to();
remaining_data = rest;
remaining_index += valid_prefix.len();
if (0x80..0xc2).contains(&first_err) || (0xf5..=0xff).contains(&first_err) {
handle_error!(err_idx..err_idx + 1, "invalid start byte");
}
let err_len = match e.error_len() {
Some(l) => l,
// error_len() == None means unexpected eof
None => {
if !final_decode {
break;
Some((char_i, (byte_i, _))) => {
out.extend_from_slice(&data.as_bytes()[..byte_i]);
let char_start = char_data_index + char_i;
// number of non-ascii chars between the first non-ascii char and the next ascii char
let non_ascii_run_length =
data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
let char_range = char_start..char_start + non_ascii_run_length;
let (replace, char_restart) =
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
match replace {
EncodeReplace::Str(s) => {
if !s.is_ascii() {
return Err(
errors.error_encoding(full_data, char_range, ERR_REASON)
);
}
handle_error!(err_idx..data.len(), "unexpected end of data");
out.extend_from_slice(s.as_ref().as_bytes());
}
EncodeReplace::Bytes(b) => {
out.extend_from_slice(b.as_ref());
}
};
if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
// truncated surrogate
break;
}
handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
data = crate::str::try_get_chars(full_data, char_restart..)
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
char_data_index = char_restart;
continue;
}
}
}
Ok((out, remaining_index))
Ok(out)
}

pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
decode_utf8_compatible(
data,
errors,
|v| {
AsciiStr::from_ascii(v).map(|s| s.as_str()).map_err(|e| {
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
// is valid ascii & therefore valid utf8
unsafe { make_decode_err(v, e.valid_up_to(), Some(1)) }
})
},
|_rest, err_len| HandleResult::Error {
err_len,
reason: ERR_REASON,
},
)
}
}
Loading