Skip to content

Commit ec7ae7b

Browse files
committed
Implement ascii codec in Rust
1 parent 29c90a6 commit ec7ae7b

File tree

4 files changed

+292
-76
lines changed

4 files changed

+292
-76
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ rand = "0.8"
2222
volatile = "0.3"
2323
radium = "0.6"
2424
libc = "0.2.101"
25+
ascii = "1.0"

common/src/encodings.rs

Lines changed: 197 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
44

55
pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
66

7+
pub trait StrBuffer: AsRef<str> {
8+
fn is_ascii(&self) -> bool {
9+
self.as_ref().is_ascii()
10+
}
11+
}
12+
713
pub trait ErrorHandler {
814
type Error;
9-
type StrBuf: AsRef<str>;
15+
type StrBuf: StrBuffer;
1016
type BytesBuf: AsRef<[u8]>;
1117
fn handle_encode_error(
1218
&self,
13-
byte_range: Range<usize>,
19+
data: &str,
20+
char_range: Range<usize>,
1421
reason: &str,
1522
) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
1623
fn handle_decode_error(
@@ -20,12 +27,95 @@ pub trait ErrorHandler {
2027
reason: &str,
2128
) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
2229
fn error_oob_restart(&self, i: usize) -> Self::Error;
30+
fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
2331
}
2432
pub enum EncodeReplace<S, B> {
2533
Str(S),
2634
Bytes(B),
2735
}
2836

37+
struct DecodeError<'a> {
38+
valid_prefix: &'a str,
39+
rest: &'a [u8],
40+
err_len: Option<usize>,
41+
}
42+
/// # Safety
43+
/// `v[..valid_up_to]` must be valid utf8
44+
unsafe fn make_decode_err(v: &[u8], valid_up_to: usize, err_len: Option<usize>) -> DecodeError<'_> {
45+
let valid_prefix = core::str::from_utf8_unchecked(v.get_unchecked(..valid_up_to));
46+
let rest = v.get_unchecked(valid_up_to..);
47+
DecodeError {
48+
valid_prefix,
49+
rest,
50+
err_len,
51+
}
52+
}
53+
54+
enum HandleResult<'a> {
55+
Done,
56+
Error {
57+
err_len: Option<usize>,
58+
reason: &'a str,
59+
},
60+
}
61+
fn decode_utf8_compatible<E: ErrorHandler, DecodeF, ErrF>(
62+
data: &[u8],
63+
errors: &E,
64+
decode: DecodeF,
65+
handle_error: ErrF,
66+
) -> Result<(String, usize), E::Error>
67+
where
68+
DecodeF: Fn(&[u8]) -> Result<&str, DecodeError<'_>>,
69+
ErrF: Fn(&[u8], Option<usize>) -> HandleResult<'_>,
70+
{
71+
if data.is_empty() {
72+
return Ok((String::new(), 0));
73+
}
74+
// we need to coerce the lifetime to that of the function body rather than the
75+
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
76+
let mut data = &*data;
77+
let mut data_from_err: E::BytesBuf;
78+
let mut out = String::with_capacity(data.len());
79+
let mut remaining_index = 0;
80+
let mut remaining_data = data;
81+
loop {
82+
match decode(remaining_data) {
83+
Ok(decoded) => {
84+
out.push_str(decoded);
85+
remaining_index += decoded.len();
86+
break;
87+
}
88+
Err(e) => {
89+
out.push_str(e.valid_prefix);
90+
match handle_error(e.rest, e.err_len) {
91+
HandleResult::Done => {
92+
remaining_index += e.valid_prefix.len();
93+
break;
94+
}
95+
HandleResult::Error { err_len, reason } => {
96+
let err_idx = remaining_index + e.valid_prefix.len();
97+
let err_range =
98+
err_idx..err_len.map_or_else(|| data.len(), |len| err_idx + len);
99+
let (replace, new_data, restart) =
100+
errors.handle_decode_error(data, err_range, reason)?;
101+
out.push_str(replace.as_ref());
102+
if let Some(new_data) = new_data {
103+
data_from_err = new_data;
104+
data = data_from_err.as_ref();
105+
}
106+
remaining_data = data
107+
.get(restart..)
108+
.ok_or_else(|| errors.error_oob_restart(restart))?;
109+
remaining_index = restart;
110+
continue;
111+
}
112+
}
113+
}
114+
}
115+
}
116+
Ok((out, remaining_index))
117+
}
118+
29119
pub mod utf8 {
30120
use super::*;
31121

@@ -41,75 +131,120 @@ pub mod utf8 {
41131
errors: &E,
42132
final_decode: bool,
43133
) -> Result<(String, usize), E::Error> {
44-
if data.is_empty() {
45-
return Ok((String::new(), 0));
46-
}
47-
// we need to coerce the lifetime to that of the function body rather than the
48-
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
49-
let mut data = &*data;
50-
let mut data_from_err: E::BytesBuf;
51-
let mut out = String::with_capacity(data.len());
52-
let mut remaining_index = 0;
53-
let mut remaining_data = data;
54-
macro_rules! handle_error {
55-
($range:expr, $reason:expr) => {{
56-
let (replace, new_data, restart) =
57-
errors.handle_decode_error(data, $range, $reason)?;
58-
out.push_str(replace.as_ref());
59-
if let Some(new_data) = new_data {
60-
data_from_err = new_data;
61-
data = data_from_err.as_ref();
134+
decode_utf8_compatible(
135+
data,
136+
errors,
137+
|v| {
138+
core::str::from_utf8(v).map_err(|e| {
139+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
140+
// is valid utf8
141+
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
142+
})
143+
},
144+
|rest, err_len| {
145+
let first_err = rest[0];
146+
if matches!(first_err, 0x80..=0xc1 | 0xf5..=0xff) {
147+
return HandleResult::Error {
148+
err_len: Some(1),
149+
reason: "invalid start byte",
150+
};
62151
}
63-
remaining_data = data
64-
.get(restart..)
65-
.ok_or_else(|| errors.error_oob_restart(restart))?;
66-
remaining_index = restart;
67-
continue;
68-
}};
69-
}
152+
if err_len.is_none() {
153+
// error_len() == None means unexpected eof
154+
let res = if final_decode {
155+
HandleResult::Error {
156+
err_len,
157+
reason: "unexpected end of data",
158+
}
159+
} else {
160+
HandleResult::Done
161+
};
162+
return res;
163+
}
164+
if !final_decode && matches!(rest, [0xed, 0xa0..=0xbf]) {
165+
// truncated surrogate
166+
return HandleResult::Done;
167+
}
168+
return HandleResult::Error {
169+
err_len,
170+
reason: "invalid continuation byte",
171+
};
172+
},
173+
)
174+
}
175+
}
176+
177+
pub mod ascii {
178+
use super::*;
179+
use ::ascii::AsciiStr;
180+
181+
pub const ENCODING_NAME: &str = "ascii";
182+
183+
const ERR_REASON: &str = "ordinal not in range(128)";
184+
185+
#[inline]
186+
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
187+
let full_data = s;
188+
let mut data = s;
189+
let mut char_data_index = 0;
190+
let mut out = Vec::<u8>::new();
70191
loop {
71-
match core::str::from_utf8(remaining_data) {
72-
Ok(decoded) => {
73-
out.push_str(decoded);
74-
remaining_index += decoded.len();
192+
match data
193+
.char_indices()
194+
.enumerate()
195+
.find(|(_, (_, c))| !c.is_ascii())
196+
{
197+
None => {
198+
out.extend_from_slice(data.as_bytes());
75199
break;
76200
}
77-
Err(e) => {
78-
let (valid_prefix, rest, first_err) = unsafe {
79-
let index = e.valid_up_to();
80-
// SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
81-
let valid =
82-
std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
83-
let rest = remaining_data.get_unchecked(index..);
84-
// SAFETY: if index didn't have something at it, this wouldn't be an error
85-
let first_err = *remaining_data.get_unchecked(index);
86-
(valid, rest, first_err)
87-
};
88-
out.push_str(valid_prefix);
89-
let err_idx = remaining_index + e.valid_up_to();
90-
remaining_data = rest;
91-
remaining_index += valid_prefix.len();
92-
if (0x80..0xc2).contains(&first_err) || (0xf5..=0xff).contains(&first_err) {
93-
handle_error!(err_idx..err_idx + 1, "invalid start byte");
94-
}
95-
let err_len = match e.error_len() {
96-
Some(l) => l,
97-
// error_len() == None means unexpected eof
98-
None => {
99-
if !final_decode {
100-
break;
201+
Some((char_i, (byte_i, _))) => {
202+
out.extend_from_slice(&data.as_bytes()[..byte_i]);
203+
let char_start = char_data_index + char_i;
204+
// number of non-ascii chars between the first non-ascii char and the next ascii char
205+
let non_ascii_run_length =
206+
data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
207+
let char_range = char_start..char_start + non_ascii_run_length;
208+
let (replace, char_restart) =
209+
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
210+
match replace {
211+
EncodeReplace::Str(s) => {
212+
if !s.is_ascii() {
213+
return Err(
214+
errors.error_encoding(full_data, char_range, ERR_REASON)
215+
);
101216
}
102-
handle_error!(err_idx..data.len(), "unexpected end of data");
217+
out.extend_from_slice(s.as_ref().as_bytes());
218+
}
219+
EncodeReplace::Bytes(b) => {
220+
out.extend_from_slice(b.as_ref());
103221
}
104-
};
105-
if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
106-
// truncated surrogate
107-
break;
108222
}
109-
handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
223+
data = crate::str::try_get_chars(full_data, char_restart..)
224+
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
225+
char_data_index = char_restart;
226+
continue;
110227
}
111228
}
112229
}
113-
Ok((out, remaining_index))
230+
Ok(out)
231+
}
232+
233+
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
234+
decode_utf8_compatible(
235+
data,
236+
errors,
237+
|v| {
238+
AsciiStr::from_ascii(v).map(|s| s.as_str()).map_err(|e| {
239+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
240+
// is valid ascii & therefore valid utf8
241+
unsafe { make_decode_err(v, e.valid_up_to(), Some(1)) }
242+
})
243+
},
244+
|_rest, err_len| HandleResult::Error {
245+
err_len,
246+
reason: ERR_REASON,
247+
},
248+
)
114249
}
115250
}

0 commit comments

Comments
 (0)