Skip to content

Commit 20b86b2

Browse files
committed
Implement ascii codec in Rust
1 parent 29c90a6 commit 20b86b2

File tree

4 files changed

+290
-76
lines changed

4 files changed

+290
-76
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ rand = "0.8"
2222
volatile = "0.3"
2323
radium = "0.6"
2424
libc = "0.2.101"
25+
ascii = "1.0"

common/src/encodings.rs

Lines changed: 195 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
44

55
/// Result returned by a decode error handler.
///
/// On success the tuple holds, in order: the replacement text to append to the
/// output, an optional buffer that replaces the input being decoded, and the
/// index into the (possibly replaced) input at which decoding should resume.
pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
66

7+
/// A string-like buffer that an error handler can hand back as replacement text.
///
/// Anything that can be viewed as a `str` qualifies; the trait only adds an
/// ASCII check on top of `AsRef<str>`.
pub trait StrBuffer: AsRef<str> {
    /// Returns `true` when every character of the underlying string is ASCII.
    ///
    /// The default implementation inspects the string obtained through
    /// `as_ref()`; implementors may override it.
    fn is_ascii(&self) -> bool {
        self.as_ref().is_ascii()
    }
}
12+
713
pub trait ErrorHandler {
814
type Error;
9-
type StrBuf: AsRef<str>;
15+
type StrBuf: StrBuffer;
1016
type BytesBuf: AsRef<[u8]>;
1117
fn handle_encode_error(
1218
&self,
13-
byte_range: Range<usize>,
19+
data: &str,
20+
char_range: Range<usize>,
1421
reason: &str,
1522
) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
1623
fn handle_decode_error(
@@ -20,12 +27,95 @@ pub trait ErrorHandler {
2027
reason: &str,
2128
) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
2229
fn error_oob_restart(&self, i: usize) -> Self::Error;
30+
fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
2331
}
2432
/// Replacement produced by an encode error handler for an unencodable range.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodeReplace<S, B> {
    /// Replacement given as text, which the codec still has to encode.
    Str(S),
    /// Replacement given as raw bytes, copied into the output as-is.
    Bytes(B),
}
2836

37+
/// Description of a failed decode attempt over a byte slice.
#[derive(Debug)]
struct DecodeError<'a> {
    /// The leading part of the input that decoded successfully.
    valid_prefix: &'a str,
    /// The input from the first offending byte onwards.
    rest: &'a [u8],
    /// Length in bytes of the invalid sequence; `None` means the error runs to
    /// the end of the input.
    err_len: Option<usize>,
}
42+
/// # Safety
43+
/// `v[..valid_up_to]` must be valid utf8
44+
unsafe fn make_decode_err(v: &[u8], valid_up_to: usize, err_len: Option<usize>) -> DecodeError<'_> {
45+
let valid_prefix = core::str::from_utf8_unchecked(v.get_unchecked(..valid_up_to));
46+
let rest = v.get_unchecked(valid_up_to..);
47+
DecodeError {
48+
valid_prefix,
49+
rest,
50+
err_len,
51+
}
52+
}
53+
54+
/// Verdict of a codec-specific error classifier on a decode failure.
#[derive(Debug)]
enum HandleResult<'a> {
    /// Stop decoding without reporting an error; what was decoded so far is the result.
    Done,
    /// Report the failure to the error handler.
    Error {
        /// Length in bytes of the offending sequence; `None` means it extends
        /// to the end of the data.
        err_len: Option<usize>,
        /// Human-readable reason passed on to the error handler.
        reason: &'a str,
    },
}
61+
fn decode_utf8_compatible<E: ErrorHandler, DecodeF, ErrF>(
62+
data: &[u8],
63+
errors: &E,
64+
decode: DecodeF,
65+
handle_error: ErrF,
66+
) -> Result<(String, usize), E::Error>
67+
where
68+
DecodeF: Fn(&[u8]) -> Result<&str, DecodeError<'_>>,
69+
ErrF: Fn(&[u8], Option<usize>) -> HandleResult<'_>,
70+
{
71+
if data.is_empty() {
72+
return Ok((String::new(), 0));
73+
}
74+
// we need to coerce the lifetime to that of the function body rather than the
75+
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
76+
let mut data = &*data;
77+
let mut data_from_err: E::BytesBuf;
78+
let mut out = String::with_capacity(data.len());
79+
let mut remaining_index = 0;
80+
let mut remaining_data = data;
81+
loop {
82+
match decode(remaining_data) {
83+
Ok(decoded) => {
84+
out.push_str(decoded);
85+
remaining_index += decoded.len();
86+
break;
87+
}
88+
Err(e) => {
89+
out.push_str(e.valid_prefix);
90+
match handle_error(e.rest, e.err_len) {
91+
HandleResult::Done => {
92+
remaining_index += e.valid_prefix.len();
93+
break;
94+
}
95+
HandleResult::Error { err_len, reason } => {
96+
let err_idx = remaining_index + e.valid_prefix.len();
97+
let err_range =
98+
err_idx..err_len.map_or_else(|| data.len(), |len| err_idx + len);
99+
let (replace, new_data, restart) =
100+
errors.handle_decode_error(data, err_range, reason)?;
101+
out.push_str(replace.as_ref());
102+
if let Some(new_data) = new_data {
103+
data_from_err = new_data;
104+
data = data_from_err.as_ref();
105+
}
106+
remaining_data = data
107+
.get(restart..)
108+
.ok_or_else(|| errors.error_oob_restart(restart))?;
109+
remaining_index = restart;
110+
continue;
111+
}
112+
}
113+
}
114+
}
115+
}
116+
Ok((out, remaining_index))
117+
}
118+
29119
pub mod utf8 {
30120
use super::*;
31121

@@ -41,75 +131,118 @@ pub mod utf8 {
41131
errors: &E,
42132
final_decode: bool,
43133
) -> Result<(String, usize), E::Error> {
44-
if data.is_empty() {
45-
return Ok((String::new(), 0));
46-
}
47-
// we need to coerce the lifetime to that of the function body rather than the
48-
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
49-
let mut data = &*data;
50-
let mut data_from_err: E::BytesBuf;
51-
let mut out = String::with_capacity(data.len());
52-
let mut remaining_index = 0;
53-
let mut remaining_data = data;
54-
macro_rules! handle_error {
55-
($range:expr, $reason:expr) => {{
56-
let (replace, new_data, restart) =
57-
errors.handle_decode_error(data, $range, $reason)?;
58-
out.push_str(replace.as_ref());
59-
if let Some(new_data) = new_data {
60-
data_from_err = new_data;
61-
data = data_from_err.as_ref();
134+
decode_utf8_compatible(
135+
data,
136+
errors,
137+
|v| {
138+
core::str::from_utf8(v).map_err(|e| {
139+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
140+
// is valid utf8
141+
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
142+
})
143+
},
144+
|rest, err_len| {
145+
let first_err = rest[0];
146+
if matches!(first_err, 0x80..=0xc1 | 0xf5..=0xff) {
147+
HandleResult::Error {
148+
err_len: Some(1),
149+
reason: "invalid start byte",
150+
}
151+
} else if err_len.is_none() {
152+
// error_len() == None means unexpected eof
153+
if final_decode {
154+
HandleResult::Error {
155+
err_len,
156+
reason: "unexpected end of data",
157+
}
158+
} else {
159+
HandleResult::Done
160+
}
161+
} else if !final_decode && matches!(rest, [0xed, 0xa0..=0xbf]) {
162+
// truncated surrogate
163+
HandleResult::Done
164+
} else {
165+
HandleResult::Error {
166+
err_len,
167+
reason: "invalid continuation byte",
168+
}
62169
}
63-
remaining_data = data
64-
.get(restart..)
65-
.ok_or_else(|| errors.error_oob_restart(restart))?;
66-
remaining_index = restart;
67-
continue;
68-
}};
69-
}
170+
},
171+
)
172+
}
173+
}
174+
175+
pub mod ascii {
176+
use super::*;
177+
use ::ascii::AsciiStr;
178+
179+
pub const ENCODING_NAME: &str = "ascii";
180+
181+
const ERR_REASON: &str = "ordinal not in range(128)";
182+
183+
#[inline]
184+
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
185+
let full_data = s;
186+
let mut data = s;
187+
let mut char_data_index = 0;
188+
let mut out = Vec::<u8>::new();
70189
loop {
71-
match core::str::from_utf8(remaining_data) {
72-
Ok(decoded) => {
73-
out.push_str(decoded);
74-
remaining_index += decoded.len();
190+
match data
191+
.char_indices()
192+
.enumerate()
193+
.find(|(_, (_, c))| !c.is_ascii())
194+
{
195+
None => {
196+
out.extend_from_slice(data.as_bytes());
75197
break;
76198
}
77-
Err(e) => {
78-
let (valid_prefix, rest, first_err) = unsafe {
79-
let index = e.valid_up_to();
80-
// SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
81-
let valid =
82-
std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
83-
let rest = remaining_data.get_unchecked(index..);
84-
// SAFETY: if index didn't have something at it, this wouldn't be an error
85-
let first_err = *remaining_data.get_unchecked(index);
86-
(valid, rest, first_err)
87-
};
88-
out.push_str(valid_prefix);
89-
let err_idx = remaining_index + e.valid_up_to();
90-
remaining_data = rest;
91-
remaining_index += valid_prefix.len();
92-
if (0x80..0xc2).contains(&first_err) || (0xf5..=0xff).contains(&first_err) {
93-
handle_error!(err_idx..err_idx + 1, "invalid start byte");
94-
}
95-
let err_len = match e.error_len() {
96-
Some(l) => l,
97-
// error_len() == None means unexpected eof
98-
None => {
99-
if !final_decode {
100-
break;
199+
Some((char_i, (byte_i, _))) => {
200+
out.extend_from_slice(&data.as_bytes()[..byte_i]);
201+
let char_start = char_data_index + char_i;
202+
// number of non-ascii chars between the first non-ascii char and the next ascii char
203+
let non_ascii_run_length =
204+
data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
205+
let char_range = char_start..char_start + non_ascii_run_length;
206+
let (replace, char_restart) =
207+
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
208+
match replace {
209+
EncodeReplace::Str(s) => {
210+
if !s.is_ascii() {
211+
return Err(
212+
errors.error_encoding(full_data, char_range, ERR_REASON)
213+
);
101214
}
102-
handle_error!(err_idx..data.len(), "unexpected end of data");
215+
out.extend_from_slice(s.as_ref().as_bytes());
216+
}
217+
EncodeReplace::Bytes(b) => {
218+
out.extend_from_slice(b.as_ref());
103219
}
104-
};
105-
if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
106-
// truncated surrogate
107-
break;
108220
}
109-
handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
221+
data = crate::str::try_get_chars(full_data, char_restart..)
222+
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
223+
char_data_index = char_restart;
224+
continue;
110225
}
111226
}
112227
}
113-
Ok((out, remaining_index))
228+
Ok(out)
229+
}
230+
231+
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
232+
decode_utf8_compatible(
233+
data,
234+
errors,
235+
|v| {
236+
AsciiStr::from_ascii(v).map(|s| s.as_str()).map_err(|e| {
237+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
238+
// is valid ascii & therefore valid utf8
239+
unsafe { make_decode_err(v, e.valid_up_to(), Some(1)) }
240+
})
241+
},
242+
|_rest, err_len| HandleResult::Error {
243+
err_len,
244+
reason: ERR_REASON,
245+
},
246+
)
114247
}
115248
}

0 commit comments

Comments
 (0)