Skip to content

Commit 0f889ce

Browse files
committed
Implement latin_1 in Rust
This implementation is patterned off of the ascii codec.
1 parent 0bb0946 commit 0f889ce

File tree

2 files changed

+89
-8
lines changed

2 files changed

+89
-8
lines changed

common/src/encodings.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,82 @@ pub mod utf8 {
172172
}
173173
}
174174

175+
pub mod latin_1 {
176+
use super::*;
177+
178+
pub const ENCODING_NAME: &str = "latin-1";
179+
180+
const ERR_REASON: &str = "ordinal not in range(256)";
181+
182+
#[inline]
183+
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
184+
let full_data = s;
185+
let mut data = s;
186+
let mut char_data_index = 0;
187+
let mut out = Vec::<u8>::new();
188+
loop {
189+
match data
190+
.char_indices()
191+
.enumerate()
192+
.find(|(_, (_, c))| (*c as u32) > 255)
193+
{
194+
None => {
195+
out.extend_from_slice(data.as_bytes());
196+
break;
197+
}
198+
Some((char_i, (byte_i, _))) => {
199+
out.extend_from_slice(&data.as_bytes()[..byte_i]);
200+
let char_start = char_data_index + char_i;
201+
// number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
202+
let non_latin_1_run_length = data[byte_i..]
203+
.chars()
204+
.take_while(|c| (*c as u32) > 255)
205+
.count();
206+
let char_range = char_start..char_start + non_latin_1_run_length;
207+
let (replace, char_restart) =
208+
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
209+
match replace {
210+
EncodeReplace::Str(s) => {
211+
if s.as_ref().chars().any(|c| (c as u32) > 255) {
212+
return Err(
213+
errors.error_encoding(full_data, char_range, ERR_REASON)
214+
);
215+
}
216+
out.extend_from_slice(s.as_ref().as_bytes());
217+
}
218+
EncodeReplace::Bytes(b) => {
219+
out.extend_from_slice(b.as_ref());
220+
}
221+
}
222+
data = crate::str::try_get_chars(full_data, char_restart..)
223+
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
224+
char_data_index = char_restart;
225+
continue;
226+
}
227+
}
228+
}
229+
Ok(out)
230+
}
231+
232+
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
233+
decode_utf8_compatible(
234+
data,
235+
errors,
236+
|v| {
237+
std::str::from_utf8(v).map_err(|e| {
238+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
239+
// is valid ascii & therefore valid utf8
240+
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
241+
})
242+
},
243+
|_rest, err_len| HandleResult::Error {
244+
err_len,
245+
reason: ERR_REASON,
246+
},
247+
)
248+
}
249+
}
250+
175251
pub mod ascii {
176252
use super::*;
177253
use ::ascii::AsciiStr;

vm/src/stdlib/codecs.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,19 @@ mod _codecs {
315315
do_codec!(utf8::decode, args, vm)
316316
}
317317

318+
#[pyfunction]
319+
fn latin_1_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
320+
if args.s.as_ref().chars().all(|c| (c as u32) < 256) {
321+
return Ok((args.s.as_str().as_bytes().to_vec(), args.s.byte_len()));
322+
}
323+
do_codec!(latin_1::encode, args, vm)
324+
}
325+
326+
/// `_codecs.latin_1_decode`: decodes bytes as Latin-1.
///
/// Thin wrapper that hands `args` and `vm` to the `do_codec!` macro with
/// `latin_1::decode` as the codec function. Argument unpacking and error-handler
/// setup presumably happen inside `do_codec!`, which is defined outside this view.
#[pyfunction]
327+
fn latin_1_decode(args: DecodeArgsNoFinal, vm: &VirtualMachine) -> DecodeResult {
328+
do_codec!(latin_1::decode, args, vm)
329+
}
330+
318331
#[pyfunction]
319332
fn ascii_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
320333
if args.s.is_ascii() {
@@ -353,14 +366,6 @@ mod _codecs {
353366
}};
354367
}
355368

356-
#[pyfunction]
357-
fn latin_1_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
358-
delegate_pycodecs!(latin_1_encode, args, vm)
359-
}
360-
#[pyfunction]
361-
fn latin_1_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
362-
delegate_pycodecs!(latin_1_decode, args, vm)
363-
}
364369
#[pyfunction]
365370
fn mbcs_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
366371
delegate_pycodecs!(mbcs_encode, args, vm)

0 commit comments

Comments
 (0)