Skip to content

Commit 9c32ad7

Browse files
committed
Implement utf16-le
1 parent 1d36034 commit 9c32ad7

File tree

3 files changed

+177
-10
lines changed

3 files changed

+177
-10
lines changed

Lib/test/test_codecs.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -878,8 +878,6 @@ class UTF16LETest(ReadTest, unittest.TestCase):
878878
encoding = "utf-16-le"
879879
ill_formed_sequence = b"\x80\xdc"
880880

881-
# TODO: RUSTPYTHON
882-
@unittest.expectedFailure
883881
def test_partial(self):
884882
self.check_partial(
885883
"\x00\xff\u0100\uffff\U00010000",
@@ -922,10 +920,6 @@ def test_nonbmp(self):
922920
self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
923921
"\U00010203")
924922

925-
# TODO: RUSTPYTHON
926-
@unittest.expectedFailure
927-
def test_incremental_surrogatepass(self):
928-
super().test_incremental_surrogatepass()
929923

930924
class UTF16BETest(ReadTest, unittest.TestCase):
931925
encoding = "utf-16-be"

common/src/encodings.rs

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,3 +633,176 @@ pub mod ascii {
633633
)
634634
}
635635
}
636+
637+
pub mod utf16_le {
638+
use super::*;
639+
640+
pub const ENCODING_NAME: &str = "utf-16-le";
641+
642+
pub fn encode<Ctx, E>(mut ctx: Ctx, errors: &E) -> Result<Vec<u8>, Ctx::Error>
643+
where
644+
Ctx: EncodeContext,
645+
E: EncodeErrorHandler<Ctx>,
646+
{
647+
let mut out = Vec::<u8>::new();
648+
loop {
649+
let data = ctx.remaining_data();
650+
let error_info = {
651+
let mut iter = iter_code_points(data);
652+
iter.find(|(_, c)| c.to_u32() > 0x10FFFF)
653+
};
654+
let Some((i, ch)) = error_info else {
655+
break;
656+
};
657+
658+
// Add valid part up to the error
659+
for ch in data[..i.bytes].code_points() {
660+
let ch_u32 = ch.to_u32();
661+
if ch_u32 <= 0xFFFF {
662+
out.extend_from_slice(&(ch_u32 as u16).to_le_bytes());
663+
} else if ch_u32 <= 0x10FFFF {
664+
let code = ch_u32 - 0x10000;
665+
let high = 0xD800 + (code >> 10);
666+
let low = 0xDC00 + (code & 0x3FF);
667+
out.extend_from_slice(&(high as u16).to_le_bytes());
668+
out.extend_from_slice(&(low as u16).to_le_bytes());
669+
}
670+
}
671+
672+
let err_start = ctx.position() + i;
673+
let err_end = StrSize {
674+
bytes: i.bytes + ch.len_wtf8(),
675+
chars: i.chars + 1,
676+
};
677+
let err_end = ctx.position() + err_end;
678+
let replace =
679+
ctx.handle_error(errors, err_start..err_end, Some("surrogates not allowed"))?;
680+
match replace {
681+
EncodeReplace::Str(s) => {
682+
// Re-encode the replacement string
683+
for cp in s.as_ref().code_points() {
684+
let cp_u32 = cp.to_u32();
685+
if cp_u32 <= 0xFFFF {
686+
out.extend_from_slice(&(cp_u32 as u16).to_le_bytes());
687+
} else if cp_u32 <= 0x10FFFF {
688+
let code = cp_u32 - 0x10000;
689+
let high = 0xD800 + (code >> 10);
690+
let low = 0xDC00 + (code & 0x3FF);
691+
out.extend_from_slice(&(high as u16).to_le_bytes());
692+
out.extend_from_slice(&(low as u16).to_le_bytes());
693+
}
694+
}
695+
}
696+
EncodeReplace::Bytes(b) => {
697+
out.extend_from_slice(b.as_ref());
698+
}
699+
}
700+
}
701+
702+
// Process all remaining data
703+
for ch in ctx.remaining_data().code_points() {
704+
let ch_u32 = ch.to_u32();
705+
if ch_u32 <= 0xFFFF {
706+
out.extend_from_slice(&(ch_u32 as u16).to_le_bytes());
707+
} else if ch_u32 <= 0x10FFFF {
708+
let code = ch_u32 - 0x10000;
709+
let high = 0xD800 + (code >> 10);
710+
let low = 0xDC00 + (code & 0x3FF);
711+
out.extend_from_slice(&(high as u16).to_le_bytes());
712+
out.extend_from_slice(&(low as u16).to_le_bytes());
713+
}
714+
}
715+
Ok(out)
716+
}
717+
718+
pub fn decode<Ctx: DecodeContext, E: DecodeErrorHandler<Ctx>>(
719+
mut ctx: Ctx,
720+
errors: &E,
721+
final_decode: bool,
722+
) -> Result<(Wtf8Buf, usize), Ctx::Error> {
723+
let mut out = Wtf8Buf::new();
724+
725+
while ctx.remaining_data().len() >= 2 {
726+
let data = ctx.remaining_data();
727+
let ch = u16::from_le_bytes([data[0], data[1]]);
728+
729+
if ch < 0xD800 || ch > 0xDFFF {
730+
// BMP character
731+
if let Some(c) = char::from_u32(ch as u32) {
732+
out.push_str(&c.to_string());
733+
ctx.advance(2);
734+
} else {
735+
let pos = ctx.position();
736+
let replace =
737+
ctx.handle_error(errors, pos..pos + 2, Some("invalid character"))?;
738+
out.push_wtf8(replace.as_ref());
739+
// Don't advance here, the error handler already positioned us
740+
}
741+
} else if ch >= 0xD800 && ch <= 0xDBFF {
742+
// High surrogate
743+
if data.len() < 4 {
744+
if final_decode {
745+
let pos = ctx.position();
746+
let replace =
747+
ctx.handle_error(errors, pos..pos + 2, Some("unexpected end of data"))?;
748+
out.push_wtf8(replace.as_ref());
749+
// Don't advance here, the error handler already positioned us
750+
} else {
751+
// In partial mode, stop here and return what we have
752+
break;
753+
}
754+
} else {
755+
let ch2 = u16::from_le_bytes([data[2], data[3]]);
756+
if ch2 >= 0xDC00 && ch2 <= 0xDFFF {
757+
// Valid surrogate pair
758+
let code = (((ch & 0x3FF) as u32) << 10) | ((ch2 & 0x3FF) as u32);
759+
let code_point = code + 0x10000;
760+
if let Some(c) = char::from_u32(code_point) {
761+
out.push_str(&c.to_string());
762+
ctx.advance(4);
763+
} else {
764+
let pos = ctx.position();
765+
let replace = ctx.handle_error(
766+
errors,
767+
pos..pos + 4,
768+
Some("invalid surrogate pair"),
769+
)?;
770+
out.push_wtf8(replace.as_ref());
771+
// Don't advance here, the error handler already positioned us
772+
}
773+
} else {
774+
// Invalid surrogate pair
775+
let pos = ctx.position();
776+
let replace = ctx.handle_error(
777+
errors,
778+
pos..pos + 2,
779+
Some("illegal UTF-16 surrogate"),
780+
)?;
781+
out.push_wtf8(replace.as_ref());
782+
// Don't advance here, the error handler already positioned us
783+
}
784+
}
785+
} else {
786+
// Low surrogate without high surrogate
787+
let pos = ctx.position();
788+
let replace =
789+
ctx.handle_error(errors, pos..pos + 2, Some("illegal UTF-16 surrogate"))?;
790+
out.push_wtf8(replace.as_ref());
791+
// Don't advance here, the error handler already positioned us
792+
}
793+
}
794+
795+
// Handle remaining single byte
796+
if ctx.remaining_data().len() == 1 {
797+
if final_decode {
798+
let pos = ctx.position();
799+
let replace = ctx.handle_error(errors, pos..pos + 1, Some("truncated data"))?;
800+
out.push_wtf8(replace.as_ref());
801+
// Don't advance here, the error handler already positioned us
802+
}
803+
// In partial mode, just leave it for next call
804+
}
805+
806+
Ok((out, ctx.position()))
807+
}
808+
}

vm/src/stdlib/codecs.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,12 +286,12 @@ mod _codecs {
286286
delegate_pycodecs!(charmap_build, args, vm)
287287
}
288288
#[pyfunction]
289-
fn utf_16_le_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
290-
delegate_pycodecs!(utf_16_le_encode, args, vm)
289+
// Encodes with the native `utf16_le::encode` implementation, invoked through
// the `do_codec!` macro (replaces the previous `delegate_pycodecs!` delegation).
fn utf_16_le_encode(args: EncodeArgs, vm: &VirtualMachine) -> EncodeResult {
290+
do_codec!(utf16_le::encode, args, vm)
291291
}
292292
#[pyfunction]
293-
fn utf_16_le_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {
294-
delegate_pycodecs!(utf_16_le_decode, args, vm)
293+
// Decodes with the native `utf16_le::decode` implementation, invoked through
// the `do_codec!` macro (replaces the previous `delegate_pycodecs!` delegation).
fn utf_16_le_decode(args: DecodeArgs, vm: &VirtualMachine) -> DecodeResult {
294+
do_codec!(utf16_le::decode, args, vm)
295295
}
296296
#[pyfunction]
297297
fn utf_16_be_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult {

0 commit comments

Comments
 (0)