@@ -633,3 +633,176 @@ pub mod ascii {
633
633
)
634
634
}
635
635
}
636
+
637
+ pub mod utf16_le {
638
+ use super :: * ;
639
+
640
+ pub const ENCODING_NAME : & str = "utf-16-le" ;
641
+
642
+ pub fn encode < Ctx , E > ( mut ctx : Ctx , errors : & E ) -> Result < Vec < u8 > , Ctx :: Error >
643
+ where
644
+ Ctx : EncodeContext ,
645
+ E : EncodeErrorHandler < Ctx > ,
646
+ {
647
+ let mut out = Vec :: < u8 > :: new ( ) ;
648
+ loop {
649
+ let data = ctx. remaining_data ( ) ;
650
+ let error_info = {
651
+ let mut iter = iter_code_points ( data) ;
652
+ iter. find ( |( _, c) | c. to_u32 ( ) > 0x10FFFF )
653
+ } ;
654
+ let Some ( ( i, ch) ) = error_info else {
655
+ break ;
656
+ } ;
657
+
658
+ // Add valid part up to the error
659
+ for ch in data[ ..i. bytes ] . code_points ( ) {
660
+ let ch_u32 = ch. to_u32 ( ) ;
661
+ if ch_u32 <= 0xFFFF {
662
+ out. extend_from_slice ( & ( ch_u32 as u16 ) . to_le_bytes ( ) ) ;
663
+ } else if ch_u32 <= 0x10FFFF {
664
+ let code = ch_u32 - 0x10000 ;
665
+ let high = 0xD800 + ( code >> 10 ) ;
666
+ let low = 0xDC00 + ( code & 0x3FF ) ;
667
+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
668
+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
669
+ }
670
+ }
671
+
672
+ let err_start = ctx. position ( ) + i;
673
+ let err_end = StrSize {
674
+ bytes : i. bytes + ch. len_wtf8 ( ) ,
675
+ chars : i. chars + 1 ,
676
+ } ;
677
+ let err_end = ctx. position ( ) + err_end;
678
+ let replace =
679
+ ctx. handle_error ( errors, err_start..err_end, Some ( "surrogates not allowed" ) ) ?;
680
+ match replace {
681
+ EncodeReplace :: Str ( s) => {
682
+ // Re-encode the replacement string
683
+ for cp in s. as_ref ( ) . code_points ( ) {
684
+ let cp_u32 = cp. to_u32 ( ) ;
685
+ if cp_u32 <= 0xFFFF {
686
+ out. extend_from_slice ( & ( cp_u32 as u16 ) . to_le_bytes ( ) ) ;
687
+ } else if cp_u32 <= 0x10FFFF {
688
+ let code = cp_u32 - 0x10000 ;
689
+ let high = 0xD800 + ( code >> 10 ) ;
690
+ let low = 0xDC00 + ( code & 0x3FF ) ;
691
+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
692
+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
693
+ }
694
+ }
695
+ }
696
+ EncodeReplace :: Bytes ( b) => {
697
+ out. extend_from_slice ( b. as_ref ( ) ) ;
698
+ }
699
+ }
700
+ }
701
+
702
+ // Process all remaining data
703
+ for ch in ctx. remaining_data ( ) . code_points ( ) {
704
+ let ch_u32 = ch. to_u32 ( ) ;
705
+ if ch_u32 <= 0xFFFF {
706
+ out. extend_from_slice ( & ( ch_u32 as u16 ) . to_le_bytes ( ) ) ;
707
+ } else if ch_u32 <= 0x10FFFF {
708
+ let code = ch_u32 - 0x10000 ;
709
+ let high = 0xD800 + ( code >> 10 ) ;
710
+ let low = 0xDC00 + ( code & 0x3FF ) ;
711
+ out. extend_from_slice ( & ( high as u16 ) . to_le_bytes ( ) ) ;
712
+ out. extend_from_slice ( & ( low as u16 ) . to_le_bytes ( ) ) ;
713
+ }
714
+ }
715
+ Ok ( out)
716
+ }
717
+
718
+ pub fn decode < Ctx : DecodeContext , E : DecodeErrorHandler < Ctx > > (
719
+ mut ctx : Ctx ,
720
+ errors : & E ,
721
+ final_decode : bool ,
722
+ ) -> Result < ( Wtf8Buf , usize ) , Ctx :: Error > {
723
+ let mut out = Wtf8Buf :: new ( ) ;
724
+
725
+ while ctx. remaining_data ( ) . len ( ) >= 2 {
726
+ let data = ctx. remaining_data ( ) ;
727
+ let ch = u16:: from_le_bytes ( [ data[ 0 ] , data[ 1 ] ] ) ;
728
+
729
+ if ch < 0xD800 || ch > 0xDFFF {
730
+ // BMP character
731
+ if let Some ( c) = char:: from_u32 ( ch as u32 ) {
732
+ out. push_str ( & c. to_string ( ) ) ;
733
+ ctx. advance ( 2 ) ;
734
+ } else {
735
+ let pos = ctx. position ( ) ;
736
+ let replace =
737
+ ctx. handle_error ( errors, pos..pos + 2 , Some ( "invalid character" ) ) ?;
738
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
739
+ // Don't advance here, the error handler already positioned us
740
+ }
741
+ } else if ch >= 0xD800 && ch <= 0xDBFF {
742
+ // High surrogate
743
+ if data. len ( ) < 4 {
744
+ if final_decode {
745
+ let pos = ctx. position ( ) ;
746
+ let replace =
747
+ ctx. handle_error ( errors, pos..pos + 2 , Some ( "unexpected end of data" ) ) ?;
748
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
749
+ // Don't advance here, the error handler already positioned us
750
+ } else {
751
+ // In partial mode, stop here and return what we have
752
+ break ;
753
+ }
754
+ } else {
755
+ let ch2 = u16:: from_le_bytes ( [ data[ 2 ] , data[ 3 ] ] ) ;
756
+ if ch2 >= 0xDC00 && ch2 <= 0xDFFF {
757
+ // Valid surrogate pair
758
+ let code = ( ( ( ch & 0x3FF ) as u32 ) << 10 ) | ( ( ch2 & 0x3FF ) as u32 ) ;
759
+ let code_point = code + 0x10000 ;
760
+ if let Some ( c) = char:: from_u32 ( code_point) {
761
+ out. push_str ( & c. to_string ( ) ) ;
762
+ ctx. advance ( 4 ) ;
763
+ } else {
764
+ let pos = ctx. position ( ) ;
765
+ let replace = ctx. handle_error (
766
+ errors,
767
+ pos..pos + 4 ,
768
+ Some ( "invalid surrogate pair" ) ,
769
+ ) ?;
770
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
771
+ // Don't advance here, the error handler already positioned us
772
+ }
773
+ } else {
774
+ // Invalid surrogate pair
775
+ let pos = ctx. position ( ) ;
776
+ let replace = ctx. handle_error (
777
+ errors,
778
+ pos..pos + 2 ,
779
+ Some ( "illegal UTF-16 surrogate" ) ,
780
+ ) ?;
781
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
782
+ // Don't advance here, the error handler already positioned us
783
+ }
784
+ }
785
+ } else {
786
+ // Low surrogate without high surrogate
787
+ let pos = ctx. position ( ) ;
788
+ let replace =
789
+ ctx. handle_error ( errors, pos..pos + 2 , Some ( "illegal UTF-16 surrogate" ) ) ?;
790
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
791
+ // Don't advance here, the error handler already positioned us
792
+ }
793
+ }
794
+
795
+ // Handle remaining single byte
796
+ if ctx. remaining_data ( ) . len ( ) == 1 {
797
+ if final_decode {
798
+ let pos = ctx. position ( ) ;
799
+ let replace = ctx. handle_error ( errors, pos..pos + 1 , Some ( "truncated data" ) ) ?;
800
+ out. push_wtf8 ( replace. as_ref ( ) ) ;
801
+ // Don't advance here, the error handler already positioned us
802
+ }
803
+ // In partial mode, just leave it for next call
804
+ }
805
+
806
+ Ok ( ( out, ctx. position ( ) ) )
807
+ }
808
+ }
0 commit comments