1
1
use std:: ops:: Range ;
2
2
3
+ use num_traits:: ToPrimitive ;
4
+
5
+ use crate :: str:: StrKind ;
3
6
use crate :: wtf8:: { Wtf8 , Wtf8Buf } ;
4
7
5
8
/// Result returned by an encode error handler: on success, the replacement to
/// emit (an `EncodeReplace`) paired with a `usize` that the encoders in this
/// file use as the code-point index at which encoding resumes.
pub type EncodeErrorResult < S , B , E > = Result < ( EncodeReplace < S , B > , usize ) , E > ;
6
9
7
10
/// Result returned by a decode error handler.
///
/// NOTE(review): the meaning of the tuple fields is not visible in this part
/// of the file; presumably (replacement string, optional replacement input
/// bytes, restart index) -- confirm against the decode handler's callers.
pub type DecodeErrorResult < S , B , E > = Result < ( S , Option < B > , usize ) , E > ;
8
11
9
12
pub trait StrBuffer : AsRef < Wtf8 > {
10
- fn is_ascii ( & self ) -> bool {
11
- self . as_ref ( ) . is_ascii ( )
13
+ fn is_compatible_with ( & self , kind : StrKind ) -> bool {
14
+ let s = self . as_ref ( ) ;
15
+ match kind {
16
+ StrKind :: Ascii => s. is_ascii ( ) ,
17
+ StrKind :: Utf8 => s. is_utf8 ( ) ,
18
+ StrKind :: Wtf8 => true ,
19
+ }
12
20
}
13
21
}
14
22
@@ -18,7 +26,7 @@ pub trait ErrorHandler {
18
26
type BytesBuf : AsRef < [ u8 ] > ;
19
27
fn handle_encode_error (
20
28
& self ,
21
- data : & str ,
29
+ data : & Wtf8 ,
22
30
char_range : Range < usize > ,
23
31
reason : & str ,
24
32
) -> EncodeErrorResult < Self :: StrBuf , Self :: BytesBuf , Self :: Error > ;
@@ -29,7 +37,7 @@ pub trait ErrorHandler {
29
37
reason : & str ,
30
38
) -> DecodeErrorResult < Self :: StrBuf , Self :: BytesBuf , Self :: Error > ;
31
39
fn error_oob_restart ( & self , i : usize ) -> Self :: Error ;
32
- fn error_encoding ( & self , data : & str , char_range : Range < usize > , reason : & str ) -> Self :: Error ;
40
+ fn error_encoding ( & self , data : & Wtf8 , char_range : Range < usize > , reason : & str ) -> Self :: Error ;
33
41
}
34
42
pub enum EncodeReplace < S , B > {
35
43
Str ( S ) ,
@@ -118,14 +126,61 @@ where
118
126
Ok ( ( out, remaining_index) )
119
127
}
120
128
129
+ #[ inline]
130
+ fn encode_utf8_compatible < E : ErrorHandler > (
131
+ s : & Wtf8 ,
132
+ errors : & E ,
133
+ err_reason : & str ,
134
+ target_kind : StrKind ,
135
+ ) -> Result < Vec < u8 > , E :: Error > {
136
+ let full_data = s;
137
+ let mut data = s;
138
+ let mut char_data_index = 0 ;
139
+ let mut out = Vec :: < u8 > :: new ( ) ;
140
+ while let Some ( ( char_i, ( byte_i, _) ) ) = data
141
+ . code_point_indices ( )
142
+ . enumerate ( )
143
+ . find ( |( _, ( _, c) ) | !target_kind. can_encode ( * c) )
144
+ {
145
+ out. extend_from_slice ( & data. as_bytes ( ) [ ..byte_i] ) ;
146
+ let char_start = char_data_index + char_i;
147
+
148
+ // number of non-compatible chars between the first non-compatible char and the next compatible char
149
+ let non_compat_run_length = data[ byte_i..]
150
+ . code_points ( )
151
+ . take_while ( |c| !target_kind. can_encode ( * c) )
152
+ . count ( ) ;
153
+ let char_range = char_start..char_start + non_compat_run_length;
154
+ let ( replace, char_restart) =
155
+ errors. handle_encode_error ( full_data, char_range. clone ( ) , err_reason) ?;
156
+ match replace {
157
+ EncodeReplace :: Str ( s) => {
158
+ if s. is_compatible_with ( target_kind) {
159
+ out. extend_from_slice ( s. as_ref ( ) . as_bytes ( ) ) ;
160
+ } else {
161
+ return Err ( errors. error_encoding ( full_data, char_range, err_reason) ) ;
162
+ }
163
+ }
164
+ EncodeReplace :: Bytes ( b) => {
165
+ out. extend_from_slice ( b. as_ref ( ) ) ;
166
+ }
167
+ }
168
+ data = crate :: str:: try_get_codepoints ( full_data, char_restart..)
169
+ . ok_or_else ( || errors. error_oob_restart ( char_restart) ) ?;
170
+ char_data_index = char_restart;
171
+ }
172
+ out. extend_from_slice ( data. as_bytes ( ) ) ;
173
+ Ok ( out)
174
+ }
175
+
121
176
pub mod utf8 {
122
177
use super :: * ;
123
178
124
179
pub const ENCODING_NAME : & str = "utf-8" ;
125
180
126
181
#[ inline]
127
- pub fn encode < E : ErrorHandler > ( s : & str , _errors : & E ) -> Result < Vec < u8 > , E :: Error > {
128
- Ok ( s . as_bytes ( ) . to_vec ( ) )
182
+ pub fn encode < E : ErrorHandler > ( s : & Wtf8 , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
183
+ encode_utf8_compatible ( s , errors , "surrogates not allowed" , StrKind :: Utf8 )
129
184
}
130
185
131
186
pub fn decode < E : ErrorHandler > (
@@ -175,21 +230,22 @@ pub mod utf8 {
175
230
}
176
231
177
232
pub mod latin_1 {
233
+
178
234
use super :: * ;
179
235
180
236
pub const ENCODING_NAME : & str = "latin-1" ;
181
237
182
238
const ERR_REASON : & str = "ordinal not in range(256)" ;
183
239
184
240
#[ inline]
185
- pub fn encode < E : ErrorHandler > ( s : & str , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
241
+ pub fn encode < E : ErrorHandler > ( s : & Wtf8 , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
186
242
let full_data = s;
187
243
let mut data = s;
188
244
let mut char_data_index = 0 ;
189
245
let mut out = Vec :: < u8 > :: new ( ) ;
190
246
loop {
191
247
match data
192
- . char_indices ( )
248
+ . code_point_indices ( )
193
249
. enumerate ( )
194
250
. find ( |( _, ( _, c) ) | !c. is_ascii ( ) )
195
251
{
@@ -200,17 +256,16 @@ pub mod latin_1 {
200
256
Some ( ( char_i, ( byte_i, ch) ) ) => {
201
257
out. extend_from_slice ( & data. as_bytes ( ) [ ..byte_i] ) ;
202
258
let char_start = char_data_index + char_i;
203
- if ( ch as u32 ) <= 255 {
204
- out. push ( ch as u8 ) ;
205
- let char_restart = char_start + 1 ;
206
- data = crate :: str:: try_get_chars ( full_data, char_restart..)
207
- . ok_or_else ( || errors. error_oob_restart ( char_restart) ) ?;
208
- char_data_index = char_restart;
259
+ if let Some ( byte) = ch. to_u32 ( ) . to_u8 ( ) {
260
+ out. push ( byte) ;
261
+ // if the codepoint is between 128..=255, its utf8-length is 2
262
+ data = & data[ byte_i + 2 ..] ;
263
+ char_data_index = char_start + 1 ;
209
264
} else {
210
265
// number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
211
266
let non_latin_1_run_length = data[ byte_i..]
212
- . chars ( )
213
- . take_while ( |c| ( * c as u32 ) > 255 )
267
+ . code_points ( )
268
+ . take_while ( |c| c . to_u32 ( ) > 255 )
214
269
. count ( ) ;
215
270
let char_range = char_start..char_start + non_latin_1_run_length;
216
271
let ( replace, char_restart) = errors. handle_encode_error (
@@ -231,7 +286,7 @@ pub mod latin_1 {
231
286
out. extend_from_slice ( b. as_ref ( ) ) ;
232
287
}
233
288
}
234
- data = crate :: str:: try_get_chars ( full_data, char_restart..)
289
+ data = crate :: str:: try_get_codepoints ( full_data, char_restart..)
235
290
. ok_or_else ( || errors. error_oob_restart ( char_restart) ) ?;
236
291
char_data_index = char_restart;
237
292
}
@@ -258,51 +313,8 @@ pub mod ascii {
258
313
const ERR_REASON : & str = "ordinal not in range(128)" ;
259
314
260
315
#[ inline]
261
- pub fn encode < E : ErrorHandler > ( s : & str , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
262
- let full_data = s;
263
- let mut data = s;
264
- let mut char_data_index = 0 ;
265
- let mut out = Vec :: < u8 > :: new ( ) ;
266
- loop {
267
- match data
268
- . char_indices ( )
269
- . enumerate ( )
270
- . find ( |( _, ( _, c) ) | !c. is_ascii ( ) )
271
- {
272
- None => {
273
- out. extend_from_slice ( data. as_bytes ( ) ) ;
274
- break ;
275
- }
276
- Some ( ( char_i, ( byte_i, _) ) ) => {
277
- out. extend_from_slice ( & data. as_bytes ( ) [ ..byte_i] ) ;
278
- let char_start = char_data_index + char_i;
279
- // number of non-ascii chars between the first non-ascii char and the next ascii char
280
- let non_ascii_run_length =
281
- data[ byte_i..] . chars ( ) . take_while ( |c| !c. is_ascii ( ) ) . count ( ) ;
282
- let char_range = char_start..char_start + non_ascii_run_length;
283
- let ( replace, char_restart) =
284
- errors. handle_encode_error ( full_data, char_range. clone ( ) , ERR_REASON ) ?;
285
- match replace {
286
- EncodeReplace :: Str ( s) => {
287
- if !s. is_ascii ( ) {
288
- return Err (
289
- errors. error_encoding ( full_data, char_range, ERR_REASON )
290
- ) ;
291
- }
292
- out. extend_from_slice ( s. as_ref ( ) . as_bytes ( ) ) ;
293
- }
294
- EncodeReplace :: Bytes ( b) => {
295
- out. extend_from_slice ( b. as_ref ( ) ) ;
296
- }
297
- }
298
- data = crate :: str:: try_get_chars ( full_data, char_restart..)
299
- . ok_or_else ( || errors. error_oob_restart ( char_restart) ) ?;
300
- char_data_index = char_restart;
301
- continue ;
302
- }
303
- }
304
- }
305
- Ok ( out)
316
+ pub fn encode < E : ErrorHandler > ( s : & Wtf8 , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
317
+ encode_utf8_compatible ( s, errors, ERR_REASON , StrKind :: Ascii )
306
318
}
307
319
308
320
pub fn decode < E : ErrorHandler > ( data : & [ u8 ] , errors : & E ) -> Result < ( Wtf8Buf , usize ) , E :: Error > {
0 commit comments