Skip to content

Commit ba1b581

Browse files
committed
Update encoding to use wtf8
1 parent 7f4582b commit ba1b581

File tree

18 files changed

+242
-138
lines changed

18 files changed

+242
-138
lines changed

Lib/test/test_cmd_line_script.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,8 @@ def test_pep_409_verbiage(self):
574574
self.assertTrue(text[1].startswith(' File '))
575575
self.assertTrue(text[3].startswith('NameError'))
576576

577+
# TODO: RUSTPYTHON
578+
@unittest.expectedFailure
577579
def test_non_ascii(self):
578580
# Mac OS X denies the creation of a file with an invalid UTF-8 name.
579581
# Windows allows creating a name with an arbitrary bytes name, but

common/src/encodings.rs

Lines changed: 74 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,22 @@
11
use std::ops::Range;
22

3+
use num_traits::ToPrimitive;
4+
5+
use crate::str::StrKind;
36
use crate::wtf8::{Wtf8, Wtf8Buf};
47

58
pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
69

710
pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
811

912
pub trait StrBuffer: AsRef<Wtf8> {
10-
fn is_ascii(&self) -> bool {
11-
self.as_ref().is_ascii()
13+
fn is_compatible_with(&self, kind: StrKind) -> bool {
14+
let s = self.as_ref();
15+
match kind {
16+
StrKind::Ascii => s.is_ascii(),
17+
StrKind::Utf8 => s.is_utf8(),
18+
StrKind::Wtf8 => true,
19+
}
1220
}
1321
}
1422

@@ -18,7 +26,7 @@ pub trait ErrorHandler {
1826
type BytesBuf: AsRef<[u8]>;
1927
fn handle_encode_error(
2028
&self,
21-
data: &str,
29+
data: &Wtf8,
2230
char_range: Range<usize>,
2331
reason: &str,
2432
) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
@@ -29,7 +37,7 @@ pub trait ErrorHandler {
2937
reason: &str,
3038
) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
3139
fn error_oob_restart(&self, i: usize) -> Self::Error;
32-
fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
40+
fn error_encoding(&self, data: &Wtf8, char_range: Range<usize>, reason: &str) -> Self::Error;
3341
}
3442
pub enum EncodeReplace<S, B> {
3543
Str(S),
@@ -118,14 +126,61 @@ where
118126
Ok((out, remaining_index))
119127
}
120128

129+
#[inline]
130+
fn encode_utf8_compatible<E: ErrorHandler>(
131+
s: &Wtf8,
132+
errors: &E,
133+
err_reason: &str,
134+
target_kind: StrKind,
135+
) -> Result<Vec<u8>, E::Error> {
136+
let full_data = s;
137+
let mut data = s;
138+
let mut char_data_index = 0;
139+
let mut out = Vec::<u8>::new();
140+
while let Some((char_i, (byte_i, _))) = data
141+
.code_point_indices()
142+
.enumerate()
143+
.find(|(_, (_, c))| !target_kind.can_encode(*c))
144+
{
145+
out.extend_from_slice(&data.as_bytes()[..byte_i]);
146+
let char_start = char_data_index + char_i;
147+
148+
// number of consecutive non-encodable code points, starting at the first one found and ending before the next encodable code point
149+
let non_compat_run_length = data[byte_i..]
150+
.code_points()
151+
.take_while(|c| !target_kind.can_encode(*c))
152+
.count();
153+
let char_range = char_start..char_start + non_compat_run_length;
154+
let (replace, char_restart) =
155+
errors.handle_encode_error(full_data, char_range.clone(), err_reason)?;
156+
match replace {
157+
EncodeReplace::Str(s) => {
158+
if s.is_compatible_with(target_kind) {
159+
out.extend_from_slice(s.as_ref().as_bytes());
160+
} else {
161+
return Err(errors.error_encoding(full_data, char_range, err_reason));
162+
}
163+
}
164+
EncodeReplace::Bytes(b) => {
165+
out.extend_from_slice(b.as_ref());
166+
}
167+
}
168+
data = crate::str::try_get_codepoints(full_data, char_restart..)
169+
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
170+
char_data_index = char_restart;
171+
}
172+
out.extend_from_slice(data.as_bytes());
173+
Ok(out)
174+
}
175+
121176
pub mod utf8 {
122177
use super::*;
123178

124179
pub const ENCODING_NAME: &str = "utf-8";
125180

126181
#[inline]
127-
pub fn encode<E: ErrorHandler>(s: &str, _errors: &E) -> Result<Vec<u8>, E::Error> {
128-
Ok(s.as_bytes().to_vec())
182+
pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
183+
encode_utf8_compatible(s, errors, "surrogates not allowed", StrKind::Utf8)
129184
}
130185

131186
pub fn decode<E: ErrorHandler>(
@@ -175,21 +230,22 @@ pub mod utf8 {
175230
}
176231

177232
pub mod latin_1 {
233+
178234
use super::*;
179235

180236
pub const ENCODING_NAME: &str = "latin-1";
181237

182238
const ERR_REASON: &str = "ordinal not in range(256)";
183239

184240
#[inline]
185-
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
241+
pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
186242
let full_data = s;
187243
let mut data = s;
188244
let mut char_data_index = 0;
189245
let mut out = Vec::<u8>::new();
190246
loop {
191247
match data
192-
.char_indices()
248+
.code_point_indices()
193249
.enumerate()
194250
.find(|(_, (_, c))| !c.is_ascii())
195251
{
@@ -200,17 +256,16 @@ pub mod latin_1 {
200256
Some((char_i, (byte_i, ch))) => {
201257
out.extend_from_slice(&data.as_bytes()[..byte_i]);
202258
let char_start = char_data_index + char_i;
203-
if (ch as u32) <= 255 {
204-
out.push(ch as u8);
205-
let char_restart = char_start + 1;
206-
data = crate::str::try_get_chars(full_data, char_restart..)
207-
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
208-
char_data_index = char_restart;
259+
if let Some(byte) = ch.to_u32().to_u8() {
260+
out.push(byte);
261+
// if the codepoint is between 128..=255, its UTF-8 length is 2
262+
data = &data[byte_i + 2..];
263+
char_data_index = char_start + 1;
209264
} else {
210265
// number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
211266
let non_latin_1_run_length = data[byte_i..]
212-
.chars()
213-
.take_while(|c| (*c as u32) > 255)
267+
.code_points()
268+
.take_while(|c| c.to_u32() > 255)
214269
.count();
215270
let char_range = char_start..char_start + non_latin_1_run_length;
216271
let (replace, char_restart) = errors.handle_encode_error(
@@ -231,7 +286,7 @@ pub mod latin_1 {
231286
out.extend_from_slice(b.as_ref());
232287
}
233288
}
234-
data = crate::str::try_get_chars(full_data, char_restart..)
289+
data = crate::str::try_get_codepoints(full_data, char_restart..)
235290
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
236291
char_data_index = char_restart;
237292
}
@@ -258,51 +313,8 @@ pub mod ascii {
258313
const ERR_REASON: &str = "ordinal not in range(128)";
259314

260315
#[inline]
261-
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
262-
let full_data = s;
263-
let mut data = s;
264-
let mut char_data_index = 0;
265-
let mut out = Vec::<u8>::new();
266-
loop {
267-
match data
268-
.char_indices()
269-
.enumerate()
270-
.find(|(_, (_, c))| !c.is_ascii())
271-
{
272-
None => {
273-
out.extend_from_slice(data.as_bytes());
274-
break;
275-
}
276-
Some((char_i, (byte_i, _))) => {
277-
out.extend_from_slice(&data.as_bytes()[..byte_i]);
278-
let char_start = char_data_index + char_i;
279-
// number of non-ascii chars between the first non-ascii char and the next ascii char
280-
let non_ascii_run_length =
281-
data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
282-
let char_range = char_start..char_start + non_ascii_run_length;
283-
let (replace, char_restart) =
284-
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
285-
match replace {
286-
EncodeReplace::Str(s) => {
287-
if !s.is_ascii() {
288-
return Err(
289-
errors.error_encoding(full_data, char_range, ERR_REASON)
290-
);
291-
}
292-
out.extend_from_slice(s.as_ref().as_bytes());
293-
}
294-
EncodeReplace::Bytes(b) => {
295-
out.extend_from_slice(b.as_ref());
296-
}
297-
}
298-
data = crate::str::try_get_chars(full_data, char_restart..)
299-
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
300-
char_data_index = char_restart;
301-
continue;
302-
}
303-
}
304-
}
305-
Ok(out)
316+
pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
317+
encode_utf8_compatible(s, errors, ERR_REASON, StrKind::Ascii)
306318
}
307319

308320
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(Wtf8Buf, usize), E::Error> {

common/src/str.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub type wchar_t = libc::wchar_t;
1414
pub type wchar_t = u32;
1515

1616
/// Utf8 + state.ascii (+ PyUnicode_Kind in future)
17-
#[derive(Debug, Copy, Clone, PartialEq)]
17+
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
1818
pub enum StrKind {
1919
Ascii,
2020
Utf8,
@@ -41,6 +41,15 @@ impl StrKind {
4141
pub fn is_utf8(&self) -> bool {
4242
matches!(self, Self::Ascii | Self::Utf8)
4343
}
44+
45+
#[inline(always)]
46+
pub fn can_encode(&self, code: CodePoint) -> bool {
47+
match self {
48+
StrKind::Ascii => code.is_ascii(),
49+
StrKind::Utf8 => code.to_char().is_some(),
50+
StrKind::Wtf8 => true,
51+
}
52+
}
4453
}
4554

4655
pub trait DeduceStrKind {

common/src/wtf8/mod.rs

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ use std::collections::TryReserveError;
4949
use std::string::String;
5050
use std::vec::Vec;
5151

52-
use bstr::ByteSlice;
52+
use bstr::{ByteSlice, ByteVec};
5353

5454
mod core_char;
5555
mod core_str;
@@ -168,6 +168,10 @@ impl CodePoint {
168168
pub fn len_wtf8(&self) -> usize {
169169
len_utf8(self.value)
170170
}
171+
172+
pub fn is_ascii(&self) -> bool {
173+
self.is_char_and(|c| c.is_ascii())
174+
}
171175
}
172176

173177
impl From<u16> for CodePoint {
@@ -436,6 +440,13 @@ impl Wtf8Buf {
436440
self.push_wtf8(code_point.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
437441
}
438442

443+
pub fn pop(&mut self) -> Option<CodePoint> {
444+
let ch = self.code_points().next_back()?;
445+
let newlen = self.len() - ch.len_wtf8();
446+
self.bytes.truncate(newlen);
447+
Some(ch)
448+
}
449+
439450
/// Shortens a string to the specified length.
440451
///
441452
/// # Panics
@@ -448,6 +459,20 @@ impl Wtf8Buf {
448459
self.bytes.truncate(new_len)
449460
}
450461

462+
/// Inserts a codepoint into this `Wtf8Buf` at a byte position.
463+
#[inline]
464+
pub fn insert(&mut self, idx: usize, c: CodePoint) {
465+
self.insert_wtf8(idx, c.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
466+
}
467+
468+
/// Inserts a WTF-8 slice into this `Wtf8Buf` at a byte position.
469+
#[inline]
470+
pub fn insert_wtf8(&mut self, idx: usize, w: &Wtf8) {
471+
assert!(is_code_point_boundary(self, idx));
472+
473+
self.bytes.insert_str(idx, w)
474+
}
475+
451476
/// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
452477
#[inline]
453478
pub fn into_bytes(self) -> Vec<u8> {
@@ -914,6 +939,21 @@ impl Wtf8 {
914939
.map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
915940
}
916941

942+
pub fn trim(&self) -> &Self {
943+
let w = self.bytes.trim();
944+
unsafe { Wtf8::from_bytes_unchecked(w) }
945+
}
946+
947+
pub fn trim_start(&self) -> &Self {
948+
let w = self.bytes.trim_start();
949+
unsafe { Wtf8::from_bytes_unchecked(w) }
950+
}
951+
952+
pub fn trim_end(&self) -> &Self {
953+
let w = self.bytes.trim_end();
954+
unsafe { Wtf8::from_bytes_unchecked(w) }
955+
}
956+
917957
pub fn trim_start_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
918958
let mut iter = self.code_points();
919959
loop {
@@ -958,6 +998,15 @@ impl Wtf8 {
958998
memchr::memmem::rfind(self.as_bytes(), pat.as_bytes())
959999
}
9601000

1001+
pub fn contains(&self, pat: &Wtf8) -> bool {
1002+
self.bytes.contains_str(pat)
1003+
}
1004+
1005+
pub fn contains_code_point(&self, pat: CodePoint) -> bool {
1006+
self.bytes
1007+
.contains_str(pat.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
1008+
}
1009+
9611010
pub fn get(&self, range: impl ops::RangeBounds<usize>) -> Option<&Self> {
9621011
let start = match range.start_bound() {
9631012
ops::Bound::Included(&i) => i,
@@ -977,6 +1026,26 @@ impl Wtf8 {
9771026
None
9781027
}
9791028
}
1029+
1030+
pub fn ends_with(&self, w: &Wtf8) -> bool {
1031+
self.bytes.ends_with_str(w)
1032+
}
1033+
1034+
pub fn starts_with(&self, w: &Wtf8) -> bool {
1035+
self.bytes.starts_with_str(w)
1036+
}
1037+
1038+
pub fn strip_prefix(&self, w: &Wtf8) -> Option<&Self> {
1039+
self.bytes
1040+
.strip_prefix(w.as_bytes())
1041+
.map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1042+
}
1043+
1044+
pub fn strip_suffix(&self, w: &Wtf8) -> Option<&Self> {
1045+
self.bytes
1046+
.strip_suffix(w.as_bytes())
1047+
.map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
1048+
}
9801049
}
9811050

9821051
impl AsRef<Wtf8> for str {

0 commit comments

Comments
 (0)