Skip to content

Commit cace112

Browse files
committed
Allow surrogates in str
1 parent e3a1031 commit cace112

24 files changed

+2747
-476
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

common/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,14 @@ rustpython-literal = { workspace = true }
1616

1717
ascii = { workspace = true }
1818
bitflags = { workspace = true }
19+
bstr = { workspace = true }
1920
cfg-if = { workspace = true }
2021
itertools = { workspace = true }
2122
libc = { workspace = true }
2223
malachite-bigint = { workspace = true }
2324
malachite-q = { workspace = true }
2425
malachite-base = { workspace = true }
26+
memchr = { workspace = true }
2527
num-complex = { workspace = true }
2628
num-traits = { workspace = true }
2729
once_cell = { workspace = true }

common/src/encodings.rs

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
use std::ops::Range;
22

3+
use crate::wtf8::{Wtf8, Wtf8Buf};
4+
35
pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
46

57
pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
68

7-
pub trait StrBuffer: AsRef<str> {
9+
pub trait StrBuffer: AsRef<Wtf8> {
810
fn is_ascii(&self) -> bool {
911
self.as_ref().is_ascii()
1012
}
@@ -63,19 +65,19 @@ fn decode_utf8_compatible<E: ErrorHandler, DecodeF, ErrF>(
6365
errors: &E,
6466
decode: DecodeF,
6567
handle_error: ErrF,
66-
) -> Result<(String, usize), E::Error>
68+
) -> Result<(Wtf8Buf, usize), E::Error>
6769
where
6870
DecodeF: Fn(&[u8]) -> Result<&str, DecodeError<'_>>,
6971
ErrF: Fn(&[u8], Option<usize>) -> HandleResult<'_>,
7072
{
7173
if data.is_empty() {
72-
return Ok((String::new(), 0));
74+
return Ok((Wtf8Buf::new(), 0));
7375
}
7476
// we need to coerce the lifetime to that of the function body rather than the
7577
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
7678
let mut data = data;
7779
let mut data_from_err: E::BytesBuf;
78-
let mut out = String::with_capacity(data.len());
80+
let mut out = Wtf8Buf::with_capacity(data.len());
7981
let mut remaining_index = 0;
8082
let mut remaining_data = data;
8183
loop {
@@ -98,7 +100,7 @@ where
98100
err_idx..err_len.map_or_else(|| data.len(), |len| err_idx + len);
99101
let (replace, new_data, restart) =
100102
errors.handle_decode_error(data, err_range, reason)?;
101-
out.push_str(replace.as_ref());
103+
out.push_wtf8(replace.as_ref());
102104
if let Some(new_data) = new_data {
103105
data_from_err = new_data;
104106
data = data_from_err.as_ref();
@@ -130,7 +132,7 @@ pub mod utf8 {
130132
data: &[u8],
131133
errors: &E,
132134
final_decode: bool,
133-
) -> Result<(String, usize), E::Error> {
135+
) -> Result<(Wtf8Buf, usize), E::Error> {
134136
decode_utf8_compatible(
135137
data,
136138
errors,
@@ -218,7 +220,7 @@ pub mod latin_1 {
218220
)?;
219221
match replace {
220222
EncodeReplace::Str(s) => {
221-
if s.as_ref().chars().any(|c| (c as u32) > 255) {
223+
if s.as_ref().code_points().any(|c| c.to_u32() > 255) {
222224
return Err(
223225
errors.error_encoding(full_data, char_range, ERR_REASON)
224226
);
@@ -240,10 +242,10 @@ pub mod latin_1 {
240242
Ok(out)
241243
}
242244

243-
pub fn decode<E: ErrorHandler>(data: &[u8], _errors: &E) -> Result<(String, usize), E::Error> {
245+
pub fn decode<E: ErrorHandler>(data: &[u8], _errors: &E) -> Result<(Wtf8Buf, usize), E::Error> {
244246
let out: String = data.iter().map(|c| *c as char).collect();
245247
let out_len = out.len();
246-
Ok((out, out_len))
248+
Ok((out.into(), out_len))
247249
}
248250
}
249251

@@ -303,7 +305,7 @@ pub mod ascii {
303305
Ok(out)
304306
}
305307

306-
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
308+
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(Wtf8Buf, usize), E::Error> {
307309
decode_utf8_compatible(
308310
data,
309311
errors,

common/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ pub mod static_cell;
2929
pub mod str;
3030
#[cfg(windows)]
3131
pub mod windows;
32+
pub mod wtf8;
3233

3334
pub mod vendored {
3435
pub use ascii;

0 commit comments

Comments
 (0)