Skip to content

Commit e9e116b

Browse files
committed
Parse surrogates in string literals properly
1 parent 2ab8716 commit e9e116b

File tree

15 files changed

+503
-79
lines changed

15 files changed

+503
-79
lines changed

Cargo.lock

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_codeccallbacks.py

-6
Original file line numberDiff line numberDiff line change
@@ -536,8 +536,6 @@ def test_badandgoodxmlcharrefreplaceexceptions(self):
536536
("".join("&#%d;" % c for c in cs), 1 + len(s))
537537
)
538538

539-
# TODO: RUSTPYTHON
540-
@unittest.expectedFailure
541539
def test_badandgoodbackslashreplaceexceptions(self):
542540
# "backslashreplace" complains about a non-exception passed in
543541
self.assertRaises(
@@ -596,8 +594,6 @@ def test_badandgoodbackslashreplaceexceptions(self):
596594
(r, 2)
597595
)
598596

599-
# TODO: RUSTPYTHON
600-
@unittest.expectedFailure
601597
def test_badandgoodnamereplaceexceptions(self):
602598
# "namereplace" complains about a non-exception passed in
603599
self.assertRaises(
@@ -644,8 +640,6 @@ def test_badandgoodnamereplaceexceptions(self):
644640
(r, 1 + len(s))
645641
)
646642

647-
# TODO: RUSTPYTHON
648-
@unittest.expectedFailure
649643
def test_badandgoodsurrogateescapeexceptions(self):
650644
surrogateescape_errors = codecs.lookup_error('surrogateescape')
651645
# "surrogateescape" complains about a non-exception passed in

common/src/encodings.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ pub mod errors {
401401
let mut out = String::with_capacity(num_chars * 4);
402402
for c in err_str.code_points() {
403403
let c_u32 = c.to_u32();
404-
if let Some(c_name) = unicode_names2::name(c.to_char_lossy()) {
404+
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
405405
write!(out, "\\N{{{c_name}}}").unwrap();
406406
} else if c_u32 >= 0x10000 {
407407
write!(out, "\\U{c_u32:08x}").unwrap();

common/src/wtf8/mod.rs

+53
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,12 @@ impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
574574
}
575575
}
576576

577+
/// Hashes an owned WTF-8 buffer by forwarding to the `Hash` impl for `Wtf8`.
impl Hash for Wtf8Buf {
578+
fn hash<H: Hasher>(&self, state: &mut H) {
579+
// `self` is handed to the slice impl directly; presumably this relies on
// `Wtf8Buf` dereferencing to `Wtf8` — confirm against the rest of the file.
Wtf8::hash(self, state)
580+
}
581+
}
582+
577583
impl AsRef<Wtf8> for Wtf8Buf {
578584
fn as_ref(&self) -> &Wtf8 {
579585
self
@@ -692,6 +698,13 @@ impl Default for &Wtf8 {
692698
}
693699
}
694700

701+
/// Hashes a WTF-8 slice as its raw bytes followed by a single `0xff` byte.
impl Hash for Wtf8 {
702+
fn hash<H: Hasher>(&self, state: &mut H) {
703+
state.write(self.as_bytes());
704+
// NOTE(review): the trailing 0xff looks like the same terminator scheme std
// uses when hashing `str` — confirm that matching `str` hashes is intended.
state.write_u8(0xff);
705+
}
706+
}
707+
695708
impl Wtf8 {
696709
/// Creates a WTF-8 slice from a UTF-8 `&str` slice.
697710
///
@@ -722,6 +735,32 @@ impl Wtf8 {
722735
unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
723736
}
724737

738+
/// Create a WTF-8 slice from a WTF-8 byte slice.
///
/// Returns `None` when `b` contains a position that is neither valid UTF-8
/// nor the start of a three-byte encoding of a surrogate code point
/// (`0xD800..=0xDFFF`). Otherwise returns `b` wrapped via
/// `from_bytes_unchecked`.
739+
//
740+
// whooops! using WTF-8 for interchange!
// (i.e. this constructor accepts externally supplied WTF-8 bytes.)
741+
#[inline]
742+
pub fn from_bytes(b: &[u8]) -> Option<&Self> {
743+
let mut rest = b;
744+
// Each UTF-8 validation error must sit exactly on an encoded surrogate;
// anything else makes the whole input invalid.
while let Err(e) = std::str::from_utf8(rest) {
745+
// Drop the prefix that was already confirmed to be valid UTF-8.
rest = &rest[e.valid_up_to()..];
746+
// Only the success/failure matters here; the decoded value is discarded.
Self::decode_surrogate(rest)?;
747+
// `decode_surrogate` succeeded, so at least three bytes are present.
rest = &rest[3..];
748+
}
749+
// NOTE(review): a lead surrogate directly followed by a trail surrogate is
// accepted by the loop above; the WTF-8 spec treats such a pair as
// ill-formed — confirm whether that case needs rejecting.
// SAFETY (as far as this function checks): every byte of `b` belongs either
// to valid UTF-8 or to a three-byte surrogate encoding.
Some(unsafe { Wtf8::from_bytes_unchecked(b) })
750+
}
751+
752+
/// Decodes the first three bytes of `b` as a surrogate code point.
///
/// Returns `None` if `b` has fewer than three bytes, if those bytes do not
/// have the bit pattern of a three-byte sequence, or if the decoded value
/// lies outside `0xD800..=0xDFFF`.
fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
753+
// Need at least three bytes; note that `b` is shadowed by the second byte below.
let [a, b, c, ..] = *b else { return None };
754+
// Lead byte must be 1110xxxx and both following bytes 10xxxxxx.
if (a & 0xf0) == 0xe0 && (b & 0xc0) == 0x80 && (c & 0xc0) == 0x80 {
755+
// it's a three-byte code
756+
// Combine the 4 + 6 + 6 payload bits into one value (shadows the third byte).
let c = ((a as u32 & 0x0f) << 12) + ((b as u32 & 0x3f) << 6) + (c as u32 & 0x3f);
757+
// Accept only values in the surrogate range.
let 0xD800..=0xDFFF = c else { return None };
758+
Some(CodePoint { value: c })
759+
} else {
760+
None
761+
}
762+
}
763+
725764
/// Returns the length, in WTF-8 bytes.
726765
#[inline]
727766
pub fn len(&self) -> usize {
@@ -875,6 +914,14 @@ impl Wtf8 {
875914
}
876915
}
877916

917+
/// Returns the surrogate decoded from the last three bytes when the slice
/// ends with `0xED`, a byte in `0xA0..=0xAF`, and one more byte; `None`
/// otherwise.
#[inline]
918+
fn final_lead_surrogate(&self) -> Option<u16> {
919+
match self.bytes {
920+
// NOTE(review): `decode_surrogate(b2, b3)` here is a two-argument free
// function not visible in this view — presumably it rebuilds the 16-bit
// surrogate from the two trailing bytes; confirm.
[.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
921+
_ => None,
922+
}
923+
}
924+
878925
pub fn is_code_point_boundary(&self, index: usize) -> bool {
879926
is_code_point_boundary(self, index)
880927
}
@@ -1481,6 +1528,12 @@ impl From<Wtf8Buf> for Box<Wtf8> {
14811528
}
14821529
}
14831530

1531+
/// Converts a boxed WTF-8 slice into an owned buffer by calling `Wtf8Buf::from_box`.
impl From<Box<Wtf8>> for Wtf8Buf {
1532+
fn from(w: Box<Wtf8>) -> Self {
1533+
Wtf8Buf::from_box(w)
1534+
}
1535+
}
1536+
14841537
impl From<String> for Box<Wtf8> {
14851538
fn from(s: String) -> Self {
14861539
s.into_boxed_str().into()

compiler/codegen/Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ num-complex = { workspace = true }
3030
num-traits = { workspace = true }
3131
thiserror = { workspace = true }
3232
malachite-bigint = { workspace = true }
33+
memchr = { workspace = true }
34+
unicode_names2 = { workspace = true }
3335

3436
[dev-dependencies]
3537
# rustpython-parser = { workspace = true }

0 commit comments

Comments
 (0)