Skip to content

Commit b36b32b

Browse files
committed
Make re wtf8-compatible
1 parent 3945d3b commit b36b32b

File tree

6 files changed: +87 additions, -9 deletions

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_re.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -854,8 +854,6 @@ def test_string_boundaries(self):
854854
# Can match around the whitespace.
855855
self.assertEqual(len(re.findall(r"\B", " ")), 2)
856856

857-
# TODO: RUSTPYTHON
858-
@unittest.expectedFailure
859857
def test_bigcharset(self):
860858
self.assertEqual(re.match("([\u2222\u2223])",
861859
"\u2222").group(1), "\u2222")

Lib/test/test_smtplib.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,8 +1459,6 @@ def test_send_unicode_with_SMTPUTF8_via_low_level_API(self):
14591459
self.assertIn('SMTPUTF8', self.serv.last_mail_options)
14601460
self.assertEqual(self.serv.last_rcpt_options, [])
14611461

1462-
# TODO: RUSTPYTHON
1463-
@unittest.expectedFailure
14641462
def test_send_message_uses_smtputf8_if_addrs_non_ascii(self):
14651463
msg = EmailMessage()
14661464
msg['From'] = "Páolo <főo@bar.com>"

vm/src/stdlib/sre.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ mod _sre {
99
PyCallableIterator, PyDictRef, PyGenericAlias, PyInt, PyList, PyListRef, PyStr,
1010
PyStrRef, PyTuple, PyTupleRef, PyTypeRef,
1111
},
12+
common::wtf8::{Wtf8, Wtf8Buf},
1213
common::{ascii, hash::PyHash},
1314
convert::ToPyObject,
1415
function::{ArgCallable, OptionalArg, PosArgs, PyComparisonValue},
@@ -66,10 +67,15 @@ mod _sre {
6667
}
6768
}
6869

69-
impl SreStr for &str {
70+
impl SreStr for &Wtf8 {
7071
fn slice(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef {
7172
vm.ctx
72-
.new_str(self.chars().take(end).skip(start).collect::<String>())
73+
.new_str(
74+
self.code_points()
75+
.take(end)
76+
.skip(start)
77+
.collect::<Wtf8Buf>(),
78+
)
7379
.into()
7480
}
7581
}
@@ -206,12 +212,12 @@ mod _sre {
206212
impl Pattern {
207213
fn with_str<F, R>(string: &PyObject, vm: &VirtualMachine, f: F) -> PyResult<R>
208214
where
209-
F: FnOnce(&str) -> PyResult<R>,
215+
F: FnOnce(&Wtf8) -> PyResult<R>,
210216
{
211217
let string = string.payload::<PyStr>().ok_or_else(|| {
212218
vm.new_type_error(format!("expected string got '{}'", string.class()))
213219
})?;
214-
f(string.as_str())
220+
f(string.as_wtf8())
215221
}
216222

217223
fn with_bytes<F, R>(string: &PyObject, vm: &VirtualMachine, f: F) -> PyResult<R>
@@ -425,7 +431,7 @@ mod _sre {
425431
let is_template = if zelf.isbytes {
426432
Self::with_bytes(&repl, vm, |x| Ok(x.contains(&b'\\')))?
427433
} else {
428-
Self::with_str(&repl, vm, |x| Ok(x.contains('\\')))?
434+
Self::with_str(&repl, vm, |x| Ok(x.contains("\\".as_ref())))?
429435
};
430436

431437
if is_template {

vm/sre_engine/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@ license.workspace = true
1414
name = "benches"
1515
harness = false
1616

17+
[features]
18+
default = ["wtf8"]
19+
wtf8 = ["rustpython-common"]
20+
1721
[dependencies]
22+
rustpython-common = { workspace = true, optional = true }
1823
num_enum = { workspace = true }
1924
bitflags = { workspace = true }
2025
optional = "0.5"

vm/sre_engine/src/string.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
#[cfg(feature = "wtf8")]
2+
use rustpython_common::wtf8::Wtf8;
3+
14
#[derive(Debug, Clone, Copy)]
25
pub struct StringCursor {
36
pub(crate) ptr: *const u8,
@@ -148,6 +151,73 @@ impl StrDrive for &str {
148151
}
149152
}
150153

154+
/// `StrDrive` implementation for borrowed `Wtf8` strings, compiled only when
/// the `wtf8` feature is enabled.
///
/// `StringCursor::position` is maintained in code points, while
/// `StringCursor::ptr` walks the bytes returned by `as_bytes()`.
///
/// NOTE(review): none of the `unsafe` calls below perform a bounds check in
/// this block; presumably callers keep the cursor inside the string's bytes
/// (never reading past the end or before the start) — confirm against the
/// engine's callers.
#[cfg(feature = "wtf8")]
155+
impl StrDrive for &Wtf8 {
156+
/// Returns the length of the string measured in code points.
#[inline]
157+
fn count(&self) -> usize {
158+
self.code_points().count()
159+
}
160+
161+
/// Builds a cursor at the first byte of the string (position 0) and then
/// moves it forward by `n` code points.
#[inline]
162+
fn create_cursor(&self, n: usize) -> StringCursor {
163+
let mut cursor = StringCursor {
164+
ptr: self.as_bytes().as_ptr(),
165+
position: 0,
166+
};
167+
Self::skip(&mut cursor, n);
168+
cursor
169+
}
170+
171+
/// Brings `cursor` to code-point position `n`.
///
/// A cursor with a null pointer, or one already past `n`, is rebuilt from the
/// start of the string; a cursor before `n` is moved forward by the
/// difference; a cursor already at `n` is left untouched.
#[inline]
172+
fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
173+
if cursor.ptr.is_null() || cursor.position > n {
174+
*cursor = Self::create_cursor(self, n);
175+
} else if cursor.position < n {
176+
Self::skip(cursor, n - cursor.position);
177+
}
178+
}
179+
180+
/// Increments the cursor position by one and returns the value produced by
/// `next_code_point` for the cursor's pointer.
#[inline]
181+
fn advance(cursor: &mut StringCursor) -> u32 {
182+
cursor.position += 1;
183+
// SAFETY (assumed — TODO confirm): relies on the caller guaranteeing that a
// complete code point is available at `cursor.ptr`.
unsafe { next_code_point(&mut cursor.ptr) }
184+
}
185+
186+
/// Returns the value `next_code_point` yields at the cursor without modifying
/// the cursor: decoding runs on a local copy of the pointer.
#[inline]
187+
fn peek(cursor: &StringCursor) -> u32 {
188+
let mut ptr = cursor.ptr;
189+
unsafe { next_code_point(&mut ptr) }
190+
}
191+
192+
/// Adds `n` to the cursor position and calls `next_code_point` `n` times on
/// the cursor's pointer, discarding the decoded values.
#[inline]
193+
fn skip(cursor: &mut StringCursor, n: usize) {
194+
cursor.position += n;
195+
for _ in 0..n {
196+
unsafe { next_code_point(&mut cursor.ptr) };
197+
}
198+
}
199+
200+
/// Decrements the cursor position by one and returns the value produced by
/// `next_code_point_reverse` for the cursor's pointer.
///
/// NOTE(review): no guard against being called at position 0 is visible here —
/// presumably callers check first; confirm.
#[inline]
201+
fn back_advance(cursor: &mut StringCursor) -> u32 {
202+
cursor.position -= 1;
203+
unsafe { next_code_point_reverse(&mut cursor.ptr) }
204+
}
205+
206+
/// Returns the value `next_code_point_reverse` yields at the cursor without
/// modifying the cursor: decoding runs on a local copy of the pointer.
#[inline]
207+
fn back_peek(cursor: &StringCursor) -> u32 {
208+
let mut ptr = cursor.ptr;
209+
unsafe { next_code_point_reverse(&mut ptr) }
210+
}
211+
212+
/// Subtracts `n` from the cursor position and calls `next_code_point_reverse`
/// `n` times on the cursor's pointer, discarding the decoded values.
///
/// NOTE(review): assumes `n` does not exceed the current position — confirm.
#[inline]
213+
fn back_skip(cursor: &mut StringCursor, n: usize) {
214+
cursor.position -= n;
215+
for _ in 0..n {
216+
unsafe { next_code_point_reverse(&mut cursor.ptr) };
217+
}
218+
}
219+
}
220+
151221
/// Reads the next code point out of a byte iterator (assuming a
152222
/// UTF-8-like encoding).
153223
///

0 commit comments

Comments
 (0)