Skip to content

Commit 7f4582b

Browse files
committed
Make cformat wtf8-compatible
1 parent cace112 commit 7f4582b

File tree

6 files changed

+153
-34
lines changed

6 files changed

+153
-34
lines changed

Lib/test/test_codecs.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1698,8 +1698,6 @@ def test_decode_invalid(self):
16981698

16991699

17001700
class NameprepTest(unittest.TestCase):
1701-
# TODO: RUSTPYTHON
1702-
@unittest.expectedFailure
17031701
def test_nameprep(self):
17041702
from encodings.idna import nameprep
17051703
for pos, (orig, prepped) in enumerate(nameprep_tests):

common/src/cformat.rs

Lines changed: 39 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,13 @@ use std::{
1111
str::FromStr,
1212
};
1313

14+
use crate::wtf8::{CodePoint, Wtf8, Wtf8Buf};
15+
1416
#[derive(Debug, PartialEq)]
1517
pub enum CFormatErrorType {
1618
UnmatchedKeyParentheses,
1719
MissingModuloSign,
18-
UnsupportedFormatChar(char),
20+
UnsupportedFormatChar(CodePoint),
1921
IncompleteFormat,
2022
IntTooBig,
2123
// Unimplemented,
@@ -39,7 +41,9 @@ impl fmt::Display for CFormatError {
3941
UnsupportedFormatChar(c) => write!(
4042
f,
4143
"unsupported format character '{}' ({:#x}) at index {}",
42-
c, c as u32, self.index
44+
c,
45+
c.to_u32(),
46+
self.index
4347
),
4448
IntTooBig => write!(f, "width/precision too big"),
4549
_ => write!(f, "unexpected error parsing format string"),
@@ -160,7 +164,7 @@ pub trait FormatBuf:
160164
fn concat(self, other: Self) -> Self;
161165
}
162166

163-
pub trait FormatChar: Copy + Into<char> + From<u8> {
167+
pub trait FormatChar: Copy + Into<CodePoint> + From<u8> {
164168
fn to_char_lossy(self) -> char;
165169
fn eq_char(self, c: char) -> bool;
166170
}
@@ -188,6 +192,29 @@ impl FormatChar for char {
188192
}
189193
}
190194

195+
/// Lets an owned WTF-8 buffer be used as a `FormatBuf`, with `CodePoint`
/// as its character type.
impl FormatBuf for Wtf8Buf {
196+
type Char = CodePoint;
197+
/// Iterates over the buffer's code points.
fn chars(&self) -> impl Iterator<Item = Self::Char> {
198+
self.code_points()
199+
}
200+
/// Returns the length reported by the dereferenced buffer.
fn len(&self) -> usize {
201+
// NOTE(review): the double deref presumably reaches the borrowed `Wtf8`
// slice so that its `len` is called instead of this trait method; the
// unit (bytes vs. code points) is not visible here -- confirm.
(**self).len()
202+
}
203+
/// Appends `other` to `self` and returns the combined buffer.
fn concat(mut self, other: Self) -> Self {
204+
// `other` is passed as a one-element array so `Extend::extend` can consume it.
self.extend([other]);
205+
self
206+
}
207+
}
208+
209+
/// Lets a `CodePoint` be used as the character type of a format buffer.
impl FormatChar for CodePoint {
210+
/// Converts to a `char`, delegating to `CodePoint`'s own `to_char_lossy`
/// (documented to yield U+FFFD for surrogates).
fn to_char_lossy(self) -> char {
211+
// NOTE(review): this looks self-recursive but presumably resolves to the
// inherent by-value `CodePoint::to_char_lossy`, whose receiver this commit
// changes from `&self` to `self` -- confirm before altering either signature.
self.to_char_lossy()
212+
}
213+
/// Returns whether this code point compares equal to `c` under `==`.
fn eq_char(self, c: char) -> bool {
214+
self == c
215+
}
216+
}
217+
191218
impl FormatBuf for Vec<u8> {
192219
type Char = u8;
193220
fn chars(&self) -> impl Iterator<Item = Self::Char> {
@@ -801,6 +828,15 @@ impl FromStr for CFormatString {
801828
}
802829
}
803830

831+
pub type CFormatWtf8 = CFormatStrOrBytes<Wtf8Buf>;
832+
833+
impl CFormatWtf8 {
834+
/// Parses a WTF-8 format string into a `CFormatWtf8`.
///
/// The input is walked code point by code point; each one is paired with
/// its position in that sequence before being handed to `Self::parse`.
///
/// # Errors
///
/// Returns whatever `CFormatError` `Self::parse` produces.
pub fn parse_from_wtf8(s: &Wtf8) -> Result<Self, CFormatError> {
835+
// Enumerated and peekable: the parser receives `(index, code point)` pairs
// and can look ahead without consuming.
let mut iter = s.code_points().enumerate().peekable();
836+
Self::parse(&mut iter)
837+
}
838+
}
839+
804840
#[cfg(test)]
805841
mod tests {
806842
use super::*;

common/src/wtf8/mod.rs

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,12 @@ impl fmt::Debug for CodePoint {
7575
}
7676
}
7777

78+
/// Formats a `CodePoint` the way its lossy `char` conversion would be
/// formatted.
impl fmt::Display for CodePoint {
79+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80+
// Goes through `to_char_lossy`, so a surrogate code point is rendered as
// the replacement character U+FFFD rather than its own value.
self.to_char_lossy().fmt(f)
81+
}
82+
}
83+
7884
impl CodePoint {
7985
/// Unsafely creates a new `CodePoint` without checking the value.
8086
///
@@ -109,13 +115,13 @@ impl CodePoint {
109115

110116
/// Returns the numeric value of the code point.
111117
#[inline]
112-
pub fn to_u32(&self) -> u32 {
118+
pub fn to_u32(self) -> u32 {
113119
self.value
114120
}
115121

116122
/// Returns the numeric value of the code point if it is a leading surrogate.
117123
#[inline]
118-
pub fn to_lead_surrogate(&self) -> Option<u16> {
124+
pub fn to_lead_surrogate(self) -> Option<u16> {
119125
match self.value {
120126
lead @ 0xD800..=0xDBFF => Some(lead as u16),
121127
_ => None,
@@ -124,7 +130,7 @@ impl CodePoint {
124130

125131
/// Returns the numeric value of the code point if it is a trailing surrogate.
126132
#[inline]
127-
pub fn to_trail_surrogate(&self) -> Option<u16> {
133+
pub fn to_trail_surrogate(self) -> Option<u16> {
128134
match self.value {
129135
trail @ 0xDC00..=0xDFFF => Some(trail as u16),
130136
_ => None,
@@ -135,7 +141,7 @@ impl CodePoint {
135141
///
136142
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
137143
#[inline]
138-
pub fn to_char(&self) -> Option<char> {
144+
pub fn to_char(self) -> Option<char> {
139145
match self.value {
140146
0xD800..=0xDFFF => None,
141147
_ => Some(unsafe { char::from_u32_unchecked(self.value) }),
@@ -147,7 +153,7 @@ impl CodePoint {
147153
/// Returns `'\u{FFFD}'` (the replacement character “�”)
148154
/// if the code point is a surrogate (from U+D800 to U+DFFF).
149155
#[inline]
150-
pub fn to_char_lossy(&self) -> char {
156+
pub fn to_char_lossy(self) -> char {
151157
self.to_char().unwrap_or('\u{FFFD}')
152158
}
153159

@@ -170,6 +176,12 @@ impl From<u16> for CodePoint {
170176
}
171177
}
172178

179+
/// Builds a `CodePoint` from a single byte.
impl From<u8> for CodePoint {
180+
fn from(value: u8) -> Self {
181+
// Two-step conversion: byte -> `char` (`char::from(u8)`), then
// `char` -> `CodePoint` through the existing `From<char>` impl.
char::from(value).into()
182+
}
183+
}
184+
173185
impl From<char> for CodePoint {
174186
fn from(value: char) -> Self {
175187
Self::from_char(value)
@@ -515,6 +527,13 @@ impl Extend<CodePoint> for Wtf8Buf {
515527
}
516528
}
517529

530+
/// Extends a `Wtf8Buf` with any sequence of items that can be borrowed as
/// `Wtf8`.
impl<W: AsRef<Wtf8>> Extend<W> for Wtf8Buf {
531+
fn extend<T: IntoIterator<Item = W>>(&mut self, iter: T) {
532+
// Each item is borrowed as `&Wtf8` and appended with `push_wtf8`,
// in iteration order.
iter.into_iter()
533+
.for_each(move |w| self.push_wtf8(w.as_ref()));
534+
}
535+
}
536+
518537
impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
519538
fn from_iter<T: IntoIterator<Item = W>>(iter: T) -> Self {
520539
let mut buf = Wtf8Buf::new();
@@ -541,6 +560,12 @@ impl From<&str> for Wtf8Buf {
541560
}
542561
}
543562

563+
/// Converts an owned `ascii::AsciiString` into a `Wtf8Buf`.
impl From<ascii::AsciiString> for Wtf8Buf {
564+
fn from(s: ascii::AsciiString) -> Self {
565+
// NOTE(review): `s.into()` presumably yields a `String` (the target type is
// inferred from `Wtf8Buf::from_string`'s parameter) -- confirm.
Wtf8Buf::from_string(s.into())
566+
}
567+
}
568+
544569
/// A borrowed slice of well-formed WTF-8 data.
545570
///
546571
/// Similar to `&str`, but can additionally contain surrogate code points

vm/src/builtins/str.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -857,8 +857,8 @@ impl PyStr {
857857
}
858858

859859
#[pymethod(name = "__mod__")]
860-
fn modulo(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<String> {
861-
cformat_string(vm, self.as_str(), values)
860+
fn modulo(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
861+
cformat_string(vm, self.as_wtf8(), values)
862862
}
863863

864864
#[pymethod(magic)]

vm/src/cformat.rs

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
//! as per the [Python Docs](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting).
33
44
use crate::common::cformat::*;
5+
use crate::common::wtf8::{CodePoint, Wtf8, Wtf8Buf};
56
use crate::{
67
AsObject, PyObjectRef, PyResult, TryFromBorrowedObject, TryFromObject, VirtualMachine,
78
builtins::{
@@ -125,13 +126,13 @@ fn spec_format_string(
125126
spec: &CFormatSpec,
126127
obj: PyObjectRef,
127128
idx: usize,
128-
) -> PyResult<String> {
129+
) -> PyResult<Wtf8Buf> {
129130
match &spec.format_type {
130131
CFormatType::String(conversion) => {
131132
let result = match conversion {
132133
CFormatConversion::Ascii => builtins::ascii(obj, vm)?.into(),
133-
CFormatConversion::Str => obj.str(vm)?.as_str().to_owned(),
134-
CFormatConversion::Repr => obj.repr(vm)?.as_str().to_owned(),
134+
CFormatConversion::Str => obj.str(vm)?.as_wtf8().to_owned(),
135+
CFormatConversion::Repr => obj.repr(vm)?.as_wtf8().to_owned(),
135136
CFormatConversion::Bytes => {
136137
// idx is the position of the %, we want the position of the b
137138
return Err(vm.new_value_error(format!(
@@ -146,16 +147,18 @@ fn spec_format_string(
146147
CNumberType::DecimalD | CNumberType::DecimalI | CNumberType::DecimalU => {
147148
match_class!(match &obj {
148149
ref i @ PyInt => {
149-
Ok(spec.format_number(i.as_bigint()))
150+
Ok(spec.format_number(i.as_bigint()).into())
150151
}
151152
ref f @ PyFloat => {
152-
Ok(spec.format_number(&try_f64_to_bigint(f.to_f64(), vm)?))
153+
Ok(spec
154+
.format_number(&try_f64_to_bigint(f.to_f64(), vm)?)
155+
.into())
153156
}
154157
obj => {
155158
if let Some(method) = vm.get_method(obj.clone(), identifier!(vm, __int__)) {
156159
let result = method?.call((), vm)?;
157160
if let Some(i) = result.payload::<PyInt>() {
158-
return Ok(spec.format_number(i.as_bigint()));
161+
return Ok(spec.format_number(i.as_bigint()).into());
159162
}
160163
}
161164
Err(vm.new_type_error(format!(
@@ -168,7 +171,7 @@ fn spec_format_string(
168171
}
169172
_ => {
170173
if let Some(i) = obj.payload::<PyInt>() {
171-
Ok(spec.format_number(i.as_bigint()))
174+
Ok(spec.format_number(i.as_bigint()).into())
172175
} else {
173176
Err(vm.new_type_error(format!(
174177
"%{} format: an integer is required, not {}",
@@ -180,21 +183,21 @@ fn spec_format_string(
180183
},
181184
CFormatType::Float(_) => {
182185
let value = ArgIntoFloat::try_from_object(vm, obj)?;
183-
Ok(spec.format_float(value.into()))
186+
Ok(spec.format_float(value.into()).into())
184187
}
185188
CFormatType::Character(CCharacterType::Character) => {
186189
if let Some(i) = obj.payload::<PyInt>() {
187190
let ch = i
188191
.as_bigint()
189192
.to_u32()
190-
.and_then(char::from_u32)
193+
.and_then(CodePoint::from_u32)
191194
.ok_or_else(|| {
192195
vm.new_overflow_error("%c arg not in range(0x110000)".to_owned())
193196
})?;
194197
return Ok(spec.format_char(ch));
195198
}
196199
if let Some(s) = obj.payload::<PyStr>() {
197-
if let Ok(ch) = s.as_str().chars().exactly_one() {
200+
if let Ok(ch) = s.as_wtf8().code_points().exactly_one() {
198201
return Ok(spec.format_char(ch));
199202
}
200203
}
@@ -374,17 +377,16 @@ pub(crate) fn cformat_bytes(
374377

375378
pub(crate) fn cformat_string(
376379
vm: &VirtualMachine,
377-
format_string: &str,
380+
format_string: &Wtf8,
378381
values_obj: PyObjectRef,
379-
) -> PyResult<String> {
380-
let format = format_string
381-
.parse::<CFormatString>()
382+
) -> PyResult<Wtf8Buf> {
383+
let format = CFormatWtf8::parse_from_wtf8(format_string)
382384
.map_err(|err| vm.new_value_error(err.to_string()))?;
383385
let (num_specifiers, mapping_required) = format
384386
.check_specifiers()
385387
.ok_or_else(|| specifier_error(vm))?;
386388

387-
let mut result = String::new();
389+
let mut result = Wtf8Buf::new();
388390

389391
let is_mapping = values_obj.class().has_attr(identifier!(vm, __getitem__))
390392
&& !values_obj.fast_isinstance(vm.ctx.types.tuple_type)
@@ -399,7 +401,7 @@ pub(crate) fn cformat_string(
399401
{
400402
for (_, part) in format.iter() {
401403
match part {
402-
CFormatPart::Literal(literal) => result.push_str(literal),
404+
CFormatPart::Literal(literal) => result.push_wtf8(literal),
403405
CFormatPart::Spec(_) => unreachable!(),
404406
}
405407
}
@@ -415,11 +417,11 @@ pub(crate) fn cformat_string(
415417
return if is_mapping {
416418
for (idx, part) in format {
417419
match part {
418-
CFormatPart::Literal(literal) => result.push_str(&literal),
420+
CFormatPart::Literal(literal) => result.push_wtf8(&literal),
419421
CFormatPart::Spec(CFormatSpecKeyed { mapping_key, spec }) => {
420422
let value = values_obj.get_item(&mapping_key.unwrap(), vm)?;
421423
let part_result = spec_format_string(vm, &spec, value, idx)?;
422-
result.push_str(&part_result);
424+
result.push_wtf8(&part_result);
423425
}
424426
}
425427
}
@@ -439,7 +441,7 @@ pub(crate) fn cformat_string(
439441

440442
for (idx, part) in format {
441443
match part {
442-
CFormatPart::Literal(literal) => result.push_str(&literal),
444+
CFormatPart::Literal(literal) => result.push_wtf8(&literal),
443445
CFormatPart::Spec(CFormatSpecKeyed { mut spec, .. }) => {
444446
try_update_quantity_from_tuple(
445447
vm,
@@ -456,7 +458,7 @@ pub(crate) fn cformat_string(
456458
}
457459
}?;
458460
let part_result = spec_format_string(vm, &spec, value, idx)?;
459-
result.push_str(&part_result);
461+
result.push_wtf8(&part_result);
460462
}
461463
}
462464
}

0 commit comments

Comments
 (0)