Skip to content

Commit 121cd43

Browse files
Merge pull request #901 from youknowone/str-encode
Add str.encode for utf-8
2 parents 5bd36ad + 59476c6 commit 121cd43

File tree

5 files changed

+89
-12
lines changed

5 files changed

+89
-12
lines changed

tests/snippets/strings.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,13 @@ def try_mutate_str():
206206
word[0] = 'x'
207207

208208
assert_raises(TypeError, try_mutate_str)
209+
210+
# Sample strings (ASCII, multi-byte Hangul, a 4-byte emoji) paired with the
# UTF-8 byte sequences they are expected to encode to.
ss = ['Hello', '안녕', '👋']
bs = [b'Hello', b'\xec\x95\x88\xeb\x85\x95', b'\xf0\x9f\x91\x8b']

# With no argument, encode() must default to UTF-8.
for s, b in zip(ss, bs):
    assert s.encode() == b

# Check every alias against every sample. Zipping the alias list together
# with the samples would stop at the shortest input, so the trailing aliases
# would never be exercised.
for e in ['u8', 'U8', 'utf-8', 'UTF-8', 'utf_8']:
    for s, b in zip(ss, bs):
        assert s.encode(e) == b
        # assert s.encode(encoding=e) == b

vm/src/function.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,17 @@ impl<T> OptionalArg<T> {
374374
Missing => f(),
375375
}
376376
}
377+
378+
pub fn map_or_else<U, D, F>(self, default: D, f: F) -> U
379+
where
380+
D: FnOnce() -> U,
381+
F: FnOnce(T) -> U,
382+
{
383+
match self {
384+
Present(value) => f(value),
385+
Missing => default(),
386+
}
387+
}
377388
}
378389

379390
impl<T> FromArgs for OptionalArg<T>

vm/src/obj/objbyteinner.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -74,24 +74,33 @@ pub struct ByteInnerNewOptions {
7474
encoding: OptionalArg<PyStringRef>,
7575
}
7676

77+
/// Normalizes a codec name so that spelling variants compare equal.
///
/// Intended to follow CPython's `_Py_normalize_encoding`:
/// * ASCII letters and digits and `.` are kept, with letters lower-cased;
/// * any run of other characters between kept characters becomes one `_`;
/// * such runs at the very start or end of the name are dropped.
///
/// Only ASCII alphanumerics count as name characters. Non-ASCII letters and
/// digits are treated as separators, like any other punctuation.
pub fn normalize_encoding(encoding: &str) -> String {
    // The result is never longer than the input, so one allocation suffices.
    let mut res = String::with_capacity(encoding.len());
    // True while a separator run is pending since the last kept character.
    let mut punct = false;

    for c in encoding.chars() {
        if c.is_ascii_alphanumeric() || c == '.' {
            // Emit the pending separator only between kept characters, so
            // leading separators never produce a leading underscore.
            if punct && !res.is_empty() {
                res.push('_');
            }
            res.push(c.to_ascii_lowercase());
            punct = false;
        } else {
            punct = true;
        }
    }
    res
}
95+
7796
impl ByteInnerNewOptions {
7897
pub fn get_value(self, vm: &VirtualMachine) -> PyResult<PyByteInner> {
7998
// First handle bytes(string, encoding[, errors])
8099
if let OptionalArg::Present(enc) = self.encoding {
81100
if let OptionalArg::Present(eval) = self.val_option {
82101
if let Ok(input) = eval.downcast::<PyString>() {
83-
let encoding = enc.as_str();
84-
if encoding.to_lowercase() == "utf8" || encoding.to_lowercase() == "utf-8"
85-
// TODO: different encoding
86-
{
87-
return Ok(PyByteInner {
88-
elements: input.value.as_bytes().to_vec(),
89-
});
90-
} else {
91-
return Err(
92-
vm.new_value_error(format!("unknown encoding: {}", encoding)), //should be lookup error
93-
);
94-
}
102+
let inner = PyByteInner::from_string(&input.value, enc.as_str(), vm)?;
103+
return Ok(inner);
95104
} else {
96105
return Err(vm.new_type_error("encoding without a string argument".to_string()));
97106
}
@@ -311,6 +320,20 @@ impl ByteInnerSplitlinesOptions {
311320
}
312321

313322
impl PyByteInner {
323+
pub fn from_string(value: &str, encoding: &str, vm: &VirtualMachine) -> PyResult<Self> {
324+
let normalized = normalize_encoding(encoding);
325+
if normalized == "utf_8" || normalized == "utf8" || normalized == "u8" {
326+
Ok(PyByteInner {
327+
elements: value.as_bytes().to_vec(),
328+
})
329+
} else {
330+
// TODO: different encoding
331+
Err(
332+
vm.new_value_error(format!("unknown encoding: {}", encoding)), // should be lookup error
333+
)
334+
}
335+
}
336+
314337
pub fn repr(&self) -> PyResult<String> {
315338
let mut res = String::with_capacity(self.elements.len());
316339
for i in self.elements.iter() {

vm/src/obj/objbytes.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ impl PyBytes {
4545
inner: PyByteInner { elements },
4646
}
4747
}
48+
49+
pub fn from_string(value: &str, encoding: &str, vm: &VirtualMachine) -> PyResult<Self> {
50+
Ok(PyBytes {
51+
inner: PyByteInner::from_string(value, encoding, vm)?,
52+
})
53+
}
54+
4855
pub fn get_value(&self) -> &[u8] {
4956
&self.inner.elements
5057
}

vm/src/obj/objstr.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use crate::pyobject::{
1919
};
2020
use crate::vm::VirtualMachine;
2121

22+
use super::objbytes::PyBytes;
2223
use super::objdict::PyDict;
2324
use super::objint::{self, PyInt};
2425
use super::objnone::PyNone;
@@ -957,6 +958,31 @@ impl PyString {
957958
}
958959
}
959960
}
961+
962+
#[pymethod]
963+
fn encode(
964+
&self,
965+
encoding: OptionalArg<PyObjectRef>,
966+
_errors: OptionalArg<PyObjectRef>,
967+
vm: &VirtualMachine,
968+
) -> PyResult {
969+
let encoding = encoding.map_or_else(
970+
|| Ok("utf-8".to_string()),
971+
|v| {
972+
if objtype::isinstance(&v, &vm.ctx.str_type()) {
973+
Ok(get_value(&v))
974+
} else {
975+
Err(vm.new_type_error(format!(
976+
"encode() argument 1 must be str, not {}",
977+
v.class().name
978+
)))
979+
}
980+
},
981+
)?;
982+
983+
let encoded = PyBytes::from_string(&self.value, &encoding, vm)?;
984+
Ok(encoded.into_pyobject(vm)?)
985+
}
960986
}
961987

962988
impl PyValue for PyString {

0 commit comments

Comments
 (0)