From ad357d08af064ff6611760467a9450321f50c6fe Mon Sep 17 00:00:00 2001 From: jgirardet Date: Tue, 7 May 2019 01:42:16 +0900 Subject: [PATCH 1/3] normalize_encoding --- vm/src/obj/objbyteinner.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index e420fa038b..ee7ad3d3a8 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -74,6 +74,25 @@ pub struct ByteInnerNewOptions { encoding: OptionalArg, } +//same algorithm as cpython +pub fn normalize_encoding(encoding: &str) -> String { + let mut res = String::new(); + let mut punct = false; + + for c in encoding.chars() { + if c.is_alphanumeric() || c == '.' { + if punct && !res.is_empty() { + res.push('_') + } + res.push(c.to_ascii_lowercase()); + punct = false; + } else { + punct = true; + } + } + res +} + impl ByteInnerNewOptions { pub fn get_value(self, vm: &VirtualMachine) -> PyResult { // First handle bytes(string, encoding[, errors]) From 7f2560c9e1b89d63559870048b6469711400a336 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Wed, 1 May 2019 13:48:36 +0900 Subject: [PATCH 2/3] Add str.encode for utf-8 --- tests/snippets/strings.py | 10 ++++++++++ vm/src/function.rs | 11 +++++++++++ vm/src/obj/objbyteinner.rs | 26 +++++++++++++++----------- vm/src/obj/objstr.rs | 26 ++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/tests/snippets/strings.py b/tests/snippets/strings.py index 786622e9f5..aaeaed5f3e 100644 --- a/tests/snippets/strings.py +++ b/tests/snippets/strings.py @@ -206,3 +206,13 @@ def try_mutate_str(): word[0] = 'x' assert_raises(TypeError, try_mutate_str) + +ss = ['Hello', '안녕', '👋'] +bs = [b'Hello', b'\xec\x95\x88\xeb\x85\x95', b'\xf0\x9f\x91\x8b'] + +for s, b in zip(ss, bs): + assert s.encode() == b + +for s, b, e in zip(ss, bs, ['u8', 'U8', 'utf-8', 'UTF-8', 'utf_8']): + assert s.encode(e) == b + # assert s.encode(encoding=e) == b diff --git a/vm/src/function.rs b/vm/src/function.rs index 47fc051d59..22802ac1fa 100644 --- a/vm/src/function.rs +++ b/vm/src/function.rs @@ -374,6 +374,17 @@ impl OptionalArg { Missing => f(), } } + + pub fn map_or_else(self, default: D, f: F) -> U + where + D: FnOnce() -> U, + F: FnOnce(T) -> U, + { + match self { + Present(value) => f(value), + Missing => default(), + } + } } impl FromArgs for OptionalArg diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index ee7ad3d3a8..85bedd3017 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -93,6 +93,18 @@ pub fn normalize_encoding(encoding: &str) -> String { res } +pub fn encode_to_vec(value: &str, encoding: &str, vm: &VirtualMachine) -> PyResult> { + let encoding = normalize_encoding(encoding); + if encoding == "utf_8" || encoding == "u8" { + Ok(value.as_bytes().to_vec()) + } else { + // TODO: different encoding + return Err( + vm.new_value_error(format!("unknown encoding: {}", encoding)), //should be lookup error + ); + } +} + impl ByteInnerNewOptions { pub fn get_value(self, vm: &VirtualMachine) -> PyResult { // First handle bytes(string, encoding[, errors]) @@ -100,17 +112,9 @@ impl ByteInnerNewOptions { if let OptionalArg::Present(eval) = self.val_option { if let Ok(input) = eval.downcast::() { let encoding = enc.as_str(); - if encoding.to_lowercase() == "utf8" || encoding.to_lowercase() == "utf-8" - // TODO: different encoding - { - return Ok(PyByteInner { - elements: input.value.as_bytes().to_vec(), - }); - } else { - return Err( - vm.new_value_error(format!("unknown encoding: {}", encoding)), //should be lookup error - ); - } + return Ok(PyByteInner { + elements: encode_to_vec(&input.value, &encoding, vm)?, + }); } else { return Err(vm.new_type_error("encoding without a string argument".to_string())); } diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 6545c99858..827b109f03 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -19,6 +19,7 @@ use crate::pyobject::{ }; use crate::vm::VirtualMachine; +use super::objbyteinner; use super::objdict::PyDict; use super::objint::{self, PyInt}; use super::objnone::PyNone; @@ -957,6 +958,31 @@ impl PyString { } } } + + #[pymethod] + fn encode( + &self, + encoding: OptionalArg, + _errors: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + let encoding = encoding.map_or_else( + || Ok("utf-8".to_string()), + |v| { + if objtype::isinstance(&v, &vm.ctx.str_type()) { + Ok(get_value(&v)) + } else { + Err(vm.new_type_error(format!( + "encode() argument 1 must be str, not {}", + v.class().name + ))) + } + }, + )?; + + let encoded = objbyteinner::encode_to_vec(&self.value, &encoding, vm)?; + Ok(vm.ctx.new_bytes(encoded)) + } } impl PyValue for PyString { From 59476c65bb19a151dd0e5e795e8119f5aadeef64 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Tue, 7 May 2019 02:05:46 +0900 Subject: [PATCH 3/3] PyBytes::from_string --- vm/src/obj/objbyteinner.rs | 32 ++++++++++++++++---------------- vm/src/obj/objbytes.rs | 7 +++++++ vm/src/obj/objstr.rs | 6 +++--- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/vm/src/obj/objbyteinner.rs b/vm/src/obj/objbyteinner.rs index 85bedd3017..32a58255f8 100644 --- a/vm/src/obj/objbyteinner.rs +++ b/vm/src/obj/objbyteinner.rs @@ -93,28 +93,14 @@ pub fn normalize_encoding(encoding: &str) -> String { res } -pub fn encode_to_vec(value: &str, encoding: &str, vm: &VirtualMachine) -> PyResult> { - let encoding = normalize_encoding(encoding); - if encoding == "utf_8" || encoding == "u8" { - Ok(value.as_bytes().to_vec()) - } else { - // TODO: different encoding - return Err( - vm.new_value_error(format!("unknown encoding: {}", encoding)), //should be lookup error - ); - } -} - impl ByteInnerNewOptions { pub fn get_value(self, vm: &VirtualMachine) -> PyResult { // First handle bytes(string, encoding[, errors]) if let OptionalArg::Present(enc) = self.encoding { if let OptionalArg::Present(eval) = self.val_option { if let Ok(input) = eval.downcast::() { - let encoding = enc.as_str(); - return Ok(PyByteInner { - elements: encode_to_vec(&input.value, &encoding, vm)?, - }); + let inner = PyByteInner::from_string(&input.value, enc.as_str(), vm)?; + return Ok(inner); } else { return Err(vm.new_type_error("encoding without a string argument".to_string())); } @@ -334,6 +320,20 @@ impl ByteInnerSplitlinesOptions { } impl PyByteInner { + pub fn from_string(value: &str, encoding: &str, vm: &VirtualMachine) -> PyResult { + let normalized = normalize_encoding(encoding); + if normalized == "utf_8" || normalized == "utf8" || normalized == "u8" { + Ok(PyByteInner { + elements: value.as_bytes().to_vec(), + }) + } else { + // TODO: different encoding + Err( + vm.new_value_error(format!("unknown encoding: {}", encoding)), // should be lookup error + ) + } + } + pub fn repr(&self) -> PyResult { let mut res = String::with_capacity(self.elements.len()); for i in self.elements.iter() { diff --git a/vm/src/obj/objbytes.rs b/vm/src/obj/objbytes.rs index dd2e1518c0..33016e511a 100644 --- a/vm/src/obj/objbytes.rs +++ b/vm/src/obj/objbytes.rs @@ -45,6 +45,13 @@ impl PyBytes { inner: PyByteInner { elements }, } } + + pub fn from_string(value: &str, encoding: &str, vm: &VirtualMachine) -> PyResult { + Ok(PyBytes { + inner: PyByteInner::from_string(value, encoding, vm)?, + }) + } + pub fn get_value(&self) -> &[u8] { &self.inner.elements } diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 827b109f03..b0d19e9c3f 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -19,7 +19,7 @@ use crate::pyobject::{ }; use crate::vm::VirtualMachine; -use super::objbyteinner; +use super::objbytes::PyBytes; use super::objdict::PyDict; use super::objint::{self, PyInt}; use super::objnone::PyNone; @@ -980,8 +980,8 @@ impl PyString { }, )?; - let encoded = objbyteinner::encode_to_vec(&self.value, &encoding, vm)?; - Ok(vm.ctx.new_bytes(encoded)) + let encoded = PyBytes::from_string(&self.value, &encoding, vm)?; + Ok(encoded.into_pyobject(vm)?) } }