diff --git a/Cargo.lock b/Cargo.lock index a71ceb53b3..28fe718260 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1270,9 +1270,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "lexical-parse-float" -version = "0.8.5" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -1281,9 +1281,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -1291,9 +1291,9 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ "static_assertions", ] diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py index 19c17af596..30d27072fb 100644 --- a/Lib/test/test_float.py +++ b/Lib/test/test_float.py @@ -35,8 +35,6 @@ class OtherFloatSubclass(float): class GeneralFloatCases(unittest.TestCase): - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_float(self): self.assertEqual(float(3.14), 3.14) self.assertEqual(float(314), 314.0) diff --git a/common/src/str.rs b/common/src/str.rs index e72f2efb95..ca1723e7ef 100644 --- a/common/src/str.rs +++ b/common/src/str.rs @@ -609,6 +609,49 @@ macro_rules! 
ascii { } pub use ascii; +// TODO: this should probably live in a crate like unic or unicode-properties +const UNICODE_DECIMAL_VALUES: &[char] = &[ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '٠', '١', '٢', '٣', '٤', '٥', '٦', '٧', '٨', + '٩', '۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹', '߀', '߁', '߂', '߃', '߄', '߅', '߆', '߇', + '߈', '߉', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '০', '১', '২', '৩', '৪', '৫', '৬', + '৭', '৮', '৯', '੦', '੧', '੨', '੩', '੪', '੫', '੬', '੭', '੮', '੯', '૦', '૧', '૨', '૩', '૪', '૫', + '૬', '૭', '૮', '૯', '୦', '୧', '୨', '୩', '୪', '୫', '୬', '୭', '୮', '୯', '௦', '௧', '௨', '௩', '௪', + '௫', '௬', '௭', '௮', '௯', '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯', '೦', '೧', '೨', '೩', + '೪', '೫', '೬', '೭', '೮', '೯', '൦', '൧', '൨', '൩', '൪', '൫', '൬', '൭', '൮', '൯', '෦', '෧', '෨', + '෩', '෪', '෫', '෬', '෭', '෮', '෯', '๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', '໐', '໑', + '໒', '໓', '໔', '໕', '໖', '໗', '໘', '໙', '༠', '༡', '༢', '༣', '༤', '༥', '༦', '༧', '༨', '༩', '၀', + '၁', '၂', '၃', '၄', '၅', '၆', '၇', '၈', '၉', '႐', '႑', '႒', '႓', '႔', '႕', '႖', '႗', '႘', '႙', + '០', '១', '២', '៣', '៤', '៥', '៦', '៧', '៨', '៩', '᠐', '᠑', '᠒', '᠓', '᠔', '᠕', '᠖', '᠗', '᠘', + '᠙', '᥆', '᥇', '᥈', '᥉', '᥊', '᥋', '᥌', '᥍', '᥎', '᥏', '᧐', '᧑', '᧒', '᧓', '᧔', '᧕', '᧖', '᧗', + '᧘', '᧙', '᪀', '᪁', '᪂', '᪃', '᪄', '᪅', '᪆', '᪇', '᪈', '᪉', '᪐', '᪑', '᪒', '᪓', '᪔', '᪕', '᪖', + '᪗', '᪘', '᪙', '᭐', '᭑', '᭒', '᭓', '᭔', '᭕', '᭖', '᭗', '᭘', '᭙', '᮰', '᮱', '᮲', '᮳', '᮴', '᮵', + '᮶', '᮷', '᮸', '᮹', '᱀', '᱁', '᱂', '᱃', '᱄', '᱅', '᱆', '᱇', '᱈', '᱉', '᱐', '᱑', '᱒', '᱓', '᱔', + '᱕', '᱖', '᱗', '᱘', '᱙', '꘠', '꘡', '꘢', '꘣', '꘤', '꘥', '꘦', '꘧', '꘨', '꘩', '꣐', '꣑', '꣒', '꣓', + '꣔', '꣕', '꣖', '꣗', '꣘', '꣙', '꤀', '꤁', '꤂', '꤃', '꤄', '꤅', '꤆', '꤇', '꤈', '꤉', '꧐', '꧑', '꧒', + '꧓', '꧔', '꧕', '꧖', '꧗', '꧘', '꧙', '꧰', '꧱', '꧲', '꧳', '꧴', '꧵', '꧶', '꧷', '꧸', '꧹', '꩐', '꩑', + '꩒', '꩓', '꩔', '꩕', '꩖', '꩗', '꩘', '꩙', '꯰', '꯱', '꯲', '꯳', '꯴', '꯵', '꯶', '꯷', '꯸', '꯹', '0', + '1', 
'2', '3', '4', '5', '6', '7', '8', '9', '𐒠', '𐒡', '𐒢', '𐒣', '𐒤', '𐒥', '𐒦', '𐒧', + '𐒨', '𐒩', '𑁦', '𑁧', '𑁨', '𑁩', '𑁪', '𑁫', '𑁬', '𑁭', '𑁮', '𑁯', '𑃰', '𑃱', '𑃲', '𑃳', '𑃴', '𑃵', '𑃶', + '𑃷', '𑃸', '𑃹', '𑄶', '𑄷', '𑄸', '𑄹', '𑄺', '𑄻', '𑄼', '𑄽', '𑄾', '𑄿', '𑇐', '𑇑', '𑇒', '𑇓', '𑇔', '𑇕', + '𑇖', '𑇗', '𑇘', '𑇙', '𑋰', '𑋱', '𑋲', '𑋳', '𑋴', '𑋵', '𑋶', '𑋷', '𑋸', '𑋹', '𑑐', '𑑑', '𑑒', '𑑓', '𑑔', + '𑑕', '𑑖', '𑑗', '𑑘', '𑑙', '𑓐', '𑓑', '𑓒', '𑓓', '𑓔', '𑓕', '𑓖', '𑓗', '𑓘', '𑓙', '𑙐', '𑙑', '𑙒', '𑙓', + '𑙔', '𑙕', '𑙖', '𑙗', '𑙘', '𑙙', '𑛀', '𑛁', '𑛂', '𑛃', '𑛄', '𑛅', '𑛆', '𑛇', '𑛈', '𑛉', '𑜰', '𑜱', '𑜲', + '𑜳', '𑜴', '𑜵', '𑜶', '𑜷', '𑜸', '𑜹', '𑣠', '𑣡', '𑣢', '𑣣', '𑣤', '𑣥', '𑣦', '𑣧', '𑣨', '𑣩', '𑱐', '𑱑', + '𑱒', '𑱓', '𑱔', '𑱕', '𑱖', '𑱗', '𑱘', '𑱙', '𑵐', '𑵑', '𑵒', '𑵓', '𑵔', '𑵕', '𑵖', '𑵗', '𑵘', '𑵙', '𖩠', + '𖩡', '𖩢', '𖩣', '𖩤', '𖩥', '𖩦', '𖩧', '𖩨', '𖩩', '𖭐', '𖭑', '𖭒', '𖭓', '𖭔', '𖭕', '𖭖', '𖭗', '𖭘', '𖭙', + '𝟎', '𝟏', '𝟐', '𝟑', '𝟒', '𝟓', '𝟔', '𝟕', '𝟖', '𝟗', '𝟘', '𝟙', '𝟚', '𝟛', '𝟜', '𝟝', '𝟞', '𝟟', '𝟠', + '𝟡', '𝟢', '𝟣', '𝟤', '𝟥', '𝟦', '𝟧', '𝟨', '𝟩', '𝟪', '𝟫', '𝟬', '𝟭', '𝟮', '𝟯', '𝟰', '𝟱', '𝟲', '𝟳', + '𝟴', '𝟵', '𝟶', '𝟷', '𝟸', '𝟹', '𝟺', '𝟻', '𝟼', '𝟽', '𝟾', '𝟿', '𞥐', '𞥑', '𞥒', '𞥓', '𞥔', '𞥕', '𞥖', + '𞥗', '𞥘', '𞥙', +]; + +pub fn char_to_decimal(ch: char) -> Option<u8> { + UNICODE_DECIMAL_VALUES + .binary_search(&ch) + .ok() + .map(|i| (i % 10) as u8) +} + #[cfg(test)] mod tests { use super::*; diff --git a/compiler/codegen/src/unparse.rs b/compiler/codegen/src/unparse.rs index 458ff76fc7..1ecf1f9334 100644 --- a/compiler/codegen/src/unparse.rs +++ b/compiler/codegen/src/unparse.rs @@ -366,7 +366,7 @@ impl<'a, 'b, 'c> Unparser<'a, 'b, 'c> { } } &ruff::Number::Complex { real, imag } => self - .p(&rustpython_literal::float::complex_to_string(real, imag) + .p(&rustpython_literal::complex::to_string(real, imag) .replace("inf", inf_str))?, } } diff --git a/compiler/literal/Cargo.toml b/compiler/literal/Cargo.toml index b4fa6229f1..da55d107b3 100644 --- a/compiler/literal/Cargo.toml +++ b/compiler/literal/Cargo.toml @@ -13,7 +13,7 @@ 
rustpython-wtf8 = { workspace = true }
 
 hexf-parse = "0.2.1"
 is-macro.workspace = true
-lexical-parse-float = { version = "0.8.0", features = ["format"] }
+lexical-parse-float = { version = "1.0.4", features = ["format"] }
 num-traits = { workspace = true }
 unic-ucd-category = { workspace = true }
 
diff --git a/compiler/literal/src/complex.rs b/compiler/literal/src/complex.rs
new file mode 100644
index 0000000000..076f2807c9
--- /dev/null
+++ b/compiler/literal/src/complex.rs
@@ -0,0 +1,95 @@
+use crate::float;
+
+/// Convert a complex number to its Python `repr`-style string.
+///
+/// A real part of positive zero is omitted (`2j`); any other real part yields
+/// the parenthesised form (`(1+2j)`, `(-0-2j)`). Components without a
+/// fractional part are printed without a trailing `.0`; every other value is
+/// delegated to [`float::to_string`].
+pub fn to_string(re: f64, im: f64) -> String {
+    // integer => drop ., fractional => float_ops
+    let mut im_part = if im.fract() == 0.0 {
+        im.to_string()
+    } else {
+        float::to_string(im)
+    };
+    im_part.push('j');
+
+    // positive empty => return im_part, integer => drop ., fractional => float_ops
+    let re_part = if re == 0.0 {
+        if re.is_sign_positive() {
+            return im_part;
+        } else {
+            "-0".to_owned()
+        }
+    } else if re.fract() == 0.0 {
+        re.to_string()
+    } else {
+        float::to_string(re)
+    };
+
+    // A non-negative or NaN imaginary part needs an explicit `+` separator.
+    // Compute this once so the reserved capacity matches what gets pushed.
+    let needs_plus = im.is_sign_positive() || im.is_nan();
+    let mut result =
+        String::with_capacity(re_part.len() + im_part.len() + 2 + usize::from(needs_plus));
+    result.push('(');
+    result.push_str(&re_part);
+    if needs_plus {
+        result.push('+');
+    }
+    result.push_str(&im_part);
+    result.push(')');
+    result
+}
+
+/// Parse a complex number from a string.
+///
+/// The input may be wrapped in one pair of parentheses and may consist of a
+/// real part, an imaginary part suffixed with `j`/`J`, or both joined by a
+/// central `+`/`-`. Whitespace is ignored around the whole string and just
+/// inside the parentheses; whitespace anywhere else makes the input malformed,
+/// as it does for Python's `complex()`.
+///
+/// Returns `Some((re, im))` on success and `None` if the string is malformed.
+pub fn parse_str(s: &str) -> Option<(f64, f64)> {
+    let s = s.trim();
+    // Handle parentheses
+    let s = match s.strip_prefix('(') {
+        None => s,
+        Some(s) => s.strip_suffix(')')?.trim(),
+    };
+
+    // `float::parse_str` trims its argument, so without this check inputs such
+    // as "1 +2j" or "1+2 j" would slip through even though Python rejects them.
+    if s.contains(char::is_whitespace) {
+        return None;
+    }
+
+    let value = match s.strip_suffix(['j', 'J']) {
+        None => (float::parse_str(s)?, 0.0),
+        Some(mut s) => {
+            let mut real = 0.0;
+            // Find the central +/- operator. If it exists, parse the real part.
+            // A sign directly after `e`/`E` belongs to an exponent, so skip it.
+            for (i, w) in s.as_bytes().windows(2).enumerate() {
+                if matches!(w[1], b'+' | b'-') && !matches!(w[0], b'e' | b'E') {
+                    real = float::parse_str(&s[..=i])?;
+                    s = &s[i + 1..];
+                    break;
+                }
+            }
+
+            let imag = match s {
+                // "j", "+j"
+                "" | "+" => 1.0,
+                // "-j"
+                "-" => -1.0,
+                s => float::parse_str(s)?,
+            };
+
+            (real, imag)
+        }
+    };
+    Some(value)
+}
diff --git a/compiler/literal/src/float.rs b/compiler/literal/src/float.rs
index e05a105fd4..49771b8184 100644
--- a/compiler/literal/src/float.rs
+++ b/compiler/literal/src/float.rs
@@ -6,49 +6,8 @@ pub fn parse_str(literal: &str) -> Option<f64> {
     parse_inner(literal.trim().as_bytes())
 }
 
-fn strip_underlines(literal: &[u8]) -> Option<Vec<u8>> {
-    let mut prev = b'\0';
-    let mut dup = Vec::<u8>::new();
-    for p in literal {
-        if *p == b'_' {
-            // Underscores are only allowed after digits.
-            if !prev.is_ascii_digit() {
-                return None;
-            }
-        } else {
-            dup.push(*p);
-            // Underscores are only allowed before digits.
-            if prev == b'_' && !p.is_ascii_digit() {
-                return None;
-            }
-        }
-        prev = *p;
-    }
-
-    // Underscores are not allowed at the end.
-    if prev == b'_' {
-        return None;
-    }
-
-    Some(dup)
-}
-
 pub fn parse_bytes(literal: &[u8]) -> Option<f64> {
-    parse_inner(trim_slice(literal, |b| b.is_ascii_whitespace()))
-}
-
-fn trim_slice<T>(v: &[T], mut trim: impl FnMut(&T) -> bool) -> &[T] {
-    let mut it = v.iter();
-    // it.take_while_ref(&mut trim).for_each(drop);
-    // hmm..
`&mut slice::Iter<_>` is not `Clone` - // it.by_ref().rev().take_while_ref(&mut trim).for_each(drop); - while it.clone().next().is_some_and(&mut trim) { - it.next(); - } - while it.clone().next_back().is_some_and(&mut trim) { - it.next_back(); - } - it.as_slice() + parse_inner(literal.trim_ascii()) } fn parse_inner(literal: &[u8]) -> Option<f64> { @@ -56,15 +15,11 @@ fn parse_inner(literal: &[u8]) -> Option<f64> { FromLexicalWithOptions, NumberFormatBuilder, Options, format::PYTHON3_LITERAL, }; - // Use custom function for underline handling for now. - // For further information see https://github.com/Alexhuszagh/rust-lexical/issues/96. - let stripped = strip_underlines(literal)?; - // lexical-core's format::PYTHON_STRING is inaccurate const PYTHON_STRING: u128 = NumberFormatBuilder::rebuild(PYTHON3_LITERAL) .no_special(false) .build(); - f64::from_lexical_with_options::<PYTHON_STRING>(&stripped, &Options::new()).ok() + f64::from_lexical_with_options::<PYTHON_STRING>(literal, &Options::new()).ok() } pub fn is_integer(v: f64) -> bool { @@ -223,39 +178,6 @@ pub fn to_string(value: f64) -> String { } } -pub fn complex_to_string(re: f64, im: f64) -> String { - // integer => drop ., fractional => float_ops - let mut im_part = if im.fract() == 0.0 { - im.to_string() - } else { - to_string(im) - }; - im_part.push('j'); - - // positive empty => return im_part, integer => drop ., fractional => float_ops - let re_part = if re == 0.0 { - if re.is_sign_positive() { - return im_part; - } else { - re.to_string() - } - } else if re.fract() == 0.0 { - re.to_string() - } else { - to_string(re) - }; - let mut result = - String::with_capacity(re_part.len() + im_part.len() + 2 + im.is_sign_positive() as usize); - result.push('('); - result.push_str(&re_part); - if im.is_sign_positive() || im.is_nan() { - result.push('+'); - } - result.push_str(&im_part); - result.push(')'); - result -} - pub fn from_hex(s: &str) -> Option<f64> { if let Ok(f) = hexf_parse::parse_hexf64(s, false) { 
return Some(f); diff --git a/compiler/literal/src/lib.rs b/compiler/literal/src/lib.rs index 9b9620573d..2997107012 100644 --- a/compiler/literal/src/lib.rs +++ b/compiler/literal/src/lib.rs @@ -1,4 +1,5 @@ pub mod char; +pub mod complex; pub mod escape; pub mod float; pub mod format; diff --git a/vm/src/builtins/complex.rs b/vm/src/builtins/complex.rs index a3a6d4d681..d48707261c 100644 --- a/vm/src/builtins/complex.rs +++ b/vm/src/builtins/complex.rs @@ -179,13 +179,13 @@ impl Constructor for PyComplex { "complex() can't take second arg if first is a string".to_owned(), )); } - let value = s + let (re, im) = s .to_str() - .and_then(|s| parse_str(s.trim())) + .and_then(rustpython_literal::complex::parse_str) .ok_or_else(|| { vm.new_value_error("complex() arg is a malformed string".to_owned()) })?; - return Self::from(value) + return Self::from(Complex64 { re, im }) .into_ref_with_type(vm, cls) .map(Into::into); } else { @@ -494,7 +494,7 @@ impl Representable for PyComplex { // TODO: when you fix this, move it to rustpython_common::complex::repr and update // ast/src/unparse.rs + impl Display for Constant in ast/src/constant.rs let Complex64 { re, im } = zelf.value; - Ok(rustpython_literal::float::complex_to_string(re, im)) + Ok(rustpython_literal::complex::to_string(re, im)) } } @@ -519,40 +519,3 @@ pub struct ComplexArgs { #[pyarg(any, optional)] imag: OptionalArg<PyObjectRef>, } - -fn parse_str(s: &str) -> Option<Complex64> { - // Handle parentheses - let s = match s.strip_prefix('(') { - None => s, - Some(s) => match s.strip_suffix(')') { - None => return None, - Some(s) => s.trim(), - }, - }; - - let value = match s.strip_suffix(|c| c == 'j' || c == 'J') { - None => Complex64::new(crate::literal::float::parse_str(s)?, 0.0), - Some(mut s) => { - let mut real = 0.0; - // Find the central +/- operator. If it exists, parse the real part. 
- for (i, w) in s.as_bytes().windows(2).enumerate() { - if (w[1] == b'+' || w[1] == b'-') && !(w[0] == b'e' || w[0] == b'E') { - real = crate::literal::float::parse_str(&s[..=i])?; - s = &s[i + 1..]; - break; - } - } - - let imag = match s { - // "j", "+j" - "" | "+" => 1.0, - // "-j" - "-" => -1.0, - s => crate::literal::float::parse_str(s)?, - }; - - Complex64::new(real, imag) - } - }; - Some(value) -} diff --git a/vm/src/builtins/float.rs b/vm/src/builtins/float.rs index 48ccd2c437..27f1f3273f 100644 --- a/vm/src/builtins/float.rs +++ b/vm/src/builtins/float.rs @@ -159,9 +159,31 @@ impl Constructor for PyFloat { } fn float_from_string(val: PyObjectRef, vm: &VirtualMachine) -> PyResult<f64> { - let (bytearray, buffer, buffer_lock); + let (bytearray, buffer, buffer_lock, mapped_string); let b = if let Some(s) = val.payload_if_subclass::<PyStr>(vm) { - s.as_wtf8().trim().as_bytes() + use crate::common::str::PyKindStr; + match s.as_str_kind() { + PyKindStr::Ascii(s) => s.trim().as_bytes(), + PyKindStr::Utf8(s) => { + mapped_string = s + .trim() + .chars() + .map(|c| { + if let Some(n) = rustpython_common::str::char_to_decimal(c) { + char::from_digit(n.into(), 10).unwrap() + } else if c.is_whitespace() { + ' ' + } else { + c + } + }) + .collect::<String>(); + mapped_string.as_bytes() + } + // if there are surrogates, it's not gonna parse anyway, + // so we can just choose a known bad value + PyKindStr::Wtf8(_) => b"", + } } else if let Some(bytes) = val.payload_if_subclass::<PyBytes>(vm) { bytes.as_bytes() } else if let Some(buf) = val.payload_if_subclass::<PyByteArray>(vm) {