Skip to content

Commit cd6159b

Browse files
committed
Optionally support Python's WTF-8 string literals.
e.g. '-\u5171\u0141\u2661\u0363\uDC80'
1 parent 7c0354b commit cd6159b

File tree

7 files changed

+134
-66
lines changed

7 files changed

+134
-66
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ script:
88
- cargo test --no-default-features --features "$FEATURES"
99
env:
1010
matrix:
11-
- FEATURES="bigint"
11+
- FEATURES="bigint wtf8"
1212
- FEATURES=""

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ authors = ["Valentin Lorentz <progval+git@progval.net>"]
55
license = "GPL-3.0-or-later"
66

77
[features]
8-
default = ["bigint"]
8+
default = ["bigint", "wtf8"]
99
bigint = ["num-traits", "num-bigint"]
1010

1111
[[bin]]
@@ -20,6 +20,7 @@ unicode-xid = "^0.1"
2020
#unicode_names = "^0.1.7"
2121
num-traits = { version="^0.2.4", optional=true }
2222
num-bigint = { version="^0.2.0", optional=true }
23+
wtf8 = { version="^0.0.3", optional=true }
2324

2425
[dev-dependencies]
2526
pretty_assertions = "^0.4"

src/ast.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,26 @@
11
use std::fmt;
2+
23
#[cfg(feature="bigint")]
34
use num_bigint::BigUint;
45

6+
#[cfg(feature="wtf8")]
7+
use wtf8;
8+
59
#[cfg(feature="bigint")]
610
pub type IntegerType = BigUint;
711
#[cfg(not(feature="bigint"))]
812
pub type IntegerType = u64;
913

14+
#[cfg(feature="wtf8")]
15+
pub type PyStringContent = wtf8::Wtf8Buf;
16+
#[cfg(feature="wtf8")]
17+
pub type PyStringCodePoint = wtf8::CodePoint;
18+
19+
#[cfg(not(feature="wtf8"))]
20+
pub type PyStringContent = String;
21+
#[cfg(not(feature="wtf8"))]
22+
pub type PyStringCodePoint = char;
23+
1024
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1125
pub enum ArgumentError {
1226
KeywordExpression,
@@ -206,7 +220,7 @@ pub enum SetItem {
206220
#[derive(Clone, Debug, PartialEq, Eq)]
207221
pub struct PyString {
208222
pub prefix: String,
209-
pub content: String,
223+
pub content: PyStringContent,
210224
}
211225

212226
#[derive(Clone, Debug, PartialEq)]

src/expressions.rs

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -592,43 +592,51 @@ named!(pub yield_expr<StrSpan, Expression>,
592592
mod tests {
593593
use helpers::{NewlinesAreNotSpaces, make_strspan, assert_parse_eq};
594594
use super::*;
595+
596+
#[cfg(feature="wtf8")]
597+
fn new_pystring(prefix: &str, s: &str) -> PyString {
598+
PyString { prefix: prefix.to_string(), content: PyStringContent::from_str(s) }
599+
}
600+
601+
#[cfg(not(feature="wtf8"))]
602+
fn new_pystring(prefix: &str, s: &str) -> PyString {
603+
PyString { prefix: prefix.to_string(), content: s.to_string() }
604+
}
595605

596606
#[test]
597607
fn test_string() {
598608
let atom = ExpressionParser::<NewlinesAreNotSpaces>::atom;
599-
let new_pystring = |s: &str| PyString { prefix: "".to_string(), content: s.to_string() };
600609
assert_parse_eq(atom(make_strspan(r#""foo" "#)), Ok((make_strspan(" "),
601-
Box::new(Expression::String(vec![new_pystring("foo")])))
610+
Box::new(Expression::String(vec![new_pystring("", "foo")])))
602611
));
603612
assert_parse_eq(atom(make_strspan(r#""foo" "bar""#)), Ok((make_strspan(""),
604-
Box::new(Expression::String(vec![new_pystring("foo"), new_pystring("bar")])))
613+
Box::new(Expression::String(vec![new_pystring("", "foo"), new_pystring("", "bar")])))
605614
));
606615
assert_parse_eq(atom(make_strspan(r#""fo\"o" "#)), Ok((make_strspan(" "),
607-
Box::new(Expression::String(vec![new_pystring("fo\"o")])))
616+
Box::new(Expression::String(vec![new_pystring("", "fo\"o")])))
608617
));
609618
assert_parse_eq(atom(make_strspan(r#""fo"o" "#)), Ok((make_strspan(r#"o" "#),
610-
Box::new(Expression::String(vec![new_pystring("fo")])))
619+
Box::new(Expression::String(vec![new_pystring("", "fo")])))
611620
));
612621
assert_parse_eq(atom(make_strspan(r#""fo \" o" "#)), Ok((make_strspan(" "),
613-
Box::new(Expression::String(vec![new_pystring("fo \" o")])))
622+
Box::new(Expression::String(vec![new_pystring("", "fo \" o")])))
614623
));
615624
assert_parse_eq(atom(make_strspan(r#"'fo \' o' "#)), Ok((make_strspan(" "),
616-
Box::new(Expression::String(vec![new_pystring("fo ' o")])))
625+
Box::new(Expression::String(vec![new_pystring("", "fo ' o")])))
617626
));
618627
assert_parse_eq(atom(make_strspan(r#"r'fo \' o' "#)), Ok((make_strspan(" "),
619-
Box::new(Expression::String(vec![PyString { prefix: "r".to_string(), content: "fo \\' o".to_string() }])))
628+
Box::new(Expression::String(vec![new_pystring("r", "fo \\' o")])))
620629
));
621630

622631
assert_parse_eq(atom(make_strspan(r#"'\x8a'"#)), Ok((make_strspan(""),
623-
Box::new(Expression::String(vec![new_pystring("\u{8a}")])))
632+
Box::new(Expression::String(vec![new_pystring("", "\u{8a}")])))
624633
));
625634
}
626635

627636
#[test]
628637
fn test_triple_quotes_string() {
629-
let new_pystring = |s: &str| PyString { prefix: "".to_string(), content: s.to_string() };
630638
let atom = ExpressionParser::<NewlinesAreNotSpaces>::atom;
631-
assert_parse_eq(atom(make_strspan(r#"'''fo ' o''' "#)), Ok((make_strspan(" "), Box::new(Expression::String(vec![new_pystring("fo ' o")])))));
639+
assert_parse_eq(atom(make_strspan(r#"'''fo ' o''' "#)), Ok((make_strspan(" "), Box::new(Expression::String(vec![new_pystring("", "fo ' o")])))));
632640
}
633641

634642
#[test]

src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ extern crate num_traits;
1616
#[cfg(feature="bigint")]
1717
extern crate num_bigint;
1818

19+
#[cfg(feature="wtf8")]
20+
extern crate wtf8;
21+
1922
#[macro_use]
2023
mod helpers;
2124
#[macro_use]

src/strings.rs

Lines changed: 56 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,53 @@
11
use nom::anychar;
22

3+
#[cfg(feature="wtf8")]
4+
use wtf8;
5+
36
use helpers::StrSpan;
47
use ast::*;
58

6-
named!(escapedchar<StrSpan, Option<char>>,
9+
#[cfg(feature="wtf8")]
10+
fn cp_from_char(c: char) -> wtf8::CodePoint {
11+
wtf8::CodePoint::from_char(c)
12+
}
13+
#[cfg(feature="wtf8")]
14+
fn cp_from_u32(n: u32) -> Option<wtf8::CodePoint> {
15+
wtf8::CodePoint::from_u32(n)
16+
}
17+
#[cfg(not(feature="wtf8"))]
18+
fn cp_from_char(c: char) -> char {
19+
c
20+
}
21+
#[cfg(not(feature="wtf8"))]
22+
fn cp_from_u32(n: u32) -> Option<char> {
23+
::std::char::from_u32(n)
24+
}
25+
26+
named!(escapedchar<StrSpan, Option<PyStringCodePoint>>,
727
preceded!(char!('\\'),
828
alt!(
929
char!('\n') => { |_| None }
10-
| char!('\\') => { |_| Some('\\') }
11-
| char!('\'') => { |_| Some('\'') }
12-
| char!('"') => { |_| Some('"') }
13-
| char!('a') => { |_| Some('\x07') } // BEL
14-
| char!('b') => { |_| Some('\x08') } // BS
15-
| char!('f') => { |_| Some('\x0c') } // FF
16-
| char!('n') => { |_| Some('\n') }
17-
| char!('r') => { |_| Some('\r') }
18-
| char!('t') => { |_| Some('\t') }
19-
| char!('v') => { |_| Some('\x0b') } // VT
30+
| char!('\\') => { |_| Some(cp_from_char('\\')) }
31+
| char!('\'') => { |_| Some(cp_from_char('\'')) }
32+
| char!('"') => { |_| Some(cp_from_char('"')) }
33+
| char!('a') => { |_| Some(cp_from_char('\x07')) } // BEL
34+
| char!('b') => { |_| Some(cp_from_char('\x08')) } // BS
35+
| char!('f') => { |_| Some(cp_from_char('\x0c')) } // FF
36+
| char!('n') => { |_| Some(cp_from_char('\n')) }
37+
| char!('r') => { |_| Some(cp_from_char('\r')) }
38+
| char!('t') => { |_| Some(cp_from_char('\t')) }
39+
| char!('v') => { |_| Some(cp_from_char('\x0b')) } // VT
2040
| tuple!(one_of!("01234567"), opt!(one_of!("01234567")), opt!(one_of!("01234567"))) => { |(c1, c2, c3): (char, Option<char>, Option<char>)|
2141
match (c1.to_digit(8), c2.and_then(|c| c.to_digit(8)), c3.and_then(|c| c.to_digit(8))) {
22-
(Some(d1), Some(d2), Some(d3)) => ::std::char::from_u32((d1 << 6) + (d2 << 3) + d3),
23-
(Some(d1), Some(d2), None ) => ::std::char::from_u32((d1 << 3) + d2),
24-
(Some(d1), None, None ) => ::std::char::from_u32(d1),
42+
(Some(d1), Some(d2), Some(d3)) => cp_from_u32((d1 << 6) + (d2 << 3) + d3),
43+
(Some(d1), Some(d2), None ) => cp_from_u32((d1 << 3) + d2),
44+
(Some(d1), None, None ) => cp_from_u32(d1),
2545
_ => unreachable!(),
2646
}
2747
}
2848
| preceded!(char!('x'), tuple!(one_of!("0123456789abcdefABCDEF"), one_of!("0123456789abcdefABCDEF"))) => { |(c1, c2): (char, char)|
2949
match (c1.to_digit(16), c2.to_digit(16)) {
30-
(Some(d1), Some(d2)) => ::std::char::from_u32((d1 << 4) + d2),
50+
(Some(d1), Some(d2)) => cp_from_u32((d1 << 4) + d2),
3151
_ => unreachable!(),
3252
}
3353
}
@@ -38,14 +58,14 @@ named!(escapedchar<StrSpan, Option<char>>,
3858
| preceded!(char!('u'), count!(one_of!("0123456789abcdefABCDEF"), 4)) => { |v: Vec<char>| {
3959
let it: Vec<u32> = v.iter().map(|c| c.to_digit(16).unwrap()).collect();
4060
if let [d1, d2, d3, d4] = &it[..] {
41-
::std::char::from_u32((d1 << 12) + (d2 << 8) + (d3 << 4) + d4)
61+
cp_from_u32((d1 << 12) + (d2 << 8) + (d3 << 4) + d4)
4262
}
4363
else { unreachable!() }
4464
}}
4565
| preceded!(char!('U'), count!(one_of!("0123456789abcdefABCDEF"), 8)) => { |v: Vec<char>| {
4666
let it: Vec<u32> = v.iter().map(|c| c.to_digit(16).unwrap()).collect();
4767
if let [d1, d2, d3, d4, d5, d6, d7, d8] = &it[..] {
48-
::std::char::from_u32((d1 << 28) + (d2 << 24) + (d3 << 20) + (d4 << 16) +
68+
cp_from_u32((d1 << 28) + (d2 << 24) + (d3 << 20) + (d4 << 16) +
4969
(d5 << 12) + (d6 << 8) + (d7 << 4) + d8)
5070
}
5171
else { unreachable!() }
@@ -54,51 +74,51 @@ named!(escapedchar<StrSpan, Option<char>>,
5474
)
5575
);
5676

57-
named_args!(shortstring(quote: char) <StrSpan, String>,
77+
named_args!(shortstring(quote: char) <StrSpan, PyStringContent>,
5878
fold_many0!(
5979
alt!(
6080
call!(escapedchar)
61-
| verify!(anychar, |c:char| c != quote) => { |c:char| Some(c) }
81+
| verify!(anychar, |c:char| c != quote) => { |c:char| Some(cp_from_char(c)) }
6282
),
63-
String::new(),
64-
|mut acc:String, c:Option<char>| { match c { Some(c) => acc.push_str(&c.to_string()), None => () }; acc }
83+
PyStringContent::new(),
84+
|mut acc:PyStringContent, c:Option<PyStringCodePoint>| { match c { Some(c) => acc.push(c), None => () }; acc }
6585
)
6686
);
6787

68-
named_args!(longstring(quote: char) <StrSpan, String>,
88+
named_args!(longstring(quote: char) <StrSpan, PyStringContent>,
6989
fold_many0!(
7090
alt!(
7191
call!(escapedchar)
72-
| verify!(tuple!(peek!(take!(3)), anychar), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| Some(c) }
92+
| verify!(tuple!(peek!(take!(3)), anychar), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| Some(cp_from_char(c)) }
7393
),
74-
String::new(),
75-
|mut acc:String, c:Option<char>| { match c { Some(c) => acc.push_str(&c.to_string()), None => () }; acc }
94+
PyStringContent::new(),
95+
|mut acc:PyStringContent, c:Option<PyStringCodePoint>| { match c { Some(c) => acc.push(c), None => () }; acc }
7696
)
7797
);
7898

79-
named_args!(shortrawstring(quote: char) <StrSpan, String>,
99+
named_args!(shortrawstring(quote: char) <StrSpan, PyStringContent>,
80100
fold_many0!(
81101
alt!(
82-
tuple!(char!('\\'), anychar) => { |(c1,c2)| (c1, Some(c2)) }
83-
| verify!(none_of!("\\"), |c:char| c != quote) => { |c:char| (c, None) }
102+
tuple!(char!('\\'), anychar) => { |(c1,c2)| (cp_from_char(c1), Some(cp_from_char(c2))) }
103+
| verify!(none_of!("\\"), |c:char| c != quote) => { |c:char| (cp_from_char(c), None) }
84104
),
85-
String::new(),
86-
|mut acc:String, (c1,c2):(char, Option<char>)| {
105+
PyStringContent::new(),
106+
|mut acc:PyStringContent, (c1,c2):(PyStringCodePoint, Option<PyStringCodePoint>)| {
87107
acc.push(c1);
88108
match c2 { Some(c) => acc.push(c), None => () };
89109
acc
90110
}
91111
)
92112
);
93113

94-
named_args!(longrawstring(quote: char) <StrSpan, String>,
114+
named_args!(longrawstring(quote: char) <StrSpan, PyStringContent>,
95115
fold_many0!(
96116
alt!(
97-
tuple!(char!('\\'), anychar) => { |(c1,c2)| (c1, Some(c2)) }
98-
| verify!(tuple!(peek!(take!(3)), none_of!("\\")), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| (c, None) }
117+
tuple!(char!('\\'), anychar) => { |(c1,c2)| (cp_from_char(c1), Some(cp_from_char(c2))) }
118+
| verify!(tuple!(peek!(take!(3)), none_of!("\\")), |(s,_):(StrSpan,_)| { s.fragment.0.chars().collect::<Vec<char>>() != vec![quote,quote,quote] }) => { |(_,c)| (cp_from_char(c), None) }
99119
),
100-
String::new(),
101-
|mut acc:String, (c1,c2):(char, Option<char>)| {
120+
PyStringContent::new(),
121+
|mut acc:PyStringContent, (c1,c2):(PyStringCodePoint, Option<PyStringCodePoint>)| {
102122
acc.push(c1);
103123
match c2 { Some(c) => acc.push(c), None => () };
104124
acc
@@ -123,7 +143,7 @@ named!(pub string<StrSpan, PyString>,
123143
| delimited!(char!('\''), call!(shortrawstring, '\''), char!('\''))
124144
| delimited!(char!('"'), call!(shortrawstring, '"'), char!('"'))
125145
)
126-
) >> (PyString { prefix: prefix.to_string(), content: content.to_string() })
146+
) >> (PyString { prefix: prefix.to_string(), content: content })
127147
)
128148
);
129149

src/visitors/printer.rs

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,43 @@ fn format_float(n: f64) -> String {
449449
s
450450
}
451451

452+
#[cfg(feature="wtf8")]
453+
fn format_string(v: &Vec<PyString>) -> String {
454+
space_join(v.iter().map(|PyString { prefix, content }|
455+
format!("{}\"{}\"", prefix.to_ascii_lowercase().replace("r", ""), content.code_points().map(|c| match c.to_u32() {
456+
0xd => "\\r".to_string(),
457+
0xa => "\\n".to_string(),
458+
0x9 => "\\t".to_string(),
459+
0x5c => "\\\\".to_string(),
460+
0x22 => "\\\"".to_string(),
461+
0x20...0x7e => c.to_char().unwrap().to_string(), // unwrap can't panic
462+
0x00...0x1f | 0x7f | 0x80...0xff => format!("\\x{:02x}", c.to_u32()),
463+
0x100...0xffff => format!("\\u{:04x}", c.to_u32()),
464+
0x10000...0x10ffff => format!("\\U{:08x}", c.to_u32()),
465+
_ => unreachable!(),
466+
}).collect::<Vec<_>>()[..].concat())
467+
))
468+
}
469+
470+
#[cfg(not(feature="wtf8"))]
471+
fn format_string(v: &Vec<PyString>) -> String {
472+
space_join(v.iter().map(|PyString { prefix, content }|
473+
format!("{}\"{}\"", prefix.to_ascii_lowercase().replace("r", ""), content.chars().map(|c| match c {
474+
'\r' => "\\r".to_string(),
475+
'\n' => "\\n".to_string(),
476+
'\t' => "\\t".to_string(),
477+
'\\' => "\\\\".to_string(),
478+
'"' => "\\\"".to_string(),
479+
'\x20'...'\x7e' => c.to_string(),
480+
'\x00'...'\x1f' | '\x7f' | '\u{80}'...'\u{ff}' => format!("\\x{:02x}", c as u8),
481+
'\u{100}'...'\u{ffff}' => format!("\\u{:04x}", c as u16),
482+
'\u{10000}'...'\u{10ffff}' => format!("\\U{:08x}", c as u32),
483+
_ => unreachable!(),
484+
}).collect::<Vec<_>>()[..].concat())
485+
))
486+
}
487+
488+
452489
fn format_expr(e: &Expression) -> String {
453490
match e {
454491
Expression::Ellipsis => "...".to_string(),
@@ -460,22 +497,7 @@ fn format_expr(e: &Expression) -> String {
460497
Expression::ImaginaryInt(ref n) => format!("{}j", n),
461498
Expression::Float(ref n) => format_float(*n),
462499
Expression::ImaginaryFloat(ref n) => format!("{}j", format_float(*n)),
463-
Expression::String(ref v) => {
464-
space_join(v.iter().map(|PyString { prefix, content }|
465-
format!("{}\"{}\"", prefix.to_ascii_lowercase().replace("r", ""), content.chars().map(|c| match c {
466-
'\r' => "\\r".to_string(),
467-
'\n' => "\\n".to_string(),
468-
'\t' => "\\t".to_string(),
469-
'\\' => "\\\\".to_string(),
470-
'"' => "\\\"".to_string(),
471-
'\x20'...'\x7e' => c.to_string(),
472-
'\x00'...'\x1f' | '\x7f' | '\u{80}'...'\u{ff}' => format!("\\x{:02x}", c as u8),
473-
'\u{100}'...'\u{ffff}' => format!("\\u{:04x}", c as u16),
474-
'\u{10000}'...'\u{10ffff}' => format!("\\U{:08x}", c as u32),
475-
_ => unreachable!(),
476-
}).collect::<Vec<_>>()[..].concat())
477-
))
478-
},
500+
Expression::String(ref v) => format_string(v),
479501
Expression::Bytes(ref content) => {
480502
format!("b\"{}\"", content.iter().map(|b| match b {
481503
b'\r' => "\\r".to_string(),

0 commit comments

Comments
 (0)