28
28
29
29
use std:: io;
30
30
31
+ use itertools:: Itertools ;
32
+ use rustpython_common:: wtf8:: { CodePoint , Wtf8 , Wtf8Buf } ;
33
+
31
34
static ESCAPE_CHARS : [ & str ; 0x20 ] = [
32
35
"\\ u0000" , "\\ u0001" , "\\ u0002" , "\\ u0003" , "\\ u0004" , "\\ u0005" , "\\ u0006" , "\\ u0007" , "\\ b" ,
33
36
"\\ t" , "\\ n" , "\\ u000" , "\\ f" , "\\ r" , "\\ u000e" , "\\ u000f" , "\\ u0010" , "\\ u0011" , "\\ u0012" ,
@@ -111,39 +114,39 @@ impl DecodeError {
111
114
}
112
115
113
116
enum StrOrChar < ' a > {
114
- Str ( & ' a str ) ,
115
- Char ( char ) ,
117
+ Str ( & ' a Wtf8 ) ,
118
+ Char ( CodePoint ) ,
116
119
}
117
120
impl StrOrChar < ' _ > {
118
121
fn len ( & self ) -> usize {
119
122
match self {
120
123
StrOrChar :: Str ( s) => s. len ( ) ,
121
- StrOrChar :: Char ( c) => c. len_utf8 ( ) ,
124
+ StrOrChar :: Char ( c) => c. len_wtf8 ( ) ,
122
125
}
123
126
}
124
127
}
125
128
pub fn scanstring < ' a > (
126
- s : & ' a str ,
129
+ s : & ' a Wtf8 ,
127
130
end : usize ,
128
131
strict : bool ,
129
- ) -> Result < ( String , usize ) , DecodeError > {
132
+ ) -> Result < ( Wtf8Buf , usize ) , DecodeError > {
130
133
let mut chunks: Vec < StrOrChar < ' a > > = Vec :: new ( ) ;
131
134
let mut output_len = 0usize ;
132
135
let mut push_chunk = |chunk : StrOrChar < ' a > | {
133
136
output_len += chunk. len ( ) ;
134
137
chunks. push ( chunk) ;
135
138
} ;
136
139
let unterminated_err = || DecodeError :: new ( "Unterminated string starting at" , end - 1 ) ;
137
- let mut chars = s. char_indices ( ) . enumerate ( ) . skip ( end) . peekable ( ) ;
140
+ let mut chars = s. code_point_indices ( ) . enumerate ( ) . skip ( end) . peekable ( ) ;
138
141
let & ( _, ( mut chunk_start, _) ) = chars. peek ( ) . ok_or_else ( unterminated_err) ?;
139
142
while let Some ( ( char_i, ( byte_i, c) ) ) = chars. next ( ) {
140
- match c {
143
+ match c. to_char_lossy ( ) {
141
144
'"' => {
142
145
push_chunk ( StrOrChar :: Str ( & s[ chunk_start..byte_i] ) ) ;
143
- let mut out = String :: with_capacity ( output_len) ;
146
+ let mut out = Wtf8Buf :: with_capacity ( output_len) ;
144
147
for x in chunks {
145
148
match x {
146
- StrOrChar :: Str ( s) => out. push_str ( s) ,
149
+ StrOrChar :: Str ( s) => out. push_wtf8 ( s) ,
147
150
StrOrChar :: Char ( c) => out. push ( c) ,
148
151
}
149
152
}
@@ -152,7 +155,7 @@ pub fn scanstring<'a>(
152
155
'\\' => {
153
156
push_chunk ( StrOrChar :: Str ( & s[ chunk_start..byte_i] ) ) ;
154
157
let ( _, ( _, c) ) = chars. next ( ) . ok_or_else ( unterminated_err) ?;
155
- let esc = match c {
158
+ let esc = match c. to_char_lossy ( ) {
156
159
'"' => "\" " ,
157
160
'\\' => "\\ " ,
158
161
'/' => "/" ,
@@ -162,41 +165,33 @@ pub fn scanstring<'a>(
162
165
'r' => "\r " ,
163
166
't' => "\t " ,
164
167
'u' => {
165
- let surrogate_err = || DecodeError :: new ( "unpaired surrogate" , char_i) ;
166
168
let mut uni = decode_unicode ( & mut chars, char_i) ?;
167
169
chunk_start = byte_i + 6 ;
168
- if ( 0xd800 ..= 0xdbff ) . contains ( & uni ) {
170
+ if let Some ( lead ) = uni . to_lead_surrogate ( ) {
169
171
// uni is a surrogate -- try to find its pair
170
- if let Some ( & ( pos2, ( _, '\\' ) ) ) = chars. peek ( ) {
171
- // ok, the next char starts an escape
172
- chars. next ( ) ;
173
- if let Some ( ( _, ( _, 'u' ) ) ) = chars. peek ( ) {
174
- // ok, it's a unicode escape
175
- chars. next ( ) ;
176
- let uni2 = decode_unicode ( & mut chars, pos2) ?;
172
+ let mut chars2 = chars. clone ( ) ;
173
+ if let Some ( ( ( pos2, _) , ( _, _) ) ) = chars2
174
+ . next_tuple ( )
175
+ . filter ( |( ( _, ( _, c1) ) , ( _, ( _, c2) ) ) | * c1 == '\\' && * c2 == 'u' )
176
+ {
177
+ let uni2 = decode_unicode ( & mut chars2, pos2) ?;
178
+ if let Some ( trail) = uni2. to_trail_surrogate ( ) {
179
+ // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates
180
+ uni = lead. merge ( trail) . into ( ) ;
177
181
chunk_start = pos2 + 6 ;
178
- if ( 0xdc00 ..=0xdfff ) . contains ( & uni2) {
179
- // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates
180
- uni = 0x10000 + ( ( ( uni - 0xd800 ) << 10 ) | ( uni2 - 0xdc00 ) ) ;
181
- } else {
182
- // if we don't find a matching surrogate, error -- until str
183
- // isn't utf8 internally, we can't parse surrogates
184
- return Err ( surrogate_err ( ) ) ;
185
- }
186
- } else {
187
- return Err ( surrogate_err ( ) ) ;
182
+ chars = chars2;
188
183
}
189
184
}
190
185
}
191
- push_chunk ( StrOrChar :: Char (
192
- std:: char:: from_u32 ( uni) . ok_or_else ( surrogate_err) ?,
193
- ) ) ;
186
+ push_chunk ( StrOrChar :: Char ( uni) ) ;
194
187
continue ;
195
188
}
196
- _ => return Err ( DecodeError :: new ( format ! ( "Invalid \\ escape: {c:?}" ) , char_i) ) ,
189
+ _ => {
190
+ return Err ( DecodeError :: new ( format ! ( "Invalid \\ escape: {c:?}" ) , char_i) ) ;
191
+ }
197
192
} ;
198
193
chunk_start = byte_i + 2 ;
199
- push_chunk ( StrOrChar :: Str ( esc) ) ;
194
+ push_chunk ( StrOrChar :: Str ( esc. as_ref ( ) ) ) ;
200
195
}
201
196
'\x00' ..='\x1f' if strict => {
202
197
return Err ( DecodeError :: new (
@@ -211,16 +206,16 @@ pub fn scanstring<'a>(
211
206
}
212
207
213
208
#[ inline]
214
- fn decode_unicode < I > ( it : & mut I , pos : usize ) -> Result < u32 , DecodeError >
209
+ fn decode_unicode < I > ( it : & mut I , pos : usize ) -> Result < CodePoint , DecodeError >
215
210
where
216
- I : Iterator < Item = ( usize , ( usize , char ) ) > ,
211
+ I : Iterator < Item = ( usize , ( usize , CodePoint ) ) > ,
217
212
{
218
213
let err = || DecodeError :: new ( "Invalid \\ uXXXX escape" , pos) ;
219
214
let mut uni = 0 ;
220
215
for x in ( 0 ..4 ) . rev ( ) {
221
216
let ( _, ( _, c) ) = it. next ( ) . ok_or_else ( err) ?;
222
- let d = c. to_digit ( 16 ) . ok_or_else ( err) ?;
223
- uni += d * 16u32 . pow ( x) ;
217
+ let d = c. to_char ( ) . and_then ( |c| c . to_digit ( 16 ) ) . ok_or_else ( err) ? as u16 ;
218
+ uni += d * 16u16 . pow ( x) ;
224
219
}
225
- Ok ( uni)
220
+ Ok ( uni. into ( ) )
226
221
}
0 commit comments