@@ -9,6 +9,8 @@ use std::char::from_digit;
9
9
use std:: ffi:: OsStr ;
10
10
use std:: fmt;
11
11
12
+ use crate :: os_str_as_bytes;
13
+
12
14
// These are characters with special meaning in the shell (e.g. bash).
13
15
// The first const contains characters that only have a special meaning when they appear at the beginning of a name.
14
16
const SPECIAL_SHELL_CHARS_START : & [ char ] = & [ '~' , '#' ] ;
@@ -73,7 +75,7 @@ enum EscapeState {
73
75
}
74
76
75
77
struct EscapeOctal {
76
- c : char ,
78
+ c : u32 ,
77
79
state : EscapeOctalState ,
78
80
idx : usize ,
79
81
}
@@ -95,7 +97,7 @@ impl Iterator for EscapeOctal {
95
97
Some ( '\\' )
96
98
}
97
99
EscapeOctalState :: Value => {
98
- let octal_digit = ( ( self . c as u32 ) >> ( self . idx * 3 ) ) & 0o7 ;
100
+ let octal_digit = ( ( self . c ) >> ( self . idx * 3 ) ) & 0o7 ;
99
101
if self . idx == 0 {
100
102
self . state = EscapeOctalState :: Done ;
101
103
} else {
@@ -108,9 +110,17 @@ impl Iterator for EscapeOctal {
108
110
}
109
111
110
112
impl EscapeOctal {
111
- fn from ( c : char ) -> Self {
113
+ fn from_char ( c : char ) -> Self {
114
+ Self {
115
+ c : c as u32 ,
116
+ idx : 2 ,
117
+ state : EscapeOctalState :: Backslash ,
118
+ }
119
+ }
120
+
121
+ fn from_byte ( c : u8 ) -> Self {
112
122
Self {
113
- c,
123
+ c : c as u32 ,
114
124
idx : 2 ,
115
125
state : EscapeOctalState :: Backslash ,
116
126
}
@@ -148,7 +158,7 @@ impl EscapedChar {
148
158
_ => Char ( ' ' ) ,
149
159
} ,
150
160
':' if dirname => Backslash ( ':' ) ,
151
- _ if c. is_ascii_control ( ) => Octal ( EscapeOctal :: from ( c) ) ,
161
+ _ if c. is_ascii_control ( ) => Octal ( EscapeOctal :: from_char ( c) ) ,
152
162
_ => Char ( c) ,
153
163
} ;
154
164
Self { state : init_state }
@@ -165,7 +175,7 @@ impl EscapedChar {
165
175
'\x0B' => Backslash ( 'v' ) ,
166
176
'\x0C' => Backslash ( 'f' ) ,
167
177
'\r' => Backslash ( 'r' ) ,
168
- '\x00' ..='\x1F' | '\x7F' => Octal ( EscapeOctal :: from ( c) ) ,
178
+ '\x00' ..='\x1F' | '\x7F' => Octal ( EscapeOctal :: from_char ( c) ) ,
169
179
'\'' => match quotes {
170
180
Quotes :: Single => Backslash ( '\'' ) ,
171
181
_ => Char ( '\'' ) ,
@@ -176,6 +186,15 @@ impl EscapedChar {
176
186
Self { state : init_state }
177
187
}
178
188
189
+ fn new_byte ( b : u8 , escape : bool ) -> Self {
190
+ let init_state = if escape {
191
+ EscapeState :: Octal ( EscapeOctal :: from_byte ( b) )
192
+ } else {
193
+ EscapeState :: Char ( '?' )
194
+ } ;
195
+ Self { state : init_state }
196
+ }
197
+
179
198
fn hide_control ( self ) -> Self {
180
199
match self . state {
181
200
EscapeState :: Char ( c) if c. is_control ( ) => Self {
@@ -205,18 +224,92 @@ impl Iterator for EscapedChar {
205
224
}
206
225
}
207
226
208
- fn shell_without_escape ( name : & str , quotes : Quotes , show_control_chars : bool ) -> ( String , bool ) {
209
- let mut must_quote = false ;
210
- let mut escaped_str = String :: with_capacity ( name. len ( ) ) ;
227
/// One contiguous piece of a byte string: either well-formed UTF-8 text or a
/// run of bytes that is not valid UTF-8.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NonUtf8StringPart<'a> {
    Valid(&'a str),
    Invalid(&'a [u8]),
}

impl<'a> NonUtf8StringPart<'a> {
    /// Returns the text of a `Valid` part, or `None` for an `Invalid` one.
    fn valid(&self) -> Option<&'a str> {
        match *self {
            Self::Valid(text) => Some(text),
            Self::Invalid(_) => None,
        }
    }
}

/// Represents a string which might contain non UTF-8 bytes, stored as an
/// ordered list of valid and invalid parts borrowed from the original bytes.
struct MaybeNonUtf8String<'a> {
    source: Vec<NonUtf8StringPart<'a>>,
}

impl<'a> MaybeNonUtf8String<'a> {
    /// Splits `source` into alternating valid/invalid parts.
    ///
    /// Empty parts are never stored, so an empty input yields no parts at all.
    fn new(source: &'a [u8]) -> Self {
        // Push straight into one vector instead of building a temporary
        // `Vec` for every chunk.
        let mut parts = Vec::new();
        for chunk in source.utf8_chunks() {
            if !chunk.valid().is_empty() {
                parts.push(NonUtf8StringPart::Valid(chunk.valid()));
            }
            if !chunk.invalid().is_empty() {
                parts.push(NonUtf8StringPart::Invalid(chunk.invalid()));
            }
        }
        Self { source: parts }
    }

    /// Returns `true` if any valid part contains at least one of the
    /// characters in `s`. Invalid parts never match.
    fn contains_chars(&self, s: &[char]) -> bool {
        self.source
            .iter()
            .filter_map(|part| part.valid())
            .any(|text| text.contains(s))
    }

    /// Returns `true` if any valid part contains the character `c`.
    /// Invalid parts never match.
    fn contains_char(&self, c: char) -> bool {
        self.source
            .iter()
            .filter_map(|part| part.valid())
            .any(|text| text.contains(c))
    }

    /// Returns `true` if the very first part is valid text beginning with one
    /// of `chars`. A string that is empty or starts with invalid bytes never
    /// matches.
    fn starts_with(&self, chars: &[char]) -> bool {
        self.source
            .first()
            .and_then(|part| part.valid())
            .is_some_and(|text| text.starts_with(chars))
    }

    /// Total number of bytes across all parts.
    ///
    /// This equals the length of the original input and serves as a capacity
    /// hint for the escaped output.
    fn estimated_len(&self) -> usize {
        self.source
            .iter()
            .map(|part| match part {
                NonUtf8StringPart::Valid(text) => text.len(),
                NonUtf8StringPart::Invalid(bytes) => bytes.len(),
            })
            .sum()
    }

    /// Iterates over the parts in their original order.
    fn iter(&self) -> impl Iterator<Item = &NonUtf8StringPart<'a>> {
        self.source.iter()
    }
}
298
+
299
+ fn shell_without_escape (
300
+ name : & MaybeNonUtf8String < ' _ > ,
301
+ quotes : Quotes ,
302
+ show_control_chars : bool ,
303
+ ) -> ( String , bool ) {
304
+ let mut must_quote = false ;
305
+ let mut escaped_str = String :: with_capacity ( name. estimated_len ( ) ) ;
306
+ let chunks = name. iter ( ) ;
307
+
308
+ let mut push_to_str = |ec : EscapedChar | {
309
+ let escaped = if show_control_chars {
310
+ ec
311
+ } else {
312
+ ec. hide_control ( )
220
313
} ;
221
314
222
315
match escaped. state {
@@ -231,53 +324,85 @@ fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) ->
231
324
}
232
325
}
233
326
}
327
+ } ;
328
+
329
+ for chunk in chunks {
330
+ match chunk {
331
+ NonUtf8StringPart :: Valid ( s) => {
332
+ for c in s. chars ( ) {
333
+ let escaped = EscapedChar :: new_shell ( c, false , quotes) ;
334
+ push_to_str ( escaped)
335
+ }
336
+ }
337
+ NonUtf8StringPart :: Invalid ( bytes) => {
338
+ for b in * bytes {
339
+ let escaped = EscapedChar :: new_byte ( * b, false ) ;
340
+ push_to_str ( escaped)
341
+ }
342
+ }
343
+ }
234
344
}
235
345
236
346
must_quote = must_quote || name. starts_with ( SPECIAL_SHELL_CHARS_START ) ;
237
347
( escaped_str, must_quote)
238
348
}
239
349
240
- fn shell_with_escape ( name : & str , quotes : Quotes ) -> ( String , bool ) {
350
+ fn shell_with_escape ( name : & MaybeNonUtf8String < ' _ > , quotes : Quotes ) -> ( String , bool ) {
241
351
// We need to keep track of whether we are in a dollar expression
242
352
// because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
243
353
let mut in_dollar = false ;
244
354
let mut must_quote = false ;
245
- let mut escaped_str = String :: with_capacity ( name. len ( ) ) ;
355
+ let mut escaped_str = String :: with_capacity ( name. estimated_len ( ) ) ;
356
+ let chunks = name. iter ( ) ;
246
357
247
- for c in name. chars ( ) {
248
- let escaped = EscapedChar :: new_shell ( c, true , quotes) ;
249
- match escaped. state {
250
- EscapeState :: Char ( x) => {
251
- if in_dollar {
252
- escaped_str. push_str ( "''" ) ;
253
- in_dollar = false ;
254
- }
255
- escaped_str. push ( x) ;
256
- }
257
- EscapeState :: ForceQuote ( x) => {
258
- if in_dollar {
259
- escaped_str. push_str ( "''" ) ;
260
- in_dollar = false ;
261
- }
262
- must_quote = true ;
263
- escaped_str. push ( x) ;
358
+ let mut push_to_string = |escaped : EscapedChar | match escaped. state {
359
+ EscapeState :: Char ( x) => {
360
+ if in_dollar {
361
+ escaped_str. push_str ( "''" ) ;
362
+ in_dollar = false ;
264
363
}
265
- // Single quotes are not put in dollar expressions, but are escaped
266
- // if the string also contains double quotes. In that case, they must
267
- // be handled separately.
268
- EscapeState :: Backslash ( '\'' ) => {
269
- must_quote = true ;
364
+ escaped_str . push ( x ) ;
365
+ }
366
+ EscapeState :: ForceQuote ( x ) => {
367
+ if in_dollar {
368
+ escaped_str . push_str ( "''" ) ;
270
369
in_dollar = false ;
271
- escaped_str. push_str ( "'\\ ''" ) ;
272
370
}
273
- _ => {
274
- if !in_dollar {
275
- escaped_str. push_str ( "'$'" ) ;
276
- in_dollar = true ;
371
+ must_quote = true ;
372
+ escaped_str. push ( x) ;
373
+ }
374
+ // Single quotes are not put in dollar expressions, but are escaped
375
+ // if the string also contains double quotes. In that case, they must
376
+ // be handled separately.
377
+ EscapeState :: Backslash ( '\'' ) => {
378
+ must_quote = true ;
379
+ in_dollar = false ;
380
+ escaped_str. push_str ( "'\\ ''" ) ;
381
+ }
382
+ _ => {
383
+ if !in_dollar {
384
+ escaped_str. push_str ( "'$'" ) ;
385
+ in_dollar = true ;
386
+ }
387
+ must_quote = true ;
388
+ for char in escaped {
389
+ escaped_str. push ( char) ;
390
+ }
391
+ }
392
+ } ;
393
+
394
+ for chunk in chunks {
395
+ match chunk {
396
+ NonUtf8StringPart :: Valid ( s) => {
397
+ for c in s. chars ( ) {
398
+ let escaped = EscapedChar :: new_shell ( c, true , quotes) ;
399
+ push_to_string ( escaped)
277
400
}
278
- must_quote = true ;
279
- for char in escaped {
280
- escaped_str. push ( char) ;
401
+ }
402
+ NonUtf8StringPart :: Invalid ( bytes) => {
403
+ for b in * bytes {
404
+ let escaped = EscapedChar :: new_byte ( * b, true ) ;
405
+ push_to_string ( escaped)
281
406
}
282
407
}
283
408
}
@@ -309,6 +434,12 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
309
434
/// This inner function provides an additional flag `dirname` which
310
435
/// is meant for ls' directory name display.
311
436
fn escape_name_inner ( name : & OsStr , style : & QuotingStyle , dirname : bool ) -> String {
437
+ // utf8_chunks separates good from bad UTF8 in a byte sequence.
438
+ let name_bytes = os_str_as_bytes ( name)
439
+ . map ( ToOwned :: to_owned)
440
+ . unwrap_or_else ( |_| name. to_string_lossy ( ) . as_bytes ( ) . to_vec ( ) ) ;
441
+ let name_chunks = MaybeNonUtf8String :: new ( & name_bytes) ;
442
+
312
443
match style {
313
444
QuotingStyle :: Literal { show_control } => {
314
445
if * show_control {
@@ -321,10 +452,21 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
321
452
}
322
453
}
323
454
QuotingStyle :: C { quotes } => {
324
- let escaped_str: String = name
325
- . to_string_lossy ( )
326
- . chars ( )
327
- . flat_map ( |c| EscapedChar :: new_c ( c, * quotes, dirname) )
455
+ let escaped_str: String = name_chunks
456
+ . iter ( )
457
+ . flat_map ( |chunk| {
458
+ let x: Box < dyn Iterator < Item = char > > = match chunk {
459
+ NonUtf8StringPart :: Valid ( s) => Box :: new (
460
+ s. chars ( )
461
+ . flat_map ( |c| EscapedChar :: new_c ( c, * quotes, dirname) ) ,
462
+ ) ,
463
+ NonUtf8StringPart :: Invalid ( bytes) => {
464
+ Box :: new ( bytes. iter ( ) . flat_map ( |b| EscapedChar :: new_byte ( * b, true ) ) )
465
+ }
466
+ } ;
467
+
468
+ x
469
+ } )
328
470
. collect ( ) ;
329
471
330
472
match quotes {
@@ -338,11 +480,11 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
338
480
always_quote,
339
481
show_control,
340
482
} => {
341
- let name = name . to_string_lossy ( ) ;
483
+ let escaped_char_set = shell_escaped_char_set ( dirname ) ;
342
484
343
- let ( quotes, must_quote) = if name . contains ( shell_escaped_char_set ( dirname ) ) {
485
+ let ( quotes, must_quote) = if name_chunks . contains_chars ( escaped_char_set ) {
344
486
( Quotes :: Single , true )
345
- } else if name . contains ( '\'' ) {
487
+ } else if name_chunks . contains_char ( '\'' ) {
346
488
( Quotes :: Double , true )
347
489
} else if * always_quote {
348
490
( Quotes :: Single , true )
@@ -351,9 +493,9 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
351
493
} ;
352
494
353
495
let ( escaped_str, contains_quote_chars) = if * escape {
354
- shell_with_escape ( & name , quotes)
496
+ shell_with_escape ( & name_chunks , quotes)
355
497
} else {
356
- shell_without_escape ( & name , quotes, * show_control)
498
+ shell_without_escape ( & name_chunks , quotes, * show_control)
357
499
} ;
358
500
359
501
match ( must_quote | contains_quote_chars, quotes) {
0 commit comments