Skip to content

Commit b2bc2a5

Browse files
committed
quoting: add support for non-UTF8 for all quoting styles except literal
1 parent 971da04 commit b2bc2a5

File tree

1 file changed

+198
-56
lines changed

1 file changed

+198
-56
lines changed

src/uucore/src/lib/features/quoting_style.rs

Lines changed: 198 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use std::char::from_digit;
99
use std::ffi::OsStr;
1010
use std::fmt;
1111

12+
use crate::os_str_as_bytes;
13+
1214
// These are characters with special meaning in the shell (e.g. bash).
1315
// The first const contains characters that only have a special meaning when they appear at the beginning of a name.
1416
const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#'];
@@ -73,7 +75,7 @@ enum EscapeState {
7375
}
7476

7577
struct EscapeOctal {
76-
c: char,
78+
c: u32,
7779
state: EscapeOctalState,
7880
idx: usize,
7981
}
@@ -95,7 +97,7 @@ impl Iterator for EscapeOctal {
9597
Some('\\')
9698
}
9799
EscapeOctalState::Value => {
98-
let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7;
100+
let octal_digit = ((self.c) >> (self.idx * 3)) & 0o7;
99101
if self.idx == 0 {
100102
self.state = EscapeOctalState::Done;
101103
} else {
@@ -108,9 +110,17 @@ impl Iterator for EscapeOctal {
108110
}
109111

110112
impl EscapeOctal {
111-
fn from(c: char) -> Self {
113+
fn from_char(c: char) -> Self {
114+
Self {
115+
c: c as u32,
116+
idx: 2,
117+
state: EscapeOctalState::Backslash,
118+
}
119+
}
120+
121+
fn from_byte(c: u8) -> Self {
112122
Self {
113-
c,
123+
c: c as u32,
114124
idx: 2,
115125
state: EscapeOctalState::Backslash,
116126
}
@@ -148,7 +158,7 @@ impl EscapedChar {
148158
_ => Char(' '),
149159
},
150160
':' if dirname => Backslash(':'),
151-
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c)),
161+
_ if c.is_ascii_control() => Octal(EscapeOctal::from_char(c)),
152162
_ => Char(c),
153163
};
154164
Self { state: init_state }
@@ -165,7 +175,7 @@ impl EscapedChar {
165175
'\x0B' => Backslash('v'),
166176
'\x0C' => Backslash('f'),
167177
'\r' => Backslash('r'),
168-
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)),
178+
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from_char(c)),
169179
'\'' => match quotes {
170180
Quotes::Single => Backslash('\''),
171181
_ => Char('\''),
@@ -176,6 +186,15 @@ impl EscapedChar {
176186
Self { state: init_state }
177187
}
178188

189+
fn new_byte(b: u8, escape: bool) -> Self {
190+
let init_state = if escape {
191+
EscapeState::Octal(EscapeOctal::from_byte(b))
192+
} else {
193+
EscapeState::Char('?')
194+
};
195+
Self { state: init_state }
196+
}
197+
179198
fn hide_control(self) -> Self {
180199
match self.state {
181200
EscapeState::Char(c) if c.is_control() => Self {
@@ -205,18 +224,92 @@ impl Iterator for EscapedChar {
205224
}
206225
}
207226

208-
fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) {
209-
let mut must_quote = false;
210-
let mut escaped_str = String::with_capacity(name.len());
227+
/// One contiguous piece of a byte string that may not be entirely valid UTF-8.
///
/// Both variants only borrow from the original bytes, so the type is cheap
/// to copy and can be compared and debug-printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NonUtf8StringPart<'a> {
    /// A run of bytes that is valid UTF-8, viewed as text.
    Valid(&'a str),
    /// A run of bytes that is not valid UTF-8.
    Invalid(&'a [u8]),
}
211231

212-
for c in name.chars() {
213-
let escaped = {
214-
let ec = EscapedChar::new_shell(c, false, quotes);
215-
if show_control_chars {
216-
ec
232+
impl<'a> NonUtf8StringPart<'a> {
233+
fn valid(&self) -> Option<&'a str> {
234+
match self {
235+
NonUtf8StringPart::Valid(s) => Some(s),
236+
NonUtf8StringPart::Invalid(_) => None,
237+
}
238+
}
239+
}
240+
241+
/// Represents a string that might contain byte sequences that are not valid UTF-8.
242+
struct MaybeNonUtf8String<'a> {
243+
// The parts of the original byte string, in order: runs of valid UTF-8
// text and runs of invalid bytes.
source: Vec<NonUtf8StringPart<'a>>,
244+
}
245+
246+
impl<'a> MaybeNonUtf8String<'a> {
247+
fn new(source: &'a [u8]) -> Self {
248+
Self {
249+
source: source
250+
.utf8_chunks()
251+
.flat_map(|chunk| {
252+
let mut parts = vec![];
253+
if !chunk.valid().is_empty() {
254+
parts.push(NonUtf8StringPart::Valid(chunk.valid()));
255+
}
256+
if !chunk.invalid().is_empty() {
257+
parts.push(NonUtf8StringPart::Invalid(chunk.invalid()));
258+
}
259+
parts
260+
})
261+
.collect(),
262+
}
263+
}
264+
265+
fn contains_chars(&self, s: &[char]) -> bool {
266+
self.source
267+
.iter()
268+
.any(|chunk| chunk.valid().is_some_and(|valid| valid.contains(s)))
269+
}
270+
271+
fn contains_char(&self, c: char) -> bool {
272+
self.source
273+
.iter()
274+
.any(|chunk| chunk.valid().is_some_and(|valid| valid.contains(c)))
275+
}
276+
277+
fn starts_with(&self, chars: &[char]) -> bool {
278+
self.source.first().is_some_and(|chunk| {
279+
if let NonUtf8StringPart::Valid(s) = chunk {
280+
s.starts_with(chars)
217281
} else {
218-
ec.hide_control()
282+
false
219283
}
284+
})
285+
}
286+
287+
fn estimated_len(&self) -> usize {
288+
self.source.iter().fold(0, |i, chunk| match chunk {
289+
NonUtf8StringPart::Valid(s) => i + s.len(),
290+
NonUtf8StringPart::Invalid(b) => i + b.len(),
291+
})
292+
}
293+
294+
/// Returns an iterator over the stored parts, in the order they appear in `source`.
fn iter(&self) -> impl Iterator<Item = &NonUtf8StringPart<'a>> {
295+
self.source.iter()
296+
}
297+
}
298+
299+
fn shell_without_escape(
300+
name: &MaybeNonUtf8String<'_>,
301+
quotes: Quotes,
302+
show_control_chars: bool,
303+
) -> (String, bool) {
304+
let mut must_quote = false;
305+
let mut escaped_str = String::with_capacity(name.estimated_len());
306+
let chunks = name.iter();
307+
308+
let mut push_to_str = |ec: EscapedChar| {
309+
let escaped = if show_control_chars {
310+
ec
311+
} else {
312+
ec.hide_control()
220313
};
221314

222315
match escaped.state {
@@ -231,53 +324,85 @@ fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) ->
231324
}
232325
}
233326
}
327+
};
328+
329+
for chunk in chunks {
330+
match chunk {
331+
NonUtf8StringPart::Valid(s) => {
332+
for c in s.chars() {
333+
let escaped = EscapedChar::new_shell(c, false, quotes);
334+
push_to_str(escaped)
335+
}
336+
}
337+
NonUtf8StringPart::Invalid(bytes) => {
338+
for b in *bytes {
339+
let escaped = EscapedChar::new_byte(*b, false);
340+
push_to_str(escaped)
341+
}
342+
}
343+
}
234344
}
235345

236346
must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START);
237347
(escaped_str, must_quote)
238348
}
239349

240-
fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) {
350+
fn shell_with_escape(name: &MaybeNonUtf8String<'_>, quotes: Quotes) -> (String, bool) {
241351
// We need to keep track of whether we are in a dollar expression
242352
// because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
243353
let mut in_dollar = false;
244354
let mut must_quote = false;
245-
let mut escaped_str = String::with_capacity(name.len());
355+
let mut escaped_str = String::with_capacity(name.estimated_len());
356+
let chunks = name.iter();
246357

247-
for c in name.chars() {
248-
let escaped = EscapedChar::new_shell(c, true, quotes);
249-
match escaped.state {
250-
EscapeState::Char(x) => {
251-
if in_dollar {
252-
escaped_str.push_str("''");
253-
in_dollar = false;
254-
}
255-
escaped_str.push(x);
256-
}
257-
EscapeState::ForceQuote(x) => {
258-
if in_dollar {
259-
escaped_str.push_str("''");
260-
in_dollar = false;
261-
}
262-
must_quote = true;
263-
escaped_str.push(x);
358+
let mut push_to_string = |escaped: EscapedChar| match escaped.state {
359+
EscapeState::Char(x) => {
360+
if in_dollar {
361+
escaped_str.push_str("''");
362+
in_dollar = false;
264363
}
265-
// Single quotes are not put in dollar expressions, but are escaped
266-
// if the string also contains double quotes. In that case, they must
267-
// be handled separately.
268-
EscapeState::Backslash('\'') => {
269-
must_quote = true;
364+
escaped_str.push(x);
365+
}
366+
EscapeState::ForceQuote(x) => {
367+
if in_dollar {
368+
escaped_str.push_str("''");
270369
in_dollar = false;
271-
escaped_str.push_str("'\\''");
272370
}
273-
_ => {
274-
if !in_dollar {
275-
escaped_str.push_str("'$'");
276-
in_dollar = true;
371+
must_quote = true;
372+
escaped_str.push(x);
373+
}
374+
// Single quotes are not put in dollar expressions, but are escaped
375+
// if the string also contains double quotes. In that case, they must
376+
// be handled separately.
377+
EscapeState::Backslash('\'') => {
378+
must_quote = true;
379+
in_dollar = false;
380+
escaped_str.push_str("'\\''");
381+
}
382+
_ => {
383+
if !in_dollar {
384+
escaped_str.push_str("'$'");
385+
in_dollar = true;
386+
}
387+
must_quote = true;
388+
for char in escaped {
389+
escaped_str.push(char);
390+
}
391+
}
392+
};
393+
394+
for chunk in chunks {
395+
match chunk {
396+
NonUtf8StringPart::Valid(s) => {
397+
for c in s.chars() {
398+
let escaped = EscapedChar::new_shell(c, true, quotes);
399+
push_to_string(escaped)
277400
}
278-
must_quote = true;
279-
for char in escaped {
280-
escaped_str.push(char);
401+
}
402+
NonUtf8StringPart::Invalid(bytes) => {
403+
for b in *bytes {
404+
let escaped = EscapedChar::new_byte(*b, true);
405+
push_to_string(escaped)
281406
}
282407
}
283408
}
@@ -309,6 +434,12 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
309434
/// This inner function provides an additional flag `dirname` which
310435
/// is meant for ls' directory name display.
311436
fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String {
437+
// utf8_chunks separates good from bad UTF8 in a byte sequence.
438+
let name_bytes = os_str_as_bytes(name)
439+
.map(ToOwned::to_owned)
440+
.unwrap_or_else(|_| name.to_string_lossy().as_bytes().to_vec());
441+
let name_chunks = MaybeNonUtf8String::new(&name_bytes);
442+
312443
match style {
313444
QuotingStyle::Literal { show_control } => {
314445
if *show_control {
@@ -321,10 +452,21 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
321452
}
322453
}
323454
QuotingStyle::C { quotes } => {
324-
let escaped_str: String = name
325-
.to_string_lossy()
326-
.chars()
327-
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname))
455+
let escaped_str: String = name_chunks
456+
.iter()
457+
.flat_map(|chunk| {
458+
let x: Box<dyn Iterator<Item = char>> = match chunk {
459+
NonUtf8StringPart::Valid(s) => Box::new(
460+
s.chars()
461+
.flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)),
462+
),
463+
NonUtf8StringPart::Invalid(bytes) => {
464+
Box::new(bytes.iter().flat_map(|b| EscapedChar::new_byte(*b, true)))
465+
}
466+
};
467+
468+
x
469+
})
328470
.collect();
329471

330472
match quotes {
@@ -338,11 +480,11 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
338480
always_quote,
339481
show_control,
340482
} => {
341-
let name = name.to_string_lossy();
483+
let escaped_char_set = shell_escaped_char_set(dirname);
342484

343-
let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) {
485+
let (quotes, must_quote) = if name_chunks.contains_chars(escaped_char_set) {
344486
(Quotes::Single, true)
345-
} else if name.contains('\'') {
487+
} else if name_chunks.contains_char('\'') {
346488
(Quotes::Double, true)
347489
} else if *always_quote {
348490
(Quotes::Single, true)
@@ -351,9 +493,9 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
351493
};
352494

353495
let (escaped_str, contains_quote_chars) = if *escape {
354-
shell_with_escape(&name, quotes)
496+
shell_with_escape(&name_chunks, quotes)
355497
} else {
356-
shell_without_escape(&name, quotes, *show_control)
498+
shell_without_escape(&name_chunks, quotes, *show_control)
357499
};
358500

359501
match (must_quote | contains_quote_chars, quotes) {

0 commit comments

Comments
 (0)