diff --git a/.clippy.toml b/.clippy.toml index 72e8c35cfd8..6339ccf21b4 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -1,4 +1,4 @@ -msrv = "1.77.0" +msrv = "1.79.0" cognitive-complexity-threshold = 24 missing-docs-in-crate-items = true check-private-items = true diff --git a/.github/workflows/CICD.yml b/.github/workflows/CICD.yml index 56418dd6e04..f1f9661b36b 100644 --- a/.github/workflows/CICD.yml +++ b/.github/workflows/CICD.yml @@ -11,7 +11,7 @@ env: PROJECT_NAME: coreutils PROJECT_DESC: "Core universal (cross-platform) utilities" PROJECT_AUTH: "uutils" - RUST_MIN_SRV: "1.77.0" + RUST_MIN_SRV: "1.79.0" # * style job configuration STYLE_FAIL_ON_FAULT: true ## (bool) fail the build if a style job contains a fault (error or warning); may be overridden on a per-job basis diff --git a/Cargo.toml b/Cargo.toml index a6b9958d4a0..1991679d8e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ repository = "https://github.com/uutils/coreutils" readme = "README.md" keywords = ["coreutils", "uutils", "cross-platform", "cli", "utility"] categories = ["command-line-utilities"] -rust-version = "1.77.0" +rust-version = "1.79.0" edition = "2021" build = "build.rs" diff --git a/README.md b/README.md index 9f7d1c2ae09..37c5a596b3d 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ [![dependency status](https://deps.rs/repo/github/uutils/coreutils/status.svg)](https://deps.rs/repo/github/uutils/coreutils) [![CodeCov](https://codecov.io/gh/uutils/coreutils/branch/master/graph/badge.svg)](https://codecov.io/gh/uutils/coreutils) -![MSRV](https://img.shields.io/badge/MSRV-1.77.0-brightgreen) +![MSRV](https://img.shields.io/badge/MSRV-1.79.0-brightgreen) @@ -70,7 +70,7 @@ the [coreutils docs](https://github.com/uutils/uutils.github.io) repository. ### Rust Version uutils follows Rust's release channels and is tested against stable, beta and -nightly. The current Minimum Supported Rust Version (MSRV) is `1.77.0`. +nightly. 
The current Minimum Supported Rust Version (MSRV) is `1.79.0`. ## Building diff --git a/src/uu/ls/src/ls.rs b/src/uu/ls/src/ls.rs index f4e34714704..9a22006e097 100644 --- a/src/uu/ls/src/ls.rs +++ b/src/uu/ls/src/ls.rs @@ -21,7 +21,7 @@ use std::os::windows::fs::MetadataExt; use std::{ cmp::Reverse, error::Error, - ffi::OsString, + ffi::{OsStr, OsString}, fmt::{Display, Write as FmtWrite}, fs::{self, DirEntry, FileType, Metadata, ReadDir}, io::{stdout, BufWriter, ErrorKind, Stdout, Write}, @@ -55,7 +55,7 @@ use uucore::libc::{dev_t, major, minor}; #[cfg(unix)] use uucore::libc::{S_IXGRP, S_IXOTH, S_IXUSR}; use uucore::line_ending::LineEnding; -use uucore::quoting_style::{escape_dir_name, escape_name, QuotingStyle}; +use uucore::quoting_style::{self, QuotingStyle}; use uucore::{ display::Quotable, error::{set_exit_code, UError, UResult}, @@ -2048,7 +2048,11 @@ impl PathData { /// file11 /// ``` fn show_dir_name(path_data: &PathData, out: &mut BufWriter, config: &Config) { - let escaped_name = escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style); + // FIXME: replace this with appropriate behavior for literal unprintable bytes + let escaped_name = + quoting_style::escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style) + .to_string_lossy() + .to_string(); let name = if config.hyperlink && !config.dired { create_hyperlink(&escaped_name, path_data) @@ -3002,7 +3006,6 @@ use std::sync::Mutex; #[cfg(unix)] use uucore::entries; use uucore::fs::FileInformation; -use uucore::quoting_style; #[cfg(unix)] fn cached_uid2usr(uid: u32) -> String { @@ -3542,3 +3545,10 @@ fn calculate_padding_collection( padding_collections } + +// FIXME: replace this with appropriate behavior for literal unprintable bytes +fn escape_name(name: &OsStr, style: &QuotingStyle) -> String { + quoting_style::escape_name(name, style) + .to_string_lossy() + .to_string() +} diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 33b70ee62f5..1c2d99628f7 100644 --- 
a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -13,7 +13,7 @@ mod word_count; use std::{ borrow::{Borrow, Cow}, cmp::max, - ffi::OsString, + ffi::{OsStr, OsString}, fs::{self, File}, io::{self, Write}, iter, @@ -28,7 +28,7 @@ use utf8::{BufReadDecoder, BufReadDecoderError}; use uucore::{ error::{FromIo, UError, UResult}, format_usage, help_about, help_usage, - quoting_style::{escape_name, QuotingStyle}, + quoting_style::{self, QuotingStyle}, shortcut_value_parser::ShortcutValueParser, show, }; @@ -259,7 +259,7 @@ impl<'a> Input<'a> { match self { Self::Path(path) => Some(match path.to_str() { Some(s) if !s.contains('\n') => Cow::Borrowed(s), - _ => Cow::Owned(escape_name(path.as_os_str(), QS_ESCAPE)), + _ => Cow::Owned(escape_name_wrapper(path.as_os_str())), }), Self::Stdin(StdinKind::Explicit) => Some(Cow::Borrowed(STDIN_REPR)), Self::Stdin(StdinKind::Implicit) => None, @@ -269,7 +269,7 @@ impl<'a> Input<'a> { /// Converts input into the form that appears in errors. fn path_display(&self) -> String { match self { - Self::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE), + Self::Path(path) => escape_name_wrapper(path.as_os_str()), Self::Stdin(_) => String::from("standard input"), } } @@ -361,7 +361,7 @@ impl WcError { Some((input, idx)) => { let path = match input { Input::Stdin(_) => STDIN_REPR.into(), - Input::Path(path) => escape_name(path.as_os_str(), QS_ESCAPE).into(), + Input::Path(path) => escape_name_wrapper(path.as_os_str()).into(), }; Self::ZeroLengthFileNameCtx { path, idx } } @@ -761,7 +761,9 @@ fn files0_iter_file<'a>(path: &Path) -> UResult Err(e.map_err_context(|| { format!( "cannot open {} for reading", - escape_name(path.as_os_str(), QS_QUOTE_ESCAPE) + quoting_style::escape_name(path.as_os_str(), QS_QUOTE_ESCAPE) + .into_string() + .expect("All escaped names with the escaping option return valid strings.") ) })), } @@ -793,9 +795,9 @@ fn files0_iter<'a>( Ok(Input::Path(PathBuf::from(s).into())) } } - Err(e) => Err(e.map_err_context(|| { - 
format!("{}: read error", escape_name(&err_path, QS_ESCAPE)) - }) as Box), + Err(e) => Err(e + .map_err_context(|| format!("{}: read error", escape_name_wrapper(&err_path))) + as Box), }), ); // Loop until there is an error; yield that error and then nothing else. @@ -808,6 +810,12 @@ fn files0_iter<'a>( }) } +fn escape_name_wrapper(name: &OsStr) -> String { + quoting_style::escape_name(name, QS_ESCAPE) + .into_string() + .expect("All escaped names with the escaping option return valid strings.") +} + fn wc(inputs: &Inputs, settings: &Settings) -> UResult<()> { let mut total_word_count = WordCount::default(); let mut num_inputs: usize = 0; diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 75851049895..5cdd0342122 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -112,7 +112,8 @@ fn extract_value(p: Result>, input: &str) -> T Default::default() } ParseError::PartialMatch(v, rest) => { - if input.starts_with('\'') { + let bytes = input.as_encoded_bytes(); + if !bytes.is_empty() && bytes[0] == b'\'' { show_warning!( "{}: character(s) following character constant have been ignored", &rest, diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index 581e1fa0624..81dbc1ebc29 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -353,20 +353,20 @@ impl Spec { writer.write_all(&parsed).map_err(FormatError::IoError) } Self::QuotedString => { - let s = args.get_str(); - writer - .write_all( - escape_name( - s.as_ref(), - &QuotingStyle::Shell { - escape: true, - always_quote: false, - show_control: false, - }, - ) - .as_bytes(), - ) - .map_err(FormatError::IoError) + let s = escape_name( + args.get_str().as_ref(), + &QuotingStyle::Shell { + escape: true, + always_quote: false, + show_control: false, + }, + ); + #[cfg(unix)] + let bytes = 
std::os::unix::ffi::OsStringExt::into_vec(s); + #[cfg(not(unix))] + let bytes = s.to_string_lossy().as_bytes().to_owned(); + + writer.write_all(&bytes).map_err(FormatError::IoError) } Self::SignedInt { width, diff --git a/src/uucore/src/lib/features/quoting_style.rs b/src/uucore/src/lib/features/quoting_style.rs index 1efa6f746b7..6d0265dc625 100644 --- a/src/uucore/src/lib/features/quoting_style.rs +++ b/src/uucore/src/lib/features/quoting_style.rs @@ -6,39 +6,43 @@ //! Set of functions for escaping names according to different quoting styles. use std::char::from_digit; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; use std::fmt; // These are characters with special meaning in the shell (e.g. bash). // The first const contains characters that only have a special meaning when they appear at the beginning of a name. -const SPECIAL_SHELL_CHARS_START: &[char] = &['~', '#']; +const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#"; // PR#6559 : Remove `]{}` from special shell chars. const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! "; /// The quoting style to use when escaping a name. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum QuotingStyle { - /// Escape the name as a literal string. + /// Escape the name as a shell string. + /// Used in, e.g., `ls --quoting-style=shell`. Shell { /// Whether to escape characters in the name. + /// True in, e.g., `ls --quoting-style=shell-escape`. escape: bool, /// Whether to always quote the name. always_quote: bool, - /// Whether to show control characters. + /// Whether to show control and non-unicode characters, or replace them with `?`. show_control: bool, }, /// Escape the name as a C string. + /// Used in, e.g., `ls --quote-name`. C { /// The type of quotes to use. quotes: Quotes, }, - /// Escape the name as a literal string. + /// Do not escape the string. + /// Used in, e.g., `ls --literal`. Literal { - /// Whether to show control characters. 
+ /// Whether to show control and non-unicode characters, or replace them with `?`. show_control: bool, }, } @@ -72,16 +76,24 @@ enum EscapeState { Octal(EscapeOctal), } +/// Bytes we need to present as escaped octal, in the form of `\nnn` per byte. +/// Only supports characters up to 2 bytes long in UTF-8. struct EscapeOctal { - c: char, + c: [u8; 2], state: EscapeOctalState, - idx: usize, + idx: u8, } enum EscapeOctalState { Done, - Backslash, - Value, + FirstBackslash, + FirstValue, + LastBackslash, + LastValue, +} + +fn byte_to_octal_digit(byte: u8, idx: u8) -> u8 { + (byte >> (idx * 3)) & 0o7 } impl Iterator for EscapeOctal { @@ -90,29 +102,57 @@ impl Iterator for EscapeOctal { fn next(&mut self) -> Option { match self.state { EscapeOctalState::Done => None, - EscapeOctalState::Backslash => { - self.state = EscapeOctalState::Value; + EscapeOctalState::FirstBackslash => { + self.state = EscapeOctalState::FirstValue; Some('\\') } - EscapeOctalState::Value => { - let octal_digit = ((self.c as u32) >> (self.idx * 3)) & 0o7; + EscapeOctalState::LastBackslash => { + self.state = EscapeOctalState::LastValue; + Some('\\') + } + EscapeOctalState::FirstValue => { + let octal_digit = byte_to_octal_digit(self.c[0], self.idx); + if self.idx == 0 { + self.state = EscapeOctalState::LastBackslash; + self.idx = 2; + } else { + self.idx -= 1; + } + Some(from_digit(octal_digit.into(), 8).unwrap()) + } + EscapeOctalState::LastValue => { + let octal_digit = byte_to_octal_digit(self.c[1], self.idx); if self.idx == 0 { self.state = EscapeOctalState::Done; } else { self.idx -= 1; } - Some(from_digit(octal_digit, 8).unwrap()) + Some(from_digit(octal_digit.into(), 8).unwrap()) } } } } impl EscapeOctal { - fn from(c: char) -> Self { + fn from_char(c: char) -> Self { + if c.len_utf8() == 1 { + return Self::from_byte(c as u8); + } + + let mut buf = [0; 2]; + let _s = c.encode_utf8(&mut buf); + Self { + c: buf, + idx: 2, + state: EscapeOctalState::FirstBackslash, + } + } + + fn 
from_byte(b: u8) -> Self { Self { - c, + c: [0, b], idx: 2, - state: EscapeOctalState::Backslash, + state: EscapeOctalState::LastBackslash, } } } @@ -124,6 +164,12 @@ impl EscapedChar { } } + fn new_octal(b: u8) -> Self { + Self { + state: EscapeState::Octal(EscapeOctal::from_byte(b)), + } + } + fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self { use EscapeState::*; let init_state = match c { @@ -148,7 +194,7 @@ impl EscapedChar { _ => Char(' '), }, ':' if dirname => Backslash(':'), - _ if c.is_ascii_control() => Octal(EscapeOctal::from(c)), + _ if c.is_control() => Octal(EscapeOctal::from_char(c)), _ => Char(c), }; Self { state: init_state } @@ -165,11 +211,11 @@ impl EscapedChar { '\x0B' => Backslash('v'), '\x0C' => Backslash('f'), '\r' => Backslash('r'), - '\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c)), '\'' => match quotes { Quotes::Single => Backslash('\''), _ => Char('\''), }, + _ if c.is_control() => Octal(EscapeOctal::from_char(c)), _ if SPECIAL_SHELL_CHARS.contains(c) => ForceQuote(c), _ => Char(c), }; @@ -205,102 +251,124 @@ impl Iterator for EscapedChar { } } -fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) -> (String, bool) { - let mut must_quote = false; - let mut escaped_str = String::with_capacity(name.len()); +/// Check whether `bytes` starts with any byte in `pattern`. 
+fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool { + !bytes.is_empty() && pattern.contains(&bytes[0]) +} - for c in name.chars() { - let escaped = { - let ec = EscapedChar::new_shell(c, false, quotes); - if show_control_chars { - ec - } else { - ec.hide_control() - } - }; +fn shell_without_escape(name: &[u8], quotes: Quotes, show_control_chars: bool) -> (Vec, bool) { + let mut must_quote = false; + let mut escaped_str = Vec::with_capacity(name.len()); + let mut utf8_buf = vec![0; 4]; + + for s in name.utf8_chunks() { + for c in s.valid().chars() { + let escaped = { + let ec = EscapedChar::new_shell(c, false, quotes); + if show_control_chars { + ec + } else { + ec.hide_control() + } + }; - match escaped.state { - EscapeState::Backslash('\'') => escaped_str.push_str("'\\''"), - EscapeState::ForceQuote(x) => { - must_quote = true; - escaped_str.push(x); - } - _ => { - for char in escaped { - escaped_str.push(char); + match escaped.state { + EscapeState::Backslash('\'') => escaped_str.extend_from_slice(b"'\\''"), + EscapeState::ForceQuote(x) => { + must_quote = true; + escaped_str.extend_from_slice(x.encode_utf8(&mut utf8_buf).as_bytes()); + } + _ => { + for c in escaped { + escaped_str.extend_from_slice(c.encode_utf8(&mut utf8_buf).as_bytes()); + } } } } + + if show_control_chars { + escaped_str.extend_from_slice(s.invalid()); + } else { + escaped_str.resize(escaped_str.len() + s.invalid().len(), b'?'); + } } - must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); + must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START); (escaped_str, must_quote) } -fn shell_with_escape(name: &str, quotes: Quotes) -> (String, bool) { +fn shell_with_escape(name: &[u8], quotes: Quotes) -> (Vec, bool) { // We need to keep track of whether we are in a dollar expression // because e.g. 
\b\n is escaped as $'\b\n' and not like $'b'$'n' let mut in_dollar = false; let mut must_quote = false; let mut escaped_str = String::with_capacity(name.len()); - for c in name.chars() { - let escaped = EscapedChar::new_shell(c, true, quotes); - match escaped.state { - EscapeState::Char(x) => { - if in_dollar { - escaped_str.push_str("''"); - in_dollar = false; + for s in name.utf8_chunks() { + for c in s.valid().chars() { + let escaped = EscapedChar::new_shell(c, true, quotes); + match escaped.state { + EscapeState::Char(x) => { + if in_dollar { + escaped_str.push_str("''"); + in_dollar = false; + } + escaped_str.push(x); } - escaped_str.push(x); - } - EscapeState::ForceQuote(x) => { - if in_dollar { - escaped_str.push_str("''"); - in_dollar = false; + EscapeState::ForceQuote(x) => { + if in_dollar { + escaped_str.push_str("''"); + in_dollar = false; + } + must_quote = true; + escaped_str.push(x); } - must_quote = true; - escaped_str.push(x); - } - // Single quotes are not put in dollar expressions, but are escaped - // if the string also contains double quotes. In that case, they must - // be handled separately. - EscapeState::Backslash('\'') => { - must_quote = true; - in_dollar = false; - escaped_str.push_str("'\\''"); - } - _ => { - if !in_dollar { - escaped_str.push_str("'$'"); - in_dollar = true; + // Single quotes are not put in dollar expressions, but are escaped + // if the string also contains double quotes. In that case, they must + // be handled separately. 
+ EscapeState::Backslash('\'') => { + must_quote = true; + in_dollar = false; + escaped_str.push_str("'\\''"); } - must_quote = true; - for char in escaped { - escaped_str.push(char); + _ => { + if !in_dollar { + escaped_str.push_str("'$'"); + in_dollar = true; + } + must_quote = true; + for char in escaped { + escaped_str.push(char); + } } } } + if !s.invalid().is_empty() { + if !in_dollar { + escaped_str.push_str("'$'"); + in_dollar = true; + } + must_quote = true; + let escaped_bytes: String = s + .invalid() + .iter() + .flat_map(|b| EscapedChar::new_octal(*b)) + .collect(); + escaped_str.push_str(&escaped_bytes); + } } - must_quote = must_quote || name.starts_with(SPECIAL_SHELL_CHARS_START); - (escaped_str, must_quote) + must_quote = must_quote || bytes_start_with(name, SPECIAL_SHELL_CHARS_START); + (escaped_str.into(), must_quote) } /// Return a set of characters that implies quoting of the word in /// shell-quoting mode. -fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] { - const ESCAPED_CHARS: &[char] = &[ - // the ':' colon character only induce quoting in the - // context of ls displaying a directory name before listing its content. - // (e.g. with the recursive flag -R) - ':', - // Under this line are the control characters that should be - // quoted in shell mode in all cases. - '"', '`', '$', '\\', '^', '\n', '\t', '\r', '=', - ]; - +fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] { + const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r="; + // the ':' colon character only induce quoting in the + // context of ls displaying a directory name before listing its content. + // (e.g. with the recursive flag -R) let start_index = if is_dirname { 0 } else { 1 }; - &ESCAPED_CHARS[start_index..] } @@ -308,41 +376,57 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] { /// /// This inner function provides an additional flag `dirname` which /// is meant for ls' directory name display. 
-fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> String { +fn escape_name_inner(name: &[u8], style: &QuotingStyle, dirname: bool) -> Vec { match style { QuotingStyle::Literal { show_control } => { if *show_control { - name.to_string_lossy().into_owned() + name.to_owned() } else { - name.to_string_lossy() - .chars() - .flat_map(|c| EscapedChar::new_literal(c).hide_control()) - .collect() + name.utf8_chunks() + .map(|s| { + let valid: String = s + .valid() + .chars() + .flat_map(|c| EscapedChar::new_literal(c).hide_control()) + .collect(); + let invalid = "?".repeat(s.invalid().len()); + valid + &invalid + }) + .collect::() + .into() } } QuotingStyle::C { quotes } => { let escaped_str: String = name - .to_string_lossy() - .chars() - .flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)) - .collect(); + .utf8_chunks() + .flat_map(|s| { + let valid = s + .valid() + .chars() + .flat_map(|c| EscapedChar::new_c(c, *quotes, dirname)); + let invalid = s.invalid().iter().flat_map(|b| EscapedChar::new_octal(*b)); + valid.chain(invalid) + }) + .collect::(); match quotes { Quotes::Single => format!("'{escaped_str}'"), Quotes::Double => format!("\"{escaped_str}\""), Quotes::None => escaped_str, } + .into() } QuotingStyle::Shell { escape, always_quote, show_control, } => { - let name = name.to_string_lossy(); - - let (quotes, must_quote) = if name.contains(shell_escaped_char_set(dirname)) { + let (quotes, must_quote) = if name + .iter() + .any(|c| shell_escaped_char_set(dirname).contains(c)) + { (Quotes::Single, true) - } else if name.contains('\'') { + } else if name.contains(&b'\'') { (Quotes::Double, true) } else if *always_quote { (Quotes::Single, true) @@ -351,30 +435,43 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin }; let (escaped_str, contains_quote_chars) = if *escape { - shell_with_escape(&name, quotes) + shell_with_escape(name, quotes) } else { - shell_without_escape(&name, quotes, *show_control) + 
shell_without_escape(name, quotes, *show_control) }; - match (must_quote | contains_quote_chars, quotes) { - (true, Quotes::Single) => format!("'{escaped_str}'"), - (true, Quotes::Double) => format!("\"{escaped_str}\""), - _ => escaped_str, + if must_quote | contains_quote_chars && quotes != Quotes::None { + let mut quoted_str = Vec::::with_capacity(escaped_str.len() + 2); + let quote = if quotes == Quotes::Single { + b'\'' + } else { + b'"' + }; + quoted_str.push(quote); + quoted_str.extend(escaped_str); + quoted_str.push(quote); + quoted_str + } else { + escaped_str } } } } /// Escape a filename with respect to the given style. -pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> String { - escape_name_inner(name, style, false) +pub fn escape_name(name: &OsStr, style: &QuotingStyle) -> OsString { + let name = crate::os_str_as_bytes_lossy(name); + crate::os_string_from_vec(escape_name_inner(&name, style, false)) + .expect("all byte sequences should be valid for platform, or already replaced in name") } /// Escape a directory name with respect to the given style. /// This is mainly meant to be used for ls' directory name printing and is not /// likely to be used elsewhere. 
-pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> String { - escape_name_inner(dir_name, style, true) +pub fn escape_dir_name(dir_name: &OsStr, style: &QuotingStyle) -> OsString { + let name = crate::os_str_as_bytes_lossy(dir_name); + crate::os_string_from_vec(escape_name_inner(&name, style, true)) + .expect("all byte sequences should be valid for platform, or already replaced in name") } impl fmt::Display for QuotingStyle { @@ -415,7 +512,7 @@ impl fmt::Display for Quotes { #[cfg(test)] mod tests { - use crate::quoting_style::{escape_name, Quotes, QuotingStyle}; + use crate::quoting_style::{escape_name_inner, Quotes, QuotingStyle}; // spell-checker:ignore (tests/words) one\'two one'two @@ -465,14 +562,31 @@ mod tests { } } + fn check_names_inner(name: &[u8], map: &[(T, &str)]) -> Vec> { + map.iter() + .map(|(_, style)| escape_name_inner(name, &get_style(style), false)) + .collect() + } + fn check_names(name: &str, map: &[(&str, &str)]) { assert_eq!( map.iter() - .map(|(_, style)| escape_name(name.as_ref(), &get_style(style))) - .collect::>(), + .map(|(correct, _)| *correct) + .collect::>(), + check_names_inner(name.as_bytes(), map) + .iter() + .map(|bytes| std::str::from_utf8(bytes) + .expect("valid str goes in, valid str comes out")) + .collect::>() + ); + } + + fn check_names_raw(name: &[u8], map: &[(&[u8], &str)]) { + assert_eq!( map.iter() - .map(|(correct, _)| correct.to_string()) - .collect::>() + .map(|(correct, _)| *correct) + .collect::>(), + check_names_inner(name, map) ); } @@ -487,10 +601,10 @@ mod tests { ("\"one_two\"", "c"), ("one_two", "shell"), ("one_two", "shell-show"), - ("\'one_two\'", "shell-always"), - ("\'one_two\'", "shell-always-show"), + ("'one_two'", "shell-always"), + ("'one_two'", "shell-always-show"), ("one_two", "shell-escape"), - ("\'one_two\'", "shell-escape-always"), + ("'one_two'", "shell-escape-always"), ], ); } @@ -504,12 +618,12 @@ mod tests { ("one two", "literal-show"), ("one\\ two", "escape"), ("\"one 
two\"", "c"), - ("\'one two\'", "shell"), - ("\'one two\'", "shell-show"), - ("\'one two\'", "shell-always"), - ("\'one two\'", "shell-always-show"), - ("\'one two\'", "shell-escape"), - ("\'one two\'", "shell-escape-always"), + ("'one two'", "shell"), + ("'one two'", "shell-show"), + ("'one two'", "shell-always"), + ("'one two'", "shell-always-show"), + ("'one two'", "shell-escape"), + ("'one two'", "shell-escape-always"), ], ); @@ -551,7 +665,7 @@ mod tests { // One single quote check_names( - "one\'two", + "one'two", &[ ("one'two", "literal"), ("one'two", "literal-show"), @@ -637,7 +751,7 @@ mod tests { ], ); - // The first 16 control characters. NUL is also included, even though it is of + // The first 16 ASCII control characters. NUL is also included, even though it is of // no importance for file names. check_names( "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", @@ -676,7 +790,7 @@ mod tests { ], ); - // The last 16 control characters. + // The last 16 ASCII control characters. check_names( "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F", &[ @@ -730,6 +844,265 @@ mod tests { ("''$'\\177'", "shell-escape-always"), ], ); + + // The first 16 Unicode control characters. 
+ let test_str = std::str::from_utf8(b"\xC2\x80\xC2\x81\xC2\x82\xC2\x83\xC2\x84\xC2\x85\xC2\x86\xC2\x87\xC2\x88\xC2\x89\xC2\x8A\xC2\x8B\xC2\x8C\xC2\x8D\xC2\x8E\xC2\x8F").unwrap(); + check_names( + test_str, + &[ + ("????????????????", "literal"), + (test_str, "literal-show"), + ("\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217", "escape"), + ("\"\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217\"", "c"), + ("????????????????", "shell"), + (test_str, "shell-show"), + ("'????????????????'", "shell-always"), + (&format!("'{}'", test_str), "shell-always-show"), + ("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape"), + ("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape-always"), + ], + ); + + // The last 16 Unicode control characters. 
+ let test_str = std::str::from_utf8(b"\xC2\x90\xC2\x91\xC2\x92\xC2\x93\xC2\x94\xC2\x95\xC2\x96\xC2\x97\xC2\x98\xC2\x99\xC2\x9A\xC2\x9B\xC2\x9C\xC2\x9D\xC2\x9E\xC2\x9F").unwrap(); + check_names( + test_str, + &[ + ("????????????????", "literal"), + (test_str, "literal-show"), + ("\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237", "escape"), + ("\"\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237\"", "c"), + ("????????????????", "shell"), + (test_str, "shell-show"), + ("'????????????????'", "shell-always"), + (&format!("'{}'", test_str), "shell-always-show"), + ("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape"), + ("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape-always"), + ], + ); + } + + #[test] + fn test_non_unicode_bytes() { + let ascii = b'_'; + let continuation = b'\xA7'; + let first2byte = b'\xC2'; + let first3byte = b'\xE0'; + let first4byte = b'\xF0'; + let invalid = b'\xC0'; + + // a single byte value invalid outside of additional context in UTF-8 + check_names_raw( + &[continuation], + &[ + (b"?", "literal"), + (b"\xA7", "literal-show"), + (b"\\247", "escape"), + (b"\"\\247\"", "c"), + (b"?", "shell"), + (b"\xA7", "shell-show"), + (b"'?'", "shell-always"), + (b"'\xA7'", "shell-always-show"), + (b"''$'\\247'", "shell-escape"), + (b"''$'\\247'", "shell-escape-always"), + ], + ); + + // ...but the byte becomes valid with appropriate context + // (this is just the § character in UTF-8, written as bytes) + check_names_raw( + &[first2byte, continuation], + &[ + (b"\xC2\xA7", 
"literal"), + (b"\xC2\xA7", "literal-show"), + (b"\xC2\xA7", "escape"), + (b"\"\xC2\xA7\"", "c"), + (b"\xC2\xA7", "shell"), + (b"\xC2\xA7", "shell-show"), + (b"'\xC2\xA7'", "shell-always"), + (b"'\xC2\xA7'", "shell-always-show"), + (b"\xC2\xA7", "shell-escape"), + (b"'\xC2\xA7'", "shell-escape-always"), + ], + ); + + // mixed with valid characters + check_names_raw( + &[continuation, ascii], + &[ + (b"?_", "literal"), + (b"\xA7_", "literal-show"), + (b"\\247_", "escape"), + (b"\"\\247_\"", "c"), + (b"?_", "shell"), + (b"\xA7_", "shell-show"), + (b"'?_'", "shell-always"), + (b"'\xA7_'", "shell-always-show"), + (b"''$'\\247''_'", "shell-escape"), + (b"''$'\\247''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[ascii, continuation], + &[ + (b"_?", "literal"), + (b"_\xA7", "literal-show"), + (b"_\\247", "escape"), + (b"\"_\\247\"", "c"), + (b"_?", "shell"), + (b"_\xA7", "shell-show"), + (b"'_?'", "shell-always"), + (b"'_\xA7'", "shell-always-show"), + (b"'_'$'\\247'", "shell-escape"), + (b"'_'$'\\247'", "shell-escape-always"), + ], + ); + check_names_raw( + &[ascii, continuation, ascii], + &[ + (b"_?_", "literal"), + (b"_\xA7_", "literal-show"), + (b"_\\247_", "escape"), + (b"\"_\\247_\"", "c"), + (b"_?_", "shell"), + (b"_\xA7_", "shell-show"), + (b"'_?_'", "shell-always"), + (b"'_\xA7_'", "shell-always-show"), + (b"'_'$'\\247''_'", "shell-escape"), + (b"'_'$'\\247''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[continuation, ascii, continuation], + &[ + (b"?_?", "literal"), + (b"\xA7_\xA7", "literal-show"), + (b"\\247_\\247", "escape"), + (b"\"\\247_\\247\"", "c"), + (b"?_?", "shell"), + (b"\xA7_\xA7", "shell-show"), + (b"'?_?'", "shell-always"), + (b"'\xA7_\xA7'", "shell-always-show"), + (b"''$'\\247''_'$'\\247'", "shell-escape"), + (b"''$'\\247''_'$'\\247'", "shell-escape-always"), + ], + ); + + // contiguous invalid bytes + check_names_raw( + &[ + ascii, + invalid, + ascii, + continuation, + continuation, + ascii, + continuation, + 
continuation, + continuation, + ascii, + continuation, + continuation, + continuation, + continuation, + ascii, + ], + &[ + (b"_?_??_???_????_", "literal"), + ( + b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_", + "literal-show", + ), + ( + b"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_", + "escape", + ), + ( + b"\"_\\300_\\247\\247_\\247\\247\\247_\\247\\247\\247\\247_\"", + "c", + ), + (b"_?_??_???_????_", "shell"), + ( + b"_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_", + "shell-show", + ), + (b"'_?_??_???_????_'", "shell-always"), + ( + b"'_\xC0_\xA7\xA7_\xA7\xA7\xA7_\xA7\xA7\xA7\xA7_'", + "shell-always-show", + ), + ( + b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'", + "shell-escape", + ), + ( + b"'_'$'\\300''_'$'\\247\\247''_'$'\\247\\247\\247''_'$'\\247\\247\\247\\247''_'", + "shell-escape-always", + ), + ], + ); + + // invalid multi-byte sequences that start valid + check_names_raw( + &[first2byte, ascii], + &[ + (b"?_", "literal"), + (b"\xC2_", "literal-show"), + (b"\\302_", "escape"), + (b"\"\\302_\"", "c"), + (b"?_", "shell"), + (b"\xC2_", "shell-show"), + (b"'?_'", "shell-always"), + (b"'\xC2_'", "shell-always-show"), + (b"''$'\\302''_'", "shell-escape"), + (b"''$'\\302''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[first2byte, first2byte, continuation], + &[ + (b"?\xC2\xA7", "literal"), + (b"\xC2\xC2\xA7", "literal-show"), + (b"\\302\xC2\xA7", "escape"), + (b"\"\\302\xC2\xA7\"", "c"), + (b"?\xC2\xA7", "shell"), + (b"\xC2\xC2\xA7", "shell-show"), + (b"'?\xC2\xA7'", "shell-always"), + (b"'\xC2\xC2\xA7'", "shell-always-show"), + (b"''$'\\302''\xC2\xA7'", "shell-escape"), + (b"''$'\\302''\xC2\xA7'", "shell-escape-always"), + ], + ); + check_names_raw( + &[first3byte, continuation, ascii], + &[ + (b"??_", "literal"), + (b"\xE0\xA7_", "literal-show"), + (b"\\340\\247_", "escape"), + (b"\"\\340\\247_\"", "c"), + (b"??_", "shell"), + (b"\xE0\xA7_", "shell-show"), + (b"'??_'", "shell-always"), + 
(b"'\xE0\xA7_'", "shell-always-show"), + (b"''$'\\340\\247''_'", "shell-escape"), + (b"''$'\\340\\247''_'", "shell-escape-always"), + ], + ); + check_names_raw( + &[first4byte, continuation, continuation, ascii], + &[ + (b"???_", "literal"), + (b"\xF0\xA7\xA7_", "literal-show"), + (b"\\360\\247\\247_", "escape"), + (b"\"\\360\\247\\247_\"", "c"), + (b"???_", "shell"), + (b"\xF0\xA7\xA7_", "shell-show"), + (b"'???_'", "shell-always"), + (b"'\xF0\xA7\xA7_'", "shell-always-show"), + (b"''$'\\360\\247\\247''_'", "shell-escape"), + (b"''$'\\360\\247\\247''_'", "shell-escape-always"), + ], + ); } #[test] @@ -765,7 +1138,7 @@ mod tests { ("one\\\\two", "escape"), ("\"one\\\\two\"", "c"), ("'one\\two'", "shell"), - ("\'one\\two\'", "shell-always"), + ("'one\\two'", "shell-always"), ("'one\\two'", "shell-escape"), ("'one\\two'", "shell-escape-always"), ], diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 6142e688d7c..e98a22815d4 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -253,9 +253,10 @@ pub fn read_yes() -> bool { } } -/// Helper function for processing delimiter values (which could be non UTF-8) -/// It converts OsString to &[u8] for unix targets only -/// On non-unix (i.e. Windows) it will just return an error if delimiter value is not UTF-8 +/// Converts an `OsStr` to a UTF-8 `&[u8]`. +/// +/// This always succeeds on unix platforms, +/// and fails on other platforms if the string can't be coerced to UTF-8. pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { #[cfg(unix)] let bytes = os_string.as_bytes(); @@ -271,13 +272,28 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { Ok(bytes) } -/// Helper function for converting a slice of bytes into an &OsStr -/// or OsString in non-unix targets. +/// Performs a potentially lossy conversion from `OsStr` to UTF-8 bytes. 
+/// +/// This is always lossless on unix platforms, +/// and wraps [`OsStr::to_string_lossy`] on non-unix platforms. +pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { + #[cfg(unix)] + let bytes = Cow::from(os_string.as_bytes()); + + #[cfg(not(unix))] + let bytes = match os_string.to_string_lossy() { + Cow::Borrowed(slice) => Cow::from(slice.as_bytes()), + Cow::Owned(owned) => Cow::from(owned.into_bytes()), + }; + + bytes +} + +/// Converts a `&[u8]` to an `&OsStr`, +/// or parses it as UTF-8 into an [`OsString`] on non-unix platforms. /// -/// It converts `&[u8]` to `Cow` for unix targets only. -/// On non-unix (i.e. Windows), the conversion goes through the String type -/// and thus undergo UTF-8 validation, making it fail if the stream contains -/// non-UTF-8 characters. +/// This always succeeds on unix platforms, +/// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { #[cfg(unix)] let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); @@ -289,9 +305,10 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { Ok(os_str) } -/// Helper function for making an `OsString` from a byte field -/// It converts `Vec` to `OsString` for unix targets only. -/// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8 +/// Converts a `Vec` into an `OsString`, parsing as UTF-8 on non-unix platforms. +/// +/// This always succeeds on unix platforms, +/// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { #[cfg(unix)] let s = OsString::from_vec(vec);