diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 35152c85b66..17dfea455d2 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -10,12 +10,11 @@ use clap::{builder::ValueParser, crate_version, Arg, ArgAction, ArgMatches, Comm use std::ffi::OsString; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, IsTerminal, Read, Write}; -#[cfg(unix)] -use std::os::unix::ffi::OsStrExt; use std::path::Path; use uucore::display::Quotable; use uucore::error::{set_exit_code, FromIo, UResult, USimpleError}; use uucore::line_ending::LineEnding; +use uucore::os_str_as_bytes; use self::searcher::Searcher; use matcher::{ExactMatcher, Matcher, WhitespaceMatcher}; @@ -59,7 +58,7 @@ impl Default for Delimiter<'_> { impl<'a> From<&'a OsString> for Delimiter<'a> { fn from(s: &'a OsString) -> Self { - Self::Slice(os_string_as_bytes(s).unwrap()) + Self::Slice(os_str_as_bytes(s).unwrap()) } } @@ -347,27 +346,6 @@ fn cut_files(mut filenames: Vec, mode: &Mode) { } } -// Helper function for processing delimiter values (which could be non UTF-8) -// It converts OsString to &[u8] for unix targets only -// On non-unix (i.e. Windows) it will just return an error if delimiter value is not UTF-8 -fn os_string_as_bytes(os_string: &OsString) -> UResult<&[u8]> { - #[cfg(unix)] - let bytes = os_string.as_bytes(); - - #[cfg(not(unix))] - let bytes = os_string - .to_str() - .ok_or_else(|| { - uucore::error::UUsageError::new( - 1, - "invalid UTF-8 was detected in one or more arguments", - ) - })? 
- .as_bytes(); - - Ok(bytes) -} - // Get delimiter and output delimiter from `-d`/`--delimiter` and `--output-delimiter` options respectively // Allow either delimiter to have a value that is neither UTF-8 nor ASCII to align with GNU behavior fn get_delimiters( @@ -395,7 +373,7 @@ fn get_delimiters( } else { // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior - let bytes = os_string_as_bytes(os_string)?; + let bytes = os_str_as_bytes(os_string)?; if os_string.to_str().is_some_and(|s| s.chars().count() > 1) || os_string.to_str().is_none() && bytes.len() > 1 { @@ -422,7 +400,7 @@ fn get_delimiters( if os_string.is_empty() || os_string == "''" { b"\0" } else { - os_string_as_bytes(os_string).unwrap() + os_str_as_bytes(os_string).unwrap() } }); Ok((delim, out_delim)) diff --git a/src/uu/tr/src/tr.rs b/src/uu/tr/src/tr.rs index 4c0389bd2b8..defe575af06 100644 --- a/src/uu/tr/src/tr.rs +++ b/src/uu/tr/src/tr.rs @@ -8,12 +8,13 @@ mod operation; mod unicode_table; -use clap::{crate_version, Arg, ArgAction, Command}; +use clap::{crate_version, value_parser, Arg, ArgAction, Command}; use operation::{ translate_input, Sequence, SqueezeOperation, SymbolTranslator, TranslateOperation, }; +use std::ffi::OsString; use std::io::{stdin, stdout, BufWriter}; -use uucore::{format_usage, help_about, help_section, help_usage, show}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes, show}; use crate::operation::DeleteOperation; use uucore::display::Quotable; @@ -43,7 +44,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { // Ultimately this should be OsString, but we might want to wait for the // pattern API on OsStr let sets: Vec<_> = matches - .get_many::<String>(options::SETS) + .get_many::<OsString>(options::SETS) .into_iter() .flatten() .map(ToOwned::to_owned) @@ -97,7 +98,7 @@ pub fn uumain(args: impl uucore::Args) -> 
UResult<()> { } if let Some(first) = sets.first() { - if first.ends_with('\\') { + if let Some(b'\\') = os_str_as_bytes(first)?.last() { show!(USimpleError::new( 0, "warning: an unescaped backslash at end of string is not portable" @@ -113,10 +114,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { // According to the man page: translating only happens if deleting or if a second set is given let translating = !delete_flag && sets.len() > 1; - let mut sets_iter = sets.iter().map(|c| c.as_str()); + let mut sets_iter = sets.iter().map(|c| c.as_os_str()); let (set1, set2) = Sequence::solve_set_characters( - sets_iter.next().unwrap_or_default().as_bytes(), - sets_iter.next().unwrap_or_default().as_bytes(), + os_str_as_bytes(sets_iter.next().unwrap_or_default())?, + os_str_as_bytes(sets_iter.next().unwrap_or_default())?, complement_flag, // if we are not translating then we don't truncate set1 truncate_set1_flag && translating, @@ -195,5 +196,9 @@ pub fn uu_app() -> Command { .action(ArgAction::SetTrue) .overrides_with(options::TRUNCATE_SET1), ) - .arg(Arg::new(options::SETS).num_args(1..)) + .arg( + Arg::new(options::SETS) + .num_args(1..) + .value_parser(value_parser!(OsString)), + ) } diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 08d0029f20d..044267d25ea 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -100,7 +100,10 @@ pub use crate::features::fsxattr; //## core functions +use std::ffi::OsStr; use std::ffi::OsString; +#[cfg(unix)] +use std::os::unix::ffi::OsStrExt; use std::sync::atomic::Ordering; use once_cell::sync::Lazy; @@ -219,6 +222,24 @@ pub fn read_yes() -> bool { } } +/// Helper function for processing command-line argument values (which could be non UTF-8) +/// It borrows the bytes of an `&OsStr` as `&[u8]`; on unix targets the raw bytes are returned as-is +/// On non-unix (i.e. 
Windows) it will just return a usage error (exit code 1) if the value is not UTF-8 +pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { + #[cfg(unix)] + let bytes = os_string.as_bytes(); + + #[cfg(not(unix))] + let bytes = os_string + .to_str() + .ok_or_else(|| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })? + .as_bytes(); + + Ok(bytes) +} + /// Prompt the user with a formatted string and returns `true` if they reply `'y'` or `'Y'` /// /// This macro functions accepts the same syntax as `format!`. The prompt is written to diff --git a/tests/by-util/test_tr.rs b/tests/by-util/test_tr.rs index 209c4f2846b..c0421c2485f 100644 --- a/tests/by-util/test_tr.rs +++ b/tests/by-util/test_tr.rs @@ -5,6 +5,9 @@ // spell-checker:ignore aabbaa aabbcc aabc abbb abbbcddd abcc abcdefabcdef abcdefghijk abcdefghijklmn abcdefghijklmnop ABCDEFGHIJKLMNOPQRS abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFZZ abcxyz ABCXYZ abcxyzabcxyz ABCXYZABCXYZ acbdef alnum amzamz AMZXAMZ bbbd cclass cefgm cntrl compl dabcdef dncase Gzabcdefg PQRST upcase wxyzz xdigit XXXYYY xycde xyyye xyyz xyzzzzxyzzzz ZABCDEF Zamz Cdefghijkl Cdefghijklmn asdfqqwweerr qwerr asdfqwer qwer aassddffqwer asdfqwer use crate::common::util::TestScenario; +#[cfg(unix)] +use std::{ffi::OsStr, os::unix::ffi::OsStrExt}; + #[test] fn test_invalid_arg() { new_ucmd!().arg("--definitely-invalid").fails().code_is(1); @@ -1427,3 +1430,18 @@ fn check_complement_set2_too_big() { .fails() .stderr_contains("when translating with complemented character classes,\nstring2 must map all characters in the domain to one"); } + +#[test] +#[cfg(unix)] +fn test_truncate_non_utf8_set() { + let stdin = &[b'\x01', b'a', b'm', b'p', 0xfe_u8, 0xff_u8]; + let set1 = OsStr::from_bytes(&[b'a', 0xfe_u8, 0xff_u8, b'z']); + let set2 = OsStr::from_bytes(b"01234"); + + new_ucmd!() + .arg(set1) + .arg(set2) + .pipe_in(*stdin) + .succeeds() + .stdout_is_bytes(b"\x010mp12"); +}