From 57d4750663d3b4e618526cff372d8f1089399d70 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 22 Oct 2024 13:38:14 -0500 Subject: [PATCH 1/3] printf: accept non-UTF-8 input in FORMAT and ARGUMENT arguments Other implementations of `printf` permit arbitrary data to be passed to `printf`. The only restriction is that a null byte terminates FORMAT and ARGUMENT argument strings (since they are C strings). The current implementation only accepts FORMAT and ARGUMENT arguments that are valid UTF-8 (this is being enforced by clap). This commit removes the UTF-8 validation by switching to OsStr and OsString. This allows users to use `printf` to transmit or reformat null-safe but not UTF-8-safe data, such as text encoded in an 8-bit text encoding. See the `non_utf_8_input` test for an example (ISO-8859-1 text). --- src/uu/echo/src/echo.rs | 30 +-- src/uu/printf/src/printf.rs | 30 ++- .../src/lib/features/format/argument.rs | 233 ++++++++++++------ src/uucore/src/lib/features/format/mod.rs | 34 ++- src/uucore/src/lib/features/format/spec.rs | 77 +++--- src/uucore/src/lib/lib.rs | 125 ++++++++-- tests/by-util/test_printf.rs | 35 +++ 7 files changed, 379 insertions(+), 185 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 4df76634843..fe0843bd5e6 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -6,11 +6,11 @@ use clap::builder::ValueParser; use clap::{Arg, ArgAction, Command}; use std::env; -use std::ffi::{OsStr, OsString}; +use std::ffi::OsString; use std::io::{self, StdoutLock, Write}; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; use uucore::format::{FormatChar, OctalParsing, parse_escape_only}; -use uucore::{format_usage, help_about, help_section, help_usage}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose}; const ABOUT: &str = help_about!("echo.md"); const USAGE: &str = help_usage!("echo.md"); @@ -178,13 +178,9 @@ fn execute( escaped: bool, ) -> UResult<()> { for (i, input) in arguments_after_options.into_iter().enumerate() { - let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { - return Err(USimpleError::new( - 1, - "Non-UTF-8 arguments provided, but this platform does not support them", - )); - }; + let bytes = os_str_as_bytes_verbose(&input)?; + // Don't print a space before the first argument if i > 0 { stdout_lock.write_all(b" ")?; } @@ -206,19 +202,3 @@ fn execute( Ok(()) } - -fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { - #[cfg(target_family = "unix")] - { - use std::os::unix::ffi::OsStrExt; - - Some(input.as_bytes()) - } - - #[cfg(not(target_family = "unix"))] - { - // TODO - // Verify that this works correctly on these platforms - input.to_str().map(|st| st.as_bytes()) - } -} diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index 887ad4107a7..a7f73342818 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -3,11 +3,16 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. use clap::{Arg, ArgAction, Command}; +use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; -use uucore::format::{FormatArgument, FormatArguments, FormatItem, parse_spec_and_escape}; -use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes, show_warning}; +use uucore::format::{ + FormatArgument, FormatArguments, FormatError, FormatItem, parse_spec_and_escape, +}; +use uucore::{ + format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose, show_warning, +}; const VERSION: &str = "version"; const HELP: &str = "help"; @@ -19,21 +24,19 @@ mod options { pub const FORMAT: &str = "FORMAT"; pub const ARGUMENT: &str = "ARGUMENT"; } + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().get_matches_from(args); let format = matches - .get_one::(options::FORMAT) + .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; - let format = os_str_as_bytes(format)?; + let format = os_str_as_bytes_verbose(format).map_err(FormatError::from)?; - let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { - // FIXME: use os_str_as_bytes once FormatArgument supports Vec + let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { Some(s) => s - .map(|os_string| { - FormatArgument::Unparsed(std::ffi::OsStr::to_string_lossy(os_string).to_string()) - }) + .map(|os_string| FormatArgument::Unparsed(os_string.to_owned())) .collect(), None => vec![], }; @@ -59,7 +62,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let Some(FormatArgument::Unparsed(arg_str)) = args.peek_arg() else { unreachable!("All args are transformed to Unparsed") }; - show_warning!("ignoring excess arguments, starting with '{arg_str}'"); + show_warning!( + "ignoring excess arguments, starting with '{}'", + arg_str.to_string_lossy() + ); } return Ok(()); } @@ -98,10 +104,10 @@ pub fn uu_app() -> Command { .help("Print version information") .action(ArgAction::Version), ) - .arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(std::ffi::OsString))) + .arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(OsString))) .arg( Arg::new(options::ARGUMENT) .action(ArgAction::Append) - .value_parser(clap::value_parser!(std::ffi::OsString)), + .value_parser(clap::value_parser!(OsString)), ) } diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index f3edbae5576..c26cba2a5ed 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -3,16 +3,20 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use super::ExtendedBigDecimal; +use super::{ExtendedBigDecimal, FormatError}; use crate::format::spec::ArgumentLocation; use crate::{ error::set_exit_code, + os_str_as_bytes_verbose, os_str_as_str_verbose, parser::num_parser::{ExtendedParser, ExtendedParserError}, quoting_style::{Quotes, QuotingStyle, escape_name}, show_error, show_warning, }; use os_display::Quotable; -use std::{ffi::OsStr, num::NonZero}; +use std::{ + ffi::{OsStr, OsString}, + num::NonZero, +}; /// An argument for formatting /// @@ -24,12 +28,12 @@ use std::{ffi::OsStr, num::NonZero}; #[derive(Clone, Debug, PartialEq)] pub enum FormatArgument { Char(char), - String(String), + String(OsString), UnsignedInt(u64), SignedInt(i64), Float(ExtendedBigDecimal), /// Special argument that gets coerced into the other variants - Unparsed(String), + Unparsed(OsString), } /// A struct that holds a slice of format arguments and provides methods to access them @@ -69,33 +73,41 @@ impl<'a> FormatArguments<'a> { self.next_arg_position = self.current_offset; } - pub fn next_char(&mut self, position: &ArgumentLocation) -> u8 { + pub fn next_char(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::Char(c)) => *c as u8, - Some(FormatArgument::Unparsed(s)) => s.bytes().next().unwrap_or(b'\0'), - _ => b'\0', + Some(FormatArgument::Char(c)) => Ok(*c as u8), + Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes_verbose(os)?.first() { + Some(&byte) => Ok(byte), + None => Ok(b'\0'), + }, + _ => Ok(b'\0'), } } - pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a str { + pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a OsStr { match self.next_arg(position) { - Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s, - _ => "", + Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os, + _ => "".as_ref(), } } - pub fn next_i64(&mut self, position: &ArgumentLocation) -> i64 { + pub fn next_i64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::SignedInt(n)) => *n, - Some(FormatArgument::Unparsed(s)) => extract_value(i64::extended_parse(s), s), - _ => 0, + Some(FormatArgument::SignedInt(n)) => Ok(*n), + Some(FormatArgument::Unparsed(os)) => { + let str = os_str_as_str_verbose(os)?; + + Ok(extract_value(i64::extended_parse(str), str)) + } + _ => Ok(0), } } - pub fn next_u64(&mut self, position: &ArgumentLocation) -> u64 { + pub fn next_u64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::UnsignedInt(n)) => *n, - Some(FormatArgument::Unparsed(s)) => { + Some(FormatArgument::UnsignedInt(n)) => Ok(*n), + Some(FormatArgument::Unparsed(os)) => { + let s = os_str_as_str_verbose(os)?; // Check if the string is a character literal enclosed in quotes if s.starts_with(['"', '\'']) { // Extract the content between the quotes safely using chars @@ -109,23 +121,28 @@ impl<'a> FormatArguments<'a> { remaining ); } - return first_char as u64; // Use only the first character + return Ok(first_char as u64); // Use only the first character } - return 0; // Empty quotes + return Ok(0); // Empty quotes } - extract_value(u64::extended_parse(s), s) + Ok(extract_value(u64::extended_parse(s), s)) } - _ => 0, + _ => Ok(0), } } - pub fn next_extended_big_decimal(&mut self, position: &ArgumentLocation) -> ExtendedBigDecimal { + pub fn next_extended_big_decimal( + &mut self, + position: &ArgumentLocation, + ) -> Result { match self.next_arg(position) { - Some(FormatArgument::Float(n)) => n.clone(), - Some(FormatArgument::Unparsed(s)) => { - extract_value(ExtendedBigDecimal::extended_parse(s), s) + Some(FormatArgument::Float(n)) => Ok(n.clone()), + Some(FormatArgument::Unparsed(os)) => { + let s = os_str_as_str_verbose(os)?; + + Ok(extract_value(ExtendedBigDecimal::extended_parse(s), s)) } - _ => ExtendedBigDecimal::zero(), + _ => Ok(ExtendedBigDecimal::zero()), } } @@ -181,6 +198,7 @@ fn extract_value(p: Result>, input: &s } else { show_error!("{}: value not completely converted", input.quote()); } + v } } @@ -205,7 +223,10 @@ mod tests { assert!(!args.is_exhausted()); assert_eq!(Some(&FormatArgument::Char('a')), args.peek_arg()); assert!(!args.is_exhausted()); // Peek shouldn't consume - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(args.is_exhausted()); // After batch, exhausted with a single arg assert_eq!(None, args.peek_arg()); @@ -226,26 +247,50 @@ mod tests { ]); // First batch - two sequential calls - assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'y', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'z', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'y', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'x', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'w', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'x', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'w', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern - assert_eq!(b'v', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'u', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'v', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'u', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Fourth batch - same pattern (last batch) - assert_eq!(b't', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b's', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b't', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b's', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -255,28 +300,37 @@ mod tests { // Test with different method types in sequence let args = [ FormatArgument::Char('a'), - FormatArgument::String("hello".to_string()), - FormatArgument::Unparsed("123".to_string()), - FormatArgument::String("world".to_string()), + FormatArgument::String("hello".into()), + FormatArgument::Unparsed("123".into()), + FormatArgument::String("world".into()), FormatArgument::Char('z'), - FormatArgument::String("test".to_string()), + FormatArgument::String("test".into()), ]; let mut args = FormatArguments::new(&args); // First batch - next_char followed by next_string - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'1', args.next_char(&ArgumentLocation::NextArgument)); // First byte of 123 + assert_eq!( + b'1', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // First byte of 123 assert_eq!("world", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern (last batch) - assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'z', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); assert_eq!("test", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); @@ -302,23 +356,23 @@ mod tests { ]); // First batch - positional access - assert_eq!(b'b', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'a', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'b', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same positional pattern - assert_eq!(b'e', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'd', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'e', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'd', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same positional pattern (last batch) - assert_eq!(b'h', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'g', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'i', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'h', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'g', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'i', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(args.is_exhausted()); } @@ -338,20 +392,29 @@ mod tests { ]); // First batch - mix of sequential and positional - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Positional + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same mixed pattern - assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Positional + assert_eq!( + b'd', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Last batch - same mixed pattern - assert_eq!(b'g', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds + assert_eq!( + b'g', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } @@ -370,17 +433,21 @@ mod tests { let mut args = FormatArguments::new(&args); // First batch - i64, u64, decimal - assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument)); - assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument)); - let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); + assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); + let result = args + .next_extended_big_decimal(&ArgumentLocation::NextArgument) + .unwrap(); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument)); - assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument)); - let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); + assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); + let result = args + .next_extended_big_decimal(&ArgumentLocation::NextArgument) + .unwrap(); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(args.is_exhausted()); @@ -390,22 +457,22 @@ mod tests { fn test_unparsed_arguments() { // Test with unparsed arguments that get coerced let args = [ - FormatArgument::Unparsed("hello".to_string()), - FormatArgument::Unparsed("123".to_string()), - FormatArgument::Unparsed("hello".to_string()), - FormatArgument::Unparsed("456".to_string()), + FormatArgument::Unparsed("hello".into()), + FormatArgument::Unparsed("123".into()), + FormatArgument::Unparsed("hello".into()), + FormatArgument::Unparsed("456".into()), ]; let mut args = FormatArguments::new(&args); // First batch - string, number assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -415,25 +482,25 @@ mod tests { // Test with mixed types and positional access let args = [ FormatArgument::Char('a'), - FormatArgument::String("test".to_string()), + FormatArgument::String("test".into()), FormatArgument::UnsignedInt(42), FormatArgument::Char('b'), - FormatArgument::String("more".to_string()), + FormatArgument::String("more".into()), FormatArgument::UnsignedInt(99), ]; let mut args = FormatArguments::new(&args); // First batch - positional access of different types - assert_eq!(b'a', args.next_char(&non_zero_pos(1))); + assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); assert_eq!("test", args.next_string(&non_zero_pos(2))); - assert_eq!(42, args.next_u64(&non_zero_pos(3))); + assert_eq!(42, args.next_u64(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'b', args.next_char(&non_zero_pos(1))); + assert_eq!(b'b', args.next_char(&non_zero_pos(1)).unwrap()); assert_eq!("more", args.next_string(&non_zero_pos(2))); - assert_eq!(99, args.next_u64(&non_zero_pos(3))); + assert_eq!(99, args.next_u64(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -450,14 +517,20 @@ mod tests { ]); // First batch - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch (partial) - assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds + assert_eq!( + b'd', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index ee17d96da79..a0b61db325d 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -37,8 +37,12 @@ pub mod human; pub mod num_format; mod spec; +pub use self::escape::{EscapedChar, OctalParsing}; use crate::extendedbigdecimal::ExtendedBigDecimal; -pub use argument::*; +pub use argument::{FormatArgument, FormatArguments}; + +use self::{escape::parse_escape_code, num_format::Formatter}; +use crate::{NonUtf8OsStrError, OsStrConversionType, error::UError}; pub use spec::Spec; use std::{ error::Error, @@ -50,13 +54,6 @@ use std::{ use os_display::Quotable; -use crate::error::UError; - -pub use self::{ - escape::{EscapedChar, OctalParsing, parse_escape_code}, - num_format::Formatter, -}; - #[derive(Debug)] pub enum FormatError { SpecError(Vec), @@ -74,6 +71,7 @@ pub enum FormatError { /// The hexadecimal characters represent a code point that cannot represent a /// Unicode character (e.g., a surrogate code point) InvalidCharacter(char, Vec), + InvalidEncoding(NonUtf8OsStrError), } impl Error for FormatError {} @@ -85,6 +83,12 @@ impl From for FormatError { } } +impl From for FormatError { + fn from(value: NonUtf8OsStrError) -> FormatError { + FormatError::InvalidEncoding(value) + } +} + impl Display for FormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -119,6 +123,20 @@ impl Display for FormatError { escape_char, String::from_utf8_lossy(digits) ), + Self::InvalidEncoding(no) => { + use os_display::Quotable; + + let quoted = no.input_lossy_string.quote(); + + match no.conversion_type { + OsStrConversionType::ToBytes => f.write_fmt(format_args!( + "invalid (non-UTF-8) argument like {quoted} encountered when converting argument to bytes on a platform that doesn't use UTF-8", + )), + OsStrConversionType::ToString => f.write_fmt(format_args!( + "invalid (non-UTF-8) argument like {quoted} encountered", + )), + } + } } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index d2262659012..afac0215f9c 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -5,8 +5,6 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen -use crate::quoting_style::{QuotingStyle, escape_name}; - use super::{ ExtendedBigDecimal, FormatChar, FormatError, OctalParsing, num_format::{ @@ -15,7 +13,11 @@ use super::{ }, parse_escape_only, }; -use crate::format::FormatArguments; +use crate::{ + format::FormatArguments, + os_str_as_bytes_verbose, + quoting_style::{QuotingStyle, escape_name}, +}; use std::{io::Write, num::NonZero, ops::ControlFlow}; /// A parsed specification for formatting a value @@ -355,7 +357,7 @@ impl Spec { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or_default(); write_padded( writer, - &[args.next_char(position)], + &[args.next_char(position)?], width, *align_left || neg_width, ) @@ -375,22 +377,25 @@ impl Spec { // TODO: We need to not use Rust's formatting for aligning the output, // so that we can just write bytes to stdout without panicking. let precision = resolve_asterisk_precision(*precision, args); - let s = args.next_string(position); + + let os_str = args.next_string(position); + + let bytes = os_str_as_bytes_verbose(os_str)?; + let truncated = match precision { - Some(p) if p < s.len() => &s[..p], - _ => s, + Some(p) if p < os_str.len() => &bytes[..p], + _ => bytes, }; - write_padded( - writer, - truncated.as_bytes(), - width, - *align_left || neg_width, - ) + write_padded(writer, truncated, width, *align_left || neg_width) } Self::EscapedString { position } => { - let s = args.next_string(position); - let mut parsed = Vec::new(); - for c in parse_escape_only(s.as_bytes(), OctalParsing::ThreeDigits) { + let os_str = args.next_string(position); + + let bytes = os_str_as_bytes_verbose(os_str)?; + + let mut parsed = Vec::::new(); + + for c in parse_escape_only(bytes, OctalParsing::ThreeDigits) { match c.write(&mut parsed)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => { @@ -403,19 +408,15 @@ impl Spec { } Self::QuotedString { position } => { let s = escape_name( - args.next_string(position).as_ref(), + args.next_string(position), &QuotingStyle::Shell { escape: true, always_quote: false, show_control: false, }, ); - #[cfg(unix)] - let bytes = std::os::unix::ffi::OsStringExt::into_vec(s); - #[cfg(not(unix))] - let bytes = s.to_string_lossy().as_bytes().to_owned(); - - writer.write_all(&bytes).map_err(FormatError::IoError) + let bytes = os_str_as_bytes_verbose(&s)?; + writer.write_all(bytes).map_err(FormatError::IoError) } Self::SignedInt { width, @@ -426,7 +427,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_i64(position); + let i = args.next_i64(position)?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -454,7 +455,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_u64(position); + let i = args.next_u64(position)?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -485,7 +486,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args); - let f: ExtendedBigDecimal = args.next_extended_big_decimal(position); + let f: ExtendedBigDecimal = args.next_extended_big_decimal(position)?; if precision.is_some_and(|p| p as u64 > i32::MAX as u64) { return Err(FormatError::InvalidPrecision( @@ -522,7 +523,7 @@ fn resolve_asterisk_width( match option { None => None, Some(CanAsterisk::Asterisk(loc)) => { - let nb = args.next_i64(&loc); + let nb = args.next_i64(&loc).unwrap_or(0); if nb < 0 { Some((usize::try_from(-(nb as isize)).ok().unwrap_or(0), true)) } else { @@ -541,7 +542,7 @@ fn resolve_asterisk_precision( ) -> Option { match option { None => None, - Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc) { + Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc).unwrap_or(0) { v if v >= 0 => usize::try_from(v).ok(), v if v < 0 => Some(0usize), _ => None, @@ -650,7 +651,7 @@ mod tests { Some((42, false)), resolve_asterisk_width( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("42".into())]), ) ); @@ -665,7 +666,7 @@ mod tests { Some((42, true)), resolve_asterisk_width( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".into())]), ) ); @@ -676,9 +677,9 @@ mod tests { NonZero::new(2).unwrap() ))), &mut FormatArguments::new(&[ - FormatArgument::Unparsed("1".to_string()), - FormatArgument::Unparsed("2".to_string()), - FormatArgument::Unparsed("3".to_string()) + FormatArgument::Unparsed("1".into()), + FormatArgument::Unparsed("2".into()), + FormatArgument::Unparsed("3".into()) ]), ) ); @@ -721,7 +722,7 @@ mod tests { Some(42), resolve_asterisk_precision( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("42".into())]), ) ); @@ -736,7 +737,7 @@ mod tests { Some(0), resolve_asterisk_precision( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".into())]), ) ); assert_eq!( @@ -746,9 +747,9 @@ mod tests { NonZero::new(2).unwrap() ))), &mut FormatArguments::new(&[ - FormatArgument::Unparsed("1".to_string()), - FormatArgument::Unparsed("2".to_string()), - FormatArgument::Unparsed("3".to_string()) + FormatArgument::Unparsed("1".into()), + FormatArgument::Unparsed("2".into()), + FormatArgument::Unparsed("3".into()) ]), ) ); diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index b1a9363f728..8b2f7f0c796 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -278,21 +278,32 @@ pub fn read_yes() -> bool { } } +fn os_str_as_bytes_core(os_string: &OsStr) -> Option<&[u8]> { + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let option = { + #[cfg(unix)] + { + Some(os_string.as_bytes()) + } + + #[cfg(not(unix))] + { + os_string.to_str().map(|op| op.as_bytes()) + } + }; + + option +} + /// Converts an `OsStr` to a UTF-8 `&[u8]`. /// /// This always succeeds on unix platforms, /// and fails on other platforms if the string can't be coerced to UTF-8. pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { - #[cfg(unix)] - let bytes = os_string.as_bytes(); - - #[cfg(not(unix))] - let bytes = os_string - .to_str() - .ok_or_else(|| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })? - .as_bytes(); + let bytes = os_str_as_bytes_core(os_string).ok_or_else(|| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?; Ok(bytes) } @@ -314,18 +325,76 @@ pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { bytes } +#[derive(Debug)] +enum OsStrConversionType { + ToBytes, + ToString, +} + +#[derive(Debug)] +pub struct NonUtf8OsStrError { + conversion_type: OsStrConversionType, + input_lossy_string: String, +} + +impl std::fmt::Display for NonUtf8OsStrError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use os_display::Quotable; + + let quoted = self.input_lossy_string.quote(); + + match self.conversion_type { + OsStrConversionType::ToBytes => f.write_fmt(format_args!( + "invalid (non-UTF-8) input like {quoted} encountered when converting input to bytes on a platform that doesn't use UTF-8", + )), + OsStrConversionType::ToString => f.write_fmt(format_args!( + "invalid (non-UTF-8) string like {quoted} encountered", + )), + } + } +} + +impl std::error::Error for NonUtf8OsStrError {} +impl error::UError for NonUtf8OsStrError {} + +pub fn os_str_as_bytes_verbose(input: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { + os_str_as_bytes_core(input).ok_or_else(|| NonUtf8OsStrError { + conversion_type: OsStrConversionType::ToBytes, + input_lossy_string: input.to_string_lossy().into_owned(), + }) +} + +pub fn os_str_as_str_verbose(os_str: &OsStr) -> Result<&str, NonUtf8OsStrError> { + match os_str.to_str() { + Some(st) => Ok(st), + None => Err(NonUtf8OsStrError { + conversion_type: OsStrConversionType::ToString, + input_lossy_string: os_str.to_string_lossy().into_owned(), + }), + } +} + /// Converts a `&[u8]` to an `&OsStr`, /// or parses it as UTF-8 into an [`OsString`] on non-unix platforms. /// /// This always succeeds on unix platforms, /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { - #[cfg(unix)] - let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); - #[cfg(not(unix))] - let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { - mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") - })?)); + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let os_str = { + #[cfg(unix)] + { + Cow::Borrowed(OsStr::from_bytes(bytes)) + } + + #[cfg(not(unix))] + { + Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { + mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") + })?)) + } + }; Ok(os_str) } @@ -335,12 +404,24 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { /// This always succeeds on unix platforms, /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { - #[cfg(unix)] - let s = OsString::from_vec(vec); - #[cfg(not(unix))] - let s = OsString::from(String::from_utf8(vec).map_err(|_| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })?); + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let s = { + #[cfg(unix)] + { + OsString::from_vec(vec) + } + + #[cfg(not(unix))] + { + OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new( + 1, + "invalid UTF-8 was detected in one or more arguments", + ) + })?) + } + }; Ok(s) } diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index c0e9c41b3aa..f40689030d4 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -1362,3 +1362,38 @@ fn positional_format_specifiers() { .succeeds() .stdout_only("Octal: 115, Int: 42, Float: 3.141590, String: hello, Hex: ff, Scientific: 1.000000e-05, Char: A, Unsigned: 100, Integer: 123"); } + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8_input() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%s") + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%d") + .arg(os_str) + .fails() + // spell-checker:disable + .stderr_only("printf: invalid (non-UTF-8) argument like 'Swer an rehte g�ete wendet s�n gem�ete, dem volget s�lde und �re.' encountered +"); + // spell-checker:enable +} From ee3ac832a3fa526ae45133a4e3b73c7856200bf5 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Fri, 2 May 2025 20:53:47 -0400 Subject: [PATCH 2/3] uucore, printf: improve non-UTF-8 format arguments This fixes handling of format arguments, in part by eliminating duplicate implementations. Utilities with format arguments other than printf will no longer accept things like "'a" as numbers, etc. --- src/uu/echo/src/echo.rs | 4 +- src/uu/printf/src/printf.rs | 10 +- src/uucore/src/lib/features/checksum.rs | 2 +- .../src/lib/features/extendedbigdecimal.rs | 12 ++ .../src/lib/features/format/argument.rs | 73 +++++---- src/uucore/src/lib/features/format/mod.rs | 17 +- src/uucore/src/lib/features/format/spec.rs | 12 +- .../src/lib/features/parser/num_parser.rs | 15 -- src/uucore/src/lib/lib.rs | 147 +++++------------- tests/by-util/test_printf.rs | 82 ++++++++-- 10 files changed, 172 insertions(+), 202 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index fe0843bd5e6..a7c27818dc2 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -10,7 +10,7 @@ use std::ffi::OsString; use std::io::{self, StdoutLock, Write}; use uucore::error::UResult; use uucore::format::{FormatChar, OctalParsing, parse_escape_only}; -use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes}; const ABOUT: &str = help_about!("echo.md"); const USAGE: &str = help_usage!("echo.md"); @@ -178,7 +178,7 @@ fn execute( escaped: bool, ) -> UResult<()> { for (i, input) in arguments_after_options.into_iter().enumerate() { - let bytes = os_str_as_bytes_verbose(&input)?; + let bytes = os_str_as_bytes(&input)?; // Don't print a space before the first argument if i > 0 { diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index a7f73342818..b564fbae905 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -7,12 +7,8 @@ use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; -use uucore::format::{ - FormatArgument, FormatArguments, FormatError, FormatItem, parse_spec_and_escape, -}; -use uucore::{ - format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose, show_warning, -}; +use uucore::format::{FormatArgument, FormatArguments, FormatItem, parse_spec_and_escape}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes, show_warning}; const VERSION: &str = "version"; const HELP: &str = "help"; @@ -32,7 +28,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let format = matches .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; - let format = os_str_as_bytes_verbose(format).map_err(FormatError::from)?; + let format = os_str_as_bytes(format)?; let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { Some(s) => s diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index f7b36a9b8a8..a93e811cd0d 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -956,7 +956,7 @@ fn process_checksum_line( cached_line_format: &mut Option, last_algo: &mut Option, ) -> Result<(), LineCheckError> { - let line_bytes = os_str_as_bytes(line)?; + let line_bytes = os_str_as_bytes(line).map_err(|e| LineCheckError::UError(Box::new(e)))?; // Early return on empty or commented lines. if line.is_empty() || line_bytes.starts_with(b"#") { diff --git a/src/uucore/src/lib/features/extendedbigdecimal.rs b/src/uucore/src/lib/features/extendedbigdecimal.rs index 396b6f35941..5748b6f1ab9 100644 --- a/src/uucore/src/lib/features/extendedbigdecimal.rs +++ b/src/uucore/src/lib/features/extendedbigdecimal.rs @@ -101,6 +101,18 @@ impl From for ExtendedBigDecimal { } } +impl From for ExtendedBigDecimal { + fn from(val: u8) -> Self { + Self::BigDecimal(val.into()) + } +} + +impl From for ExtendedBigDecimal { + fn from(val: u32) -> Self { + Self::BigDecimal(val.into()) + } +} + impl ExtendedBigDecimal { pub fn zero() -> Self { Self::BigDecimal(0.into()) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index c26cba2a5ed..cb813577280 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -7,7 +7,7 @@ use super::{ExtendedBigDecimal, FormatError}; use crate::format::spec::ArgumentLocation; use crate::{ error::set_exit_code, - os_str_as_bytes_verbose, os_str_as_str_verbose, + os_str_as_bytes, parser::num_parser::{ExtendedParser, ExtendedParserError}, quoting_style::{Quotes, QuotingStyle, escape_name}, show_error, show_warning, @@ -76,7 +76,7 @@ impl<'a> FormatArguments<'a> { pub fn next_char(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { Some(FormatArgument::Char(c)) => Ok(*c as u8), - Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes_verbose(os)?.first() { + Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes(os)?.first() { Some(&byte) => Ok(byte), None => Ok(b'\0'), }, @@ -94,11 +94,7 @@ impl<'a> FormatArguments<'a> { pub fn next_i64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { Some(FormatArgument::SignedInt(n)) => Ok(*n), - Some(FormatArgument::Unparsed(os)) => { - let str = os_str_as_str_verbose(os)?; - - Ok(extract_value(i64::extended_parse(str), str)) - } + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), _ => Ok(0), } } @@ -106,27 +102,7 @@ impl<'a> FormatArguments<'a> { pub fn next_u64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { Some(FormatArgument::UnsignedInt(n)) => Ok(*n), - Some(FormatArgument::Unparsed(os)) => { - let s = os_str_as_str_verbose(os)?; - // Check if the string is a character literal enclosed in quotes - if s.starts_with(['"', '\'']) { - // Extract the content between the quotes safely using chars - let mut chars = s.trim_matches(|c| c == '"' || c == '\'').chars(); - if let Some(first_char) = chars.next() { - if chars.clone().count() > 0 { - // Emit a warning if there are additional characters - let remaining: String = chars.collect(); - show_warning!( - "{}: character(s) following character constant have been ignored", - remaining - ); - } - return Ok(first_char as u64); // Use only the first character - } - return Ok(0); // Empty quotes - } - Ok(extract_value(u64::extended_parse(s), s)) - } + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), _ => Ok(0), } } @@ -137,13 +113,46 @@ impl<'a> FormatArguments<'a> { ) -> Result { match self.next_arg(position) { Some(FormatArgument::Float(n)) => Ok(n.clone()), - Some(FormatArgument::Unparsed(os)) => { - let s = os_str_as_str_verbose(os)?; + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(ExtendedBigDecimal::zero()), + } + } - Ok(extract_value(ExtendedBigDecimal::extended_parse(s), s)) + fn get_num(os: &OsStr) -> Result + where + T: ExtendedParser + From + From + Default, + { + let s = os_str_as_bytes(os)?; + // Check if the string begins with a quote, and is therefore a literal + if let Some((&first, bytes)) = s.split_first() { + if (first == b'"' || first == b'\'') && !bytes.is_empty() { + let (val, len) = if let Some(c) = bytes + .utf8_chunks() + .next() + .expect("bytes should not be empty") + .valid() + .chars() + .next() + { + // Valid UTF-8 character, cast the codepoint to u32 then T + // (largest unicode codepoint is only 3 bytes, so this is safe) + ((c as u32).into(), c.len_utf8()) + } else { + // Not a valid UTF-8 character, use the first byte + (bytes[0].into(), 1) + }; + // Emit a warning if there are additional characters + if bytes.len() > len { + show_warning!( + "{}: character(s) following character constant have been ignored", + String::from_utf8_lossy(&bytes[len..]) + ); + } + return Ok(val); } - _ => Ok(ExtendedBigDecimal::zero()), } + let s = os.to_string_lossy(); + Ok(extract_value(T::extended_parse(&s), &s)) } fn get_at_relative_position(&mut self, pos: NonZero) -> Option<&'a FormatArgument> { diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index a0b61db325d..dc8a61dda22 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -42,7 +42,7 @@ use crate::extendedbigdecimal::ExtendedBigDecimal; pub use argument::{FormatArgument, FormatArguments}; use self::{escape::parse_escape_code, num_format::Formatter}; -use crate::{NonUtf8OsStrError, OsStrConversionType, error::UError}; +use crate::{NonUtf8OsStrError, error::UError}; pub use spec::Spec; use std::{ error::Error, @@ -123,20 +123,7 @@ impl Display for FormatError { escape_char, String::from_utf8_lossy(digits) ), - Self::InvalidEncoding(no) => { - use os_display::Quotable; - - let quoted = no.input_lossy_string.quote(); - - match no.conversion_type { - OsStrConversionType::ToBytes => f.write_fmt(format_args!( - "invalid (non-UTF-8) argument like {quoted} encountered when converting argument to bytes on a platform that doesn't use UTF-8", - )), - OsStrConversionType::ToString => f.write_fmt(format_args!( - "invalid (non-UTF-8) argument like {quoted} encountered", - )), - } - } + Self::InvalidEncoding(no) => no.fmt(f), } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index afac0215f9c..06cf4498553 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -15,7 +15,7 @@ use super::{ }; use crate::{ format::FormatArguments, - os_str_as_bytes_verbose, + os_str_as_bytes, quoting_style::{QuotingStyle, escape_name}, }; use std::{io::Write, num::NonZero, ops::ControlFlow}; @@ -377,10 +377,8 @@ impl Spec { // TODO: We need to not use Rust's formatting for aligning the output, // so that we can just write bytes to stdout without panicking. let precision = resolve_asterisk_precision(*precision, args); - let os_str = args.next_string(position); - - let bytes = os_str_as_bytes_verbose(os_str)?; + let bytes = os_str_as_bytes(os_str)?; let truncated = match precision { Some(p) if p < os_str.len() => &bytes[..p], @@ -390,9 +388,7 @@ impl Spec { } Self::EscapedString { position } => { let os_str = args.next_string(position); - - let bytes = os_str_as_bytes_verbose(os_str)?; - + let bytes = os_str_as_bytes(os_str)?; let mut parsed = Vec::::new(); for c in parse_escape_only(bytes, OctalParsing::ThreeDigits) { @@ -415,7 +411,7 @@ impl Spec { show_control: false, }, ); - let bytes = os_str_as_bytes_verbose(&s)?; + let bytes = os_str_as_bytes(&s)?; writer.write_all(bytes).map_err(FormatError::IoError) } Self::SignedInt { diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs index 3ee07e41357..655c17eece4 100644 --- a/src/uucore/src/lib/features/parser/num_parser.rs +++ b/src/uucore/src/lib/features/parser/num_parser.rs @@ -481,21 +481,6 @@ pub(crate) fn parse<'a>( target: ParseTarget, allowed_suffixes: &[(char, u32)], ) -> Result> { - // Parse the " and ' prefixes separately - if target != ParseTarget::Duration { - if let Some(rest) = input.strip_prefix(['\'', '"']) { - let mut chars = rest.char_indices().fuse(); - let v = chars - .next() - .map(|(_, c)| ExtendedBigDecimal::BigDecimal(u32::from(c).into())); - return match (v, chars.next()) { - (Some(v), None) => Ok(v), - (Some(v), Some((i, _))) => Err(ExtendedParserError::PartialMatch(v, &rest[i..])), - (None, _) => Err(ExtendedParserError::NotNumeric), - }; - } - } - let trimmed_input = input.trim_ascii_start(); // Initial minus/plus sign diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 8b2f7f0c796..14254cb5fcb 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -278,34 +278,39 @@ pub fn read_yes() -> bool { } } -fn os_str_as_bytes_core(os_string: &OsStr) -> Option<&[u8]> { - // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually - // exclusive - let option = { - #[cfg(unix)] - { - Some(os_string.as_bytes()) - } - - #[cfg(not(unix))] - { - os_string.to_str().map(|op| op.as_bytes()) - } - }; +#[derive(Debug)] +pub struct NonUtf8OsStrError { + input_lossy_string: String, +} - option +impl std::fmt::Display for NonUtf8OsStrError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use os_display::Quotable; + let quoted = self.input_lossy_string.quote(); + f.write_fmt(format_args!( + "invalid UTF-8 input {quoted} encountered when converting to bytes on a platform that doesn't expose byte arguments", + )) + } } +impl std::error::Error for NonUtf8OsStrError {} +impl error::UError for NonUtf8OsStrError {} + /// Converts an `OsStr` to a UTF-8 `&[u8]`. /// /// This always succeeds on unix platforms, /// and fails on other platforms if the string can't be coerced to UTF-8. -pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { - let bytes = os_str_as_bytes_core(os_string).ok_or_else(|| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })?; +pub fn os_str_as_bytes(os_string: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { + #[cfg(unix)] + return Ok(os_string.as_bytes()); - Ok(bytes) + #[cfg(not(unix))] + os_string + .to_str() + .ok_or_else(|| NonUtf8OsStrError { + input_lossy_string: os_string.to_string_lossy().into_owned(), + }) + .map(|s| s.as_bytes()) } /// Performs a potentially lossy conversion from `OsStr` to UTF-8 bytes. @@ -314,63 +319,12 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { /// and wraps [`OsStr::to_string_lossy`] on non-unix platforms. pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { #[cfg(unix)] - let bytes = Cow::from(os_string.as_bytes()); + return Cow::from(os_string.as_bytes()); #[cfg(not(unix))] - let bytes = match os_string.to_string_lossy() { + match os_string.to_string_lossy() { Cow::Borrowed(slice) => Cow::from(slice.as_bytes()), Cow::Owned(owned) => Cow::from(owned.into_bytes()), - }; - - bytes -} - -#[derive(Debug)] -enum OsStrConversionType { - ToBytes, - ToString, -} - -#[derive(Debug)] -pub struct NonUtf8OsStrError { - conversion_type: OsStrConversionType, - input_lossy_string: String, -} - -impl std::fmt::Display for NonUtf8OsStrError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use os_display::Quotable; - - let quoted = self.input_lossy_string.quote(); - - match self.conversion_type { - OsStrConversionType::ToBytes => f.write_fmt(format_args!( - "invalid (non-UTF-8) input like {quoted} encountered when converting input to bytes on a platform that doesn't use UTF-8", - )), - OsStrConversionType::ToString => f.write_fmt(format_args!( - "invalid (non-UTF-8) string like {quoted} encountered", - )), - } - } -} - -impl std::error::Error for NonUtf8OsStrError {} -impl error::UError for NonUtf8OsStrError {} - -pub fn os_str_as_bytes_verbose(input: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { - os_str_as_bytes_core(input).ok_or_else(|| NonUtf8OsStrError { - conversion_type: OsStrConversionType::ToBytes, - input_lossy_string: input.to_string_lossy().into_owned(), - }) -} - -pub fn os_str_as_str_verbose(os_str: &OsStr) -> Result<&str, NonUtf8OsStrError> { - match os_str.to_str() { - Some(st) => Ok(st), - None => Err(NonUtf8OsStrError { - conversion_type: OsStrConversionType::ToString, - input_lossy_string: os_str.to_string_lossy().into_owned(), - }), } } @@ -380,23 +334,13 @@ pub fn os_str_as_str_verbose(os_str: &OsStr) -> Result<&str, NonUtf8OsStrError> /// This always succeeds on unix platforms, /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { - // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually - // exclusive - let os_str = { - #[cfg(unix)] - { - Cow::Borrowed(OsStr::from_bytes(bytes)) - } - - #[cfg(not(unix))] - { - Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { - mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") - })?)) - } - }; + #[cfg(unix)] + return Ok(Cow::Borrowed(OsStr::from_bytes(bytes))); - Ok(os_str) + #[cfg(not(unix))] + Ok(Cow::Owned(OsString::from(str::from_utf8(bytes).map_err( + |_| mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr"), + )?))) } /// Converts a `Vec` into an `OsString`, parsing as UTF-8 on non-unix platforms. @@ -404,26 +348,13 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { /// This always succeeds on unix platforms, /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { - // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually - // exclusive - let s = { - #[cfg(unix)] - { - OsString::from_vec(vec) - } - - #[cfg(not(unix))] - { - OsString::from(String::from_utf8(vec).map_err(|_| { - mods::error::UUsageError::new( - 1, - "invalid UTF-8 was detected in one or more arguments", - ) - })?) - } - }; + #[cfg(unix)] + return Ok(OsString::from_vec(vec)); - Ok(s) + #[cfg(not(unix))] + Ok(OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?)) } /// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec`, diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index f40689030d4..959d3501cd1 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -807,7 +807,7 @@ fn test_overflow() { fn partial_char() { new_ucmd!() .args(&["%d", "'abc"]) - .fails_with_code(1) + .succeeds() .stdout_is("97") .stderr_is( "printf: warning: bc: character(s) following character constant have been ignored\n", @@ -1291,23 +1291,80 @@ fn float_arg_with_whitespace() { #[test] fn mb_input() { - for format in ["\"á", "\'á", "'\u{e1}"] { + let cases = vec![ + ("%04x\n", "\"á", "00e1\n"), + ("%04x\n", "'á", "00e1\n"), + ("%04x\n", "'\u{e1}", "00e1\n"), + ("%i\n", "\"á", "225\n"), + ("%i\n", "'á", "225\n"), + ("%i\n", "'\u{e1}", "225\n"), + ("%f\n", "'á", "225.000000\n"), + ]; + for (format, arg, stdout) in cases { new_ucmd!() - .args(&["%04x\n", format]) + .args(&[format, arg]) .succeeds() - .stdout_only("00e1\n"); + .stdout_only(stdout); } let cases = vec![ - ("\"á=", "="), - ("\'á-", "-"), - ("\'á=-==", "=-=="), - ("'\u{e1}++", "++"), + ("%04x\n", "\"á=", "00e1\n", "="), + ("%04x\n", "'á-", "00e1\n", "-"), + ("%04x\n", "'á=-==", "00e1\n", "=-=="), + ("%04x\n", "'á'", "00e1\n", "'"), + ("%04x\n", "'\u{e1}++", "00e1\n", "++"), + ("%04x\n", "''á'", "0027\n", "á'"), + ("%i\n", "\"á=", "225\n", "="), ]; + for (format, arg, stdout, stderr) in cases { + new_ucmd!() + .args(&[format, arg]) + .succeeds() + .stdout_is(stdout) + .stderr_is(format!("printf: warning: {stderr}: character(s) following character constant have been ignored\n")); + } - for (format, expected) in cases { + for arg in ["\"", "'"] { new_ucmd!() - .args(&["%04x\n", format]) + .args(&["%04x\n", arg]) + .fails() + .stderr_contains("expected a numeric value"); + } +} + +#[test] +#[cfg(target_family = "unix")] +fn mb_invalid_unicode() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let cases = vec![ + ("%04x\n", b"\"\xe1", "00e1\n"), + ("%04x\n", b"'\xe1", "00e1\n"), + ("%i\n", b"\"\xe1", "225\n"), + ("%i\n", b"'\xe1", "225\n"), + ("%f\n", b"'\xe1", "225.000000\n"), + ]; + for (format, arg, stdout) in cases { + new_ucmd!() + .arg(format) + .arg(OsStr::from_bytes(arg)) + .succeeds() + .stdout_only(stdout); + } + + let cases = vec![ + (b"\"\xe1=".as_slice(), "="), + (b"'\xe1-".as_slice(), "-"), + (b"'\xe1=-==".as_slice(), "=-=="), + (b"'\xe1'".as_slice(), "'"), + // unclear if original or replacement character is better in stderr + //(b"''\xe1'".as_slice(), "'�'"), + ]; + for (arg, expected) in cases { + new_ucmd!() + .arg("%04x\n") + .arg(OsStr::from_bytes(arg)) .succeeds() .stdout_is("00e1\n") .stderr_is(format!("printf: warning: {expected}: character(s) following character constant have been ignored\n")); @@ -1392,8 +1449,5 @@ fn non_utf_8_input() { .arg("%d") .arg(os_str) .fails() - // spell-checker:disable - .stderr_only("printf: invalid (non-UTF-8) argument like 'Swer an rehte g�ete wendet s�n gem�ete, dem volget s�lde und �re.' encountered -"); - // spell-checker:enable + .stderr_contains("expected a numeric value"); } From f9ca1e310f9b68a7a5bed5bcee272f262b3c6644 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Wed, 28 May 2025 19:39:19 -0400 Subject: [PATCH 3/3] printf: remove passing tests from why-error.md --- util/why-error.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/util/why-error.md b/util/why-error.md index 978545b26fa..095caef4965 100644 --- a/util/why-error.md +++ b/util/why-error.md @@ -39,11 +39,7 @@ This file documents why some tests are failing: * gnu/tests/mv/part-hardlink.sh * gnu/tests/od/od-N.sh * gnu/tests/od/od-float.sh -* gnu/tests/printf/printf-cov.pl -* gnu/tests/printf/printf-indexed.sh -* gnu/tests/printf/printf-mb.sh * gnu/tests/printf/printf-quote.sh -* gnu/tests/printf/printf.sh * gnu/tests/ptx/ptx-overrun.sh * gnu/tests/ptx/ptx.pl * gnu/tests/rm/empty-inacc.sh - https://github.com/uutils/coreutils/issues/7033