Skip to content

Commit 9eddbca

Browse files
andrewliebenow authored and jtracey committed
printf: accept non-UTF-8 input in FORMAT and ARGUMENT arguments
Other implementations of `printf` permit arbitrary data to be passed to `printf`. The only restriction is that a null byte terminates FORMAT and ARGUMENT argument strings (since they are C strings). The current implementation only accepts FORMAT and ARGUMENT arguments that are valid UTF-8 (this is being enforced by clap). This commit removes the UTF-8 validation by switching to OsStr and OsString. This allows users to use `printf` to transmit or reformat null-safe but not UTF-8-safe data, such as text encoded in an 8-bit text encoding. See the `non_utf_8_input` test for an example (ISO-8859-1 text).
1 parent 279629f commit 9eddbca

File tree

5 files changed

+160
-91
lines changed

5 files changed

+160
-91
lines changed

src/uu/echo/src/echo.rs

Lines changed: 3 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,9 @@
66
use clap::builder::ValueParser;
77
use clap::{Arg, ArgAction, ArgMatches, Command};
88
use std::env;
9-
use std::ffi::{OsStr, OsString};
9+
use std::ffi::OsString;
1010
use std::io::{self, StdoutLock, Write};
11-
use uucore::error::{UResult, USimpleError};
11+
use uucore::error::UResult;
1212
use uucore::format::{EscapedChar, FormatChar, OctalParsing, parse_escape_only};
1313
use uucore::{format_usage, help_about, help_section, help_usage};
1414

@@ -137,12 +137,7 @@ fn execute(
137137
escaped: bool,
138138
) -> UResult<()> {
139139
for (i, input) in arguments_after_options.into_iter().enumerate() {
140-
let Some(bytes) = bytes_from_os_string(input.as_os_str()) else {
141-
return Err(USimpleError::new(
142-
1,
143-
"Non-UTF-8 arguments provided, but this platform does not support them",
144-
));
145-
};
140+
let bytes = uucore::format::bytes_from_os_str(&input)?;
146141

147142
if i > 0 {
148143
stdout_lock.write_all(b" ")?;
@@ -166,19 +161,3 @@ fn execute(
166161

167162
Ok(())
168163
}
169-
170-
fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> {
171-
#[cfg(target_family = "unix")]
172-
{
173-
use std::os::unix::ffi::OsStrExt;
174-
175-
Some(input.as_bytes())
176-
}
177-
178-
#[cfg(not(target_family = "unix"))]
179-
{
180-
// TODO
181-
// Verify that this works correctly on these platforms
182-
input.to_str().map(|st| st.as_bytes())
183-
}
184-
}

src/uu/printf/src/printf.rs

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,14 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55
use clap::{Arg, ArgAction, Command};
6+
use std::ffi::OsString;
67
use std::io::stdout;
78
use std::ops::ControlFlow;
89
use uucore::error::{UResult, UUsageError};
9-
use uucore::format::{FormatArgument, FormatArguments, FormatItem, parse_spec_and_escape};
10-
use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes, show_warning};
10+
use uucore::format::{
11+
FormatArgument, FormatArguments, FormatItem, bytes_from_os_str, parse_spec_and_escape,
12+
};
13+
use uucore::{format_usage, help_about, help_section, help_usage, show_warning};
1114

1215
const VERSION: &str = "version";
1316
const HELP: &str = "help";
@@ -19,21 +22,19 @@ mod options {
1922
pub const FORMAT: &str = "FORMAT";
2023
pub const ARGUMENT: &str = "ARGUMENT";
2124
}
25+
2226
#[uucore::main]
2327
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
2428
let matches = uu_app().get_matches_from(args);
2529

2630
let format = matches
27-
.get_one::<std::ffi::OsString>(options::FORMAT)
31+
.get_one::<OsString>(options::FORMAT)
2832
.ok_or_else(|| UUsageError::new(1, "missing operand"))?;
29-
let format = os_str_as_bytes(format)?;
33+
let format = bytes_from_os_str(format)?;
3034

31-
let values: Vec<_> = match matches.get_many::<std::ffi::OsString>(options::ARGUMENT) {
32-
// FIXME: use os_str_as_bytes once FormatArgument supports Vec<u8>
35+
let values: Vec<_> = match matches.get_many::<OsString>(options::ARGUMENT) {
3336
Some(s) => s
34-
.map(|os_string| {
35-
FormatArgument::Unparsed(std::ffi::OsStr::to_string_lossy(os_string).to_string())
36-
})
37+
.map(|os_string| FormatArgument::Unparsed(os_string.to_owned()))
3738
.collect(),
3839
None => vec![],
3940
};
@@ -59,7 +60,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
5960
let Some(FormatArgument::Unparsed(arg_str)) = args.peek_arg() else {
6061
unreachable!("All args are transformed to Unparsed")
6162
};
62-
show_warning!("ignoring excess arguments, starting with '{arg_str}'");
63+
show_warning!(
64+
"ignoring excess arguments, starting with '{}'",
65+
arg_str.to_string_lossy()
66+
);
6367
}
6468
return Ok(());
6569
}
@@ -98,10 +102,10 @@ pub fn uu_app() -> Command {
98102
.help("Print version information")
99103
.action(ArgAction::Version),
100104
)
101-
.arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(std::ffi::OsString)))
105+
.arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(OsString)))
102106
.arg(
103107
Arg::new(options::ARGUMENT)
104108
.action(ArgAction::Append)
105-
.value_parser(clap::value_parser!(std::ffi::OsString)),
109+
.value_parser(clap::value_parser!(OsString)),
106110
)
107111
}

src/uucore/src/lib/features/format/argument.rs

Lines changed: 82 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,16 @@
66
use super::ExtendedBigDecimal;
77
use crate::format::spec::ArgumentLocation;
88
use crate::{
9-
error::set_exit_code,
9+
error::{UResult, USimpleError, set_exit_code},
1010
parser::num_parser::{ExtendedParser, ExtendedParserError},
1111
quoting_style::{Quotes, QuotingStyle, escape_name},
12-
show_error, show_warning,
12+
show, show_error, show_warning,
1313
};
1414
use os_display::Quotable;
15-
use std::{ffi::OsStr, num::NonZero};
15+
use std::{
16+
ffi::{OsStr, OsString},
17+
num::NonZero,
18+
};
1619

1720
/// An argument for formatting
1821
///
@@ -24,12 +27,12 @@ use std::{ffi::OsStr, num::NonZero};
2427
#[derive(Clone, Debug, PartialEq)]
2528
pub enum FormatArgument {
2629
Char(char),
27-
String(String),
30+
String(OsString),
2831
UnsignedInt(u64),
2932
SignedInt(i64),
3033
Float(ExtendedBigDecimal),
3134
/// Special argument that gets coerced into the other variants
32-
Unparsed(String),
35+
Unparsed(OsString),
3336
}
3437

3538
/// A struct that holds a slice of format arguments and provides methods to access them
@@ -72,30 +75,38 @@ impl<'a> FormatArguments<'a> {
7275
pub fn next_char(&mut self, position: &ArgumentLocation) -> u8 {
7376
match self.next_arg(position) {
7477
Some(FormatArgument::Char(c)) => *c as u8,
75-
Some(FormatArgument::Unparsed(s)) => s.bytes().next().unwrap_or(b'\0'),
78+
Some(FormatArgument::Unparsed(os)) => match bytes_from_os_str(os).unwrap().first() {
79+
Some(&byte) => byte,
80+
None => b'\0',
81+
},
7682
_ => b'\0',
7783
}
7884
}
7985

80-
pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a str {
86+
pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a OsStr {
8187
match self.next_arg(position) {
82-
Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s,
83-
_ => "",
88+
Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os,
89+
_ => "".as_ref(),
8490
}
8591
}
8692

8793
pub fn next_i64(&mut self, position: &ArgumentLocation) -> i64 {
8894
match self.next_arg(position) {
8995
Some(FormatArgument::SignedInt(n)) => *n,
90-
Some(FormatArgument::Unparsed(s)) => extract_value(i64::extended_parse(s), s),
96+
Some(FormatArgument::Unparsed(os)) => {
97+
let str = get_str_or_exit_with_error(os);
98+
99+
extract_value(i64::extended_parse(str), str)
100+
}
91101
_ => 0,
92102
}
93103
}
94104

95105
pub fn next_u64(&mut self, position: &ArgumentLocation) -> u64 {
96106
match self.next_arg(position) {
97107
Some(FormatArgument::UnsignedInt(n)) => *n,
98-
Some(FormatArgument::Unparsed(s)) => {
108+
Some(FormatArgument::Unparsed(os)) => {
109+
let s = get_str_or_exit_with_error(os);
99110
// Check if the string is a character literal enclosed in quotes
100111
if s.starts_with(['"', '\'']) {
101112
// Extract the content between the quotes safely using chars
@@ -122,7 +133,9 @@ impl<'a> FormatArguments<'a> {
122133
pub fn next_extended_big_decimal(&mut self, position: &ArgumentLocation) -> ExtendedBigDecimal {
123134
match self.next_arg(position) {
124135
Some(FormatArgument::Float(n)) => n.clone(),
125-
Some(FormatArgument::Unparsed(s)) => {
136+
Some(FormatArgument::Unparsed(os)) => {
137+
let s = get_str_or_exit_with_error(os);
138+
126139
extract_value(ExtendedBigDecimal::extended_parse(s), s)
127140
}
128141
_ => ExtendedBigDecimal::zero(),
@@ -188,6 +201,53 @@ fn extract_value<T: Default>(p: Result<T, ExtendedParserError<'_, T>>, input: &s
188201
}
189202
}
190203

204+
pub fn bytes_from_os_str(input: &OsStr) -> UResult<&[u8]> {
205+
let result = {
206+
#[cfg(target_family = "unix")]
207+
{
208+
use std::os::unix::ffi::OsStrExt;
209+
210+
Ok(input.as_bytes())
211+
}
212+
213+
#[cfg(not(target_family = "unix"))]
214+
{
215+
use crate::error::USimpleError;
216+
217+
// TODO
218+
// Verify that this works correctly on these platforms
219+
match input.to_str().map(|st| st.as_bytes()) {
220+
Some(sl) => Ok(sl),
221+
None => Err(USimpleError::new(
222+
1,
223+
"non-UTF-8 string encountered when not allowed",
224+
)),
225+
}
226+
}
227+
};
228+
229+
result
230+
}
231+
232+
fn get_str_or_exit_with_error(os_str: &OsStr) -> &str {
233+
match os_str.to_str() {
234+
Some(st) => st,
235+
None => {
236+
let cow = os_str.to_string_lossy();
237+
238+
let quoted = cow.quote();
239+
240+
let error = format!(
241+
"argument like {quoted} is not a valid UTF-8 string, and could not be parsed as an integer",
242+
);
243+
244+
show!(USimpleError::new(1, error.clone()));
245+
246+
panic!("{error}");
247+
}
248+
}
249+
}
250+
191251
#[cfg(test)]
192252
mod tests {
193253
use super::*;
@@ -255,11 +315,11 @@ mod tests {
255315
// Test with different method types in sequence
256316
let args = [
257317
FormatArgument::Char('a'),
258-
FormatArgument::String("hello".to_string()),
259-
FormatArgument::Unparsed("123".to_string()),
260-
FormatArgument::String("world".to_string()),
318+
FormatArgument::String("hello".into()),
319+
FormatArgument::Unparsed("123".into()),
320+
FormatArgument::String("world".into()),
261321
FormatArgument::Char('z'),
262-
FormatArgument::String("test".to_string()),
322+
FormatArgument::String("test".into()),
263323
];
264324
let mut args = FormatArguments::new(&args);
265325

@@ -390,10 +450,10 @@ mod tests {
390450
fn test_unparsed_arguments() {
391451
// Test with unparsed arguments that get coerced
392452
let args = [
393-
FormatArgument::Unparsed("hello".to_string()),
394-
FormatArgument::Unparsed("123".to_string()),
395-
FormatArgument::Unparsed("hello".to_string()),
396-
FormatArgument::Unparsed("456".to_string()),
453+
FormatArgument::Unparsed("hello".into()),
454+
FormatArgument::Unparsed("123".into()),
455+
FormatArgument::Unparsed("hello".into()),
456+
FormatArgument::Unparsed("456".into()),
397457
];
398458
let mut args = FormatArguments::new(&args);
399459

@@ -415,10 +475,10 @@ mod tests {
415475
// Test with mixed types and positional access
416476
let args = [
417477
FormatArgument::Char('a'),
418-
FormatArgument::String("test".to_string()),
478+
FormatArgument::String("test".into()),
419479
FormatArgument::UnsignedInt(42),
420480
FormatArgument::Char('b'),
421-
FormatArgument::String("more".to_string()),
481+
FormatArgument::String("more".into()),
422482
FormatArgument::UnsignedInt(99),
423483
];
424484
let mut args = FormatArguments::new(&args);

0 commit comments

Comments
 (0)