Skip to content

Commit cc8f562

Browse files
printf: accept non-UTF-8 input in FORMAT and ARGUMENT arguments

Other implementations of `printf` permit arbitrary data to be passed to `printf`. The only restriction is that a null byte terminates FORMAT and ARGUMENT argument strings (since they are C strings).

The current implementation only accepts FORMAT and ARGUMENT arguments that are valid UTF-8 (this is being enforced by clap). This commit removes the UTF-8 validation by switching to OsStr and OsString.

This allows users to use `printf` to transmit or reformat null-safe but not UTF-8-safe data, such as text encoded in an 8-bit text encoding. See the `non_utf_8_input` test for an example (ISO-8859-1 text).
1 parent 0ef5b75 commit cc8f562

File tree

6 files changed

+146
-62
lines changed

6 files changed

+146
-62
lines changed

src/uu/echo/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ path = "src/echo.rs"
1818

1919
[dependencies]
2020
clap = { workspace = true }
21-
uucore = { workspace = true }
21+
uucore = { workspace = true, features = ["format"] }
2222

2323
[[bin]]
2424
name = "echo"

src/uu/echo/src/echo.rs

Lines changed: 3 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -6,12 +6,12 @@
66
use clap::builder::ValueParser;
77
use clap::parser::ValuesRef;
88
use clap::{crate_version, Arg, ArgAction, Command};
9-
use std::ffi::{OsStr, OsString};
9+
use std::ffi::OsString;
1010
use std::io::{self, StdoutLock, Write};
1111
use std::iter::Peekable;
1212
use std::ops::ControlFlow;
1313
use std::slice::Iter;
14-
use uucore::error::{UResult, USimpleError};
14+
use uucore::error::UResult;
1515
use uucore::{format_usage, help_about, help_section, help_usage};
1616

1717
const ABOUT: &str = help_about!("echo.md");
@@ -355,12 +355,7 @@ fn execute(
355355
arguments_after_options: ValuesRef<'_, OsString>,
356356
) -> UResult<()> {
357357
for (i, input) in arguments_after_options.enumerate() {
358-
let Some(bytes) = bytes_from_os_string(input.as_os_str()) else {
359-
return Err(USimpleError::new(
360-
1,
361-
"Non-UTF-8 arguments provided, but this platform does not support them",
362-
));
363-
};
358+
let bytes = uucore::format::bytes_from_os_str(input)?;
364359

365360
if i > 0 {
366361
stdout_lock.write_all(b" ")?;
@@ -381,23 +376,3 @@ fn execute(
381376

382377
Ok(())
383378
}
384-
385-
fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> {
386-
let option = {
387-
#[cfg(target_family = "unix")]
388-
{
389-
use std::os::unix::ffi::OsStrExt;
390-
391-
Some(input.as_bytes())
392-
}
393-
394-
#[cfg(not(target_family = "unix"))]
395-
{
396-
// TODO
397-
// Verify that this works correctly on these platforms
398-
input.to_str().map(|st| st.as_bytes())
399-
}
400-
};
401-
402-
option
403-
}

src/uu/printf/src/printf.rs

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,13 @@
55

66
#![allow(dead_code)]
77

8+
use clap::builder::ValueParser;
89
use clap::{crate_version, Arg, ArgAction, Command};
10+
use std::ffi::OsString;
911
use std::io::stdout;
1012
use std::ops::ControlFlow;
1113
use uucore::error::{UResult, UUsageError};
12-
use uucore::format::{parse_spec_and_escape, FormatArgument, FormatItem};
14+
use uucore::format::{bytes_from_os_str, parse_spec_and_escape, FormatArgument, FormatItem};
1315
use uucore::{format_usage, help_about, help_section, help_usage};
1416

1517
const VERSION: &str = "version";
@@ -28,17 +30,22 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
2830
let matches = uu_app().get_matches_from(args);
2931

3032
let format = matches
31-
.get_one::<String>(options::FORMAT)
33+
.get_one::<OsString>(options::FORMAT)
3234
.ok_or_else(|| UUsageError::new(1, "missing operand"))?;
3335

34-
let values: Vec<_> = match matches.get_many::<String>(options::ARGUMENT) {
35-
Some(s) => s.map(|s| FormatArgument::Unparsed(s.to_string())).collect(),
36-
None => vec![],
36+
let format_bytes = bytes_from_os_str(format)?;
37+
38+
let values = match matches.get_many::<OsString>(options::ARGUMENT) {
39+
Some(os_string) => os_string
40+
.map(|os_string_ref| FormatArgument::Unparsed(os_string_ref.to_owned()))
41+
.collect(),
42+
None => Vec::<FormatArgument>::new(),
3743
};
3844

3945
let mut format_seen = false;
4046
let mut args = values.iter().peekable();
41-
for item in parse_spec_and_escape(format.as_ref()) {
47+
48+
for item in parse_spec_and_escape(format_bytes) {
4249
if let Ok(FormatItem::Spec(_)) = item {
4350
format_seen = true;
4451
}
@@ -55,7 +62,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
5562
}
5663

5764
while args.peek().is_some() {
58-
for item in parse_spec_and_escape(format.as_ref()) {
65+
for item in parse_spec_and_escape(format_bytes) {
5966
match item?.write(stdout(), &mut args)? {
6067
ControlFlow::Continue(()) => {}
6168
ControlFlow::Break(()) => return Ok(()),
@@ -86,6 +93,10 @@ pub fn uu_app() -> Command {
8693
.help("Print version information")
8794
.action(ArgAction::Version),
8895
)
89-
.arg(Arg::new(options::FORMAT))
90-
.arg(Arg::new(options::ARGUMENT).action(ArgAction::Append))
96+
.arg(Arg::new(options::FORMAT).value_parser(ValueParser::os_string()))
97+
.arg(
98+
Arg::new(options::ARGUMENT)
99+
.action(ArgAction::Append)
100+
.value_parser(ValueParser::os_string()),
101+
)
91102
}

src/uucore/src/lib/features/format/argument.rs

Lines changed: 75 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,13 @@
44
// file that was distributed with this source code.
55

66
use crate::{
7-
error::set_exit_code,
7+
error::{set_exit_code, UResult, USimpleError},
88
features::format::num_parser::{ParseError, ParsedNumber},
99
quoting_style::{escape_name, Quotes, QuotingStyle},
10-
show_error, show_warning,
10+
show, show_error, show_warning,
1111
};
1212
use os_display::Quotable;
13-
use std::ffi::OsStr;
13+
use std::ffi::{OsStr, OsString};
1414

1515
/// An argument for formatting
1616
///
@@ -22,20 +22,20 @@ use std::ffi::OsStr;
2222
#[derive(Clone, Debug)]
2323
pub enum FormatArgument {
2424
Char(char),
25-
String(String),
25+
String(OsString),
2626
UnsignedInt(u64),
2727
SignedInt(i64),
2828
Float(f64),
2929
/// Special argument that gets coerced into the other variants
30-
Unparsed(String),
30+
Unparsed(OsString),
3131
}
3232

3333
pub trait ArgumentIter<'a>: Iterator<Item = &'a FormatArgument> {
3434
fn get_char(&mut self) -> u8;
3535
fn get_i64(&mut self) -> i64;
3636
fn get_u64(&mut self) -> u64;
3737
fn get_f64(&mut self) -> f64;
38-
fn get_str(&mut self) -> &'a str;
38+
fn get_str(&mut self) -> &'a OsStr;
3939
}
4040

4141
impl<'a, T: Iterator<Item = &'a FormatArgument>> ArgumentIter<'a> for T {
@@ -45,7 +45,10 @@ impl<'a, T: Iterator<Item = &'a FormatArgument>> ArgumentIter<'a> for T {
4545
};
4646
match next {
4747
FormatArgument::Char(c) => *c as u8,
48-
FormatArgument::Unparsed(s) => s.bytes().next().unwrap_or(b'\0'),
48+
FormatArgument::Unparsed(os) => match bytes_from_os_str(os).unwrap().first() {
49+
Some(&byte) => byte,
50+
None => b'\0',
51+
},
4952
_ => b'\0',
5053
}
5154
}
@@ -56,7 +59,11 @@ impl<'a, T: Iterator<Item = &'a FormatArgument>> ArgumentIter<'a> for T {
5659
};
5760
match next {
5861
FormatArgument::UnsignedInt(n) => *n,
59-
FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_u64(s), s),
62+
FormatArgument::Unparsed(os) => {
63+
let str = get_str_or_exit_with_error(os);
64+
65+
extract_value(ParsedNumber::parse_u64(str), str)
66+
}
6067
_ => 0,
6168
}
6269
}
@@ -67,7 +74,11 @@ impl<'a, T: Iterator<Item = &'a FormatArgument>> ArgumentIter<'a> for T {
6774
};
6875
match next {
6976
FormatArgument::SignedInt(n) => *n,
70-
FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_i64(s), s),
77+
FormatArgument::Unparsed(os) => {
78+
let str = get_str_or_exit_with_error(os);
79+
80+
extract_value(ParsedNumber::parse_i64(str), str)
81+
}
7182
_ => 0,
7283
}
7384
}
@@ -78,15 +89,19 @@ impl<'a, T: Iterator<Item = &'a FormatArgument>> ArgumentIter<'a> for T {
7889
};
7990
match next {
8091
FormatArgument::Float(n) => *n,
81-
FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_f64(s), s),
92+
FormatArgument::Unparsed(os) => {
93+
let str = get_str_or_exit_with_error(os);
94+
95+
extract_value(ParsedNumber::parse_f64(str), str)
96+
}
8297
_ => 0.0,
8398
}
8499
}
85100

86-
fn get_str(&mut self) -> &'a str {
101+
fn get_str(&mut self) -> &'a OsStr {
87102
match self.next() {
88-
Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s,
89-
_ => "",
103+
Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os,
104+
_ => "".as_ref(),
90105
}
91106
}
92107
}
@@ -126,3 +141,50 @@ fn extract_value<T: Default>(p: Result<T, ParseError<'_, T>>, input: &str) -> T
126141
}
127142
}
128143
}
144+
145+
pub fn bytes_from_os_str(input: &OsStr) -> UResult<&[u8]> {
146+
let result = {
147+
#[cfg(target_family = "unix")]
148+
{
149+
use std::os::unix::ffi::OsStrExt;
150+
151+
Ok(input.as_bytes())
152+
}
153+
154+
#[cfg(not(target_family = "unix"))]
155+
{
156+
use crate::error::USimpleError;
157+
158+
// TODO
159+
// Verify that this works correctly on these platforms
160+
match input.to_str().map(|st| st.as_bytes()) {
161+
Some(sl) => Ok(sl),
162+
None => Err(USimpleError::new(
163+
1,
164+
"non-UTF-8 string encountered when not allowed",
165+
)),
166+
}
167+
}
168+
};
169+
170+
result
171+
}
172+
173+
fn get_str_or_exit_with_error(os_str: &OsStr) -> &str {
174+
match os_str.to_str() {
175+
Some(st) => st,
176+
None => {
177+
let cow = os_str.to_string_lossy();
178+
179+
let quoted = cow.quote();
180+
181+
let error = format!(
182+
"argument like {quoted} is not a valid UTF-8 string, and could not be parsed as an integer",
183+
);
184+
185+
show!(USimpleError::new(1, error.clone()));
186+
187+
panic!("{error}");
188+
}
189+
}
190+
}

src/uucore/src/lib/features/format/spec.rs

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,15 +5,15 @@
55

66
// spell-checker:ignore (vars) intmax ptrdiff padlen
77

8-
use crate::quoting_style::{escape_name, QuotingStyle};
9-
108
use super::{
9+
bytes_from_os_str,
1110
num_format::{
1211
self, Case, FloatVariant, ForceDecimal, Formatter, NumberAlignment, PositiveSign, Prefix,
1312
UnsignedIntVariant,
1413
},
1514
parse_escape_only, ArgumentIter, FormatChar, FormatError,
1615
};
16+
use crate::quoting_style::{escape_name, QuotingStyle};
1717
use std::{io::Write, ops::ControlFlow};
1818

1919
/// A parsed specification for formatting a value
@@ -331,17 +331,26 @@ impl Spec {
331331
// TODO: We need to not use Rust's formatting for aligning the output,
332332
// so that we can just write bytes to stdout without panicking.
333333
let precision = resolve_asterisk(*precision, &mut args)?;
334-
let s = args.get_str();
334+
335+
let os_str = args.get_str();
336+
337+
let bytes = bytes_from_os_str(os_str).unwrap();
338+
335339
let truncated = match precision {
336-
Some(p) if p < s.len() => &s[..p],
337-
_ => s,
340+
Some(p) if p < os_str.len() => &bytes[..p],
341+
_ => bytes,
338342
};
339-
write_padded(writer, truncated.as_bytes(), width, *align_left)
343+
344+
write_padded(writer, truncated, width, *align_left)
340345
}
341346
Self::EscapedString => {
342-
let s = args.get_str();
343-
let mut parsed = Vec::new();
344-
for c in parse_escape_only(s.as_bytes()) {
347+
let os_str = args.get_str();
348+
349+
let bytes = bytes_from_os_str(os_str).unwrap();
350+
351+
let mut parsed = Vec::<u8>::new();
352+
353+
for c in parse_escape_only(bytes) {
345354
match c.write(&mut parsed)? {
346355
ControlFlow::Continue(()) => {}
347356
ControlFlow::Break(()) => {
@@ -353,11 +362,12 @@ impl Spec {
353362
writer.write_all(&parsed).map_err(FormatError::IoError)
354363
}
355364
Self::QuotedString => {
356-
let s = args.get_str();
365+
let os = args.get_str();
366+
357367
writer
358368
.write_all(
359369
escape_name(
360-
s.as_ref(),
370+
os,
361371
&QuotingStyle::Shell {
362372
escape: true,
363373
always_quote: false,

tests/by-util/test_printf.rs

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -916,3 +916,29 @@ fn float_flag_position_space_padding() {
916916
.succeeds()
917917
.stdout_only(" +1.0");
918918
}
919+
920+
#[test]
921+
#[cfg(target_family = "unix")]
922+
fn non_utf_8_input() {
923+
use std::ffi::OsStr;
924+
use std::os::unix::ffi::OsStrExt;
925+
926+
// ISO-8859-1 encoded text
927+
// spell-checker:disable
928+
const INPUT_AND_OUTPUT: &[u8] =
929+
b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre.";
930+
// spell-checker:enable
931+
932+
let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT);
933+
934+
new_ucmd!()
935+
.arg("%s")
936+
.arg(os_str)
937+
.succeeds()
938+
.stdout_only_bytes(INPUT_AND_OUTPUT);
939+
940+
new_ucmd!()
941+
.arg(os_str)
942+
.succeeds()
943+
.stdout_only_bytes(INPUT_AND_OUTPUT);
944+
}

0 commit comments

Comments
 (0)