Skip to content

Commit bb5c991

Browse files
authored
Merge pull request #7128 from jfinkels/split-line-bytes-large-args
split: fix bug with large arguments to -C
2 parents cc4cf70 + 071e72f commit bb5c991

File tree

2 files changed

+126
-216
lines changed

2 files changed

+126
-216
lines changed

src/uu/split/src/split.rs

Lines changed: 109 additions & 216 deletions
Original file line numberDiff line numberDiff line change
@@ -919,204 +919,6 @@ impl Write for LineChunkWriter<'_> {
919919
}
920920
}
921921

922-
/// Write lines to each sequential output files, limited by bytes.
923-
///
924-
/// This struct maintains an underlying writer representing the
925-
/// current chunk of the output. On each call to [`write`], it writes
926-
/// as many lines as possible to the current chunk without exceeding
927-
/// the specified byte limit. If a single line has more bytes than the
928-
/// limit, then fill an entire single chunk with those bytes and
929-
/// handle the remainder of the line as if it were its own distinct
930-
/// line. As many new underlying writers are created as needed to
931-
/// write all the data in the input buffer.
932-
struct LineBytesChunkWriter<'a> {
933-
/// Parameters for creating the underlying writer for each new chunk.
934-
settings: &'a Settings,
935-
936-
/// The maximum number of bytes allowed for a single chunk of output.
937-
chunk_size: u64,
938-
939-
/// Running total of number of chunks that have been completed.
940-
num_chunks_written: usize,
941-
942-
/// Remaining capacity in number of bytes in the current chunk.
943-
///
944-
/// This number starts at `chunk_size` and decreases as lines are
945-
/// written. Once it reaches zero, a writer for a new chunk is
946-
/// initialized and this number gets reset to `chunk_size`.
947-
num_bytes_remaining_in_current_chunk: usize,
948-
949-
/// The underlying writer for the current chunk.
950-
///
951-
/// Once the number of bytes written to this writer exceeds
952-
/// `chunk_size`, a new writer is initialized and assigned to this
953-
/// field.
954-
inner: BufWriter<Box<dyn Write>>,
955-
956-
/// Iterator that yields filenames for each chunk.
957-
filename_iterator: FilenameIterator<'a>,
958-
}
959-
960-
impl<'a> LineBytesChunkWriter<'a> {
961-
fn new(chunk_size: u64, settings: &'a Settings) -> UResult<Self> {
962-
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;
963-
let filename = filename_iterator
964-
.next()
965-
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
966-
if settings.verbose {
967-
println!("creating file {}", filename.quote());
968-
}
969-
let inner = settings.instantiate_current_writer(&filename, true)?;
970-
Ok(LineBytesChunkWriter {
971-
settings,
972-
chunk_size,
973-
num_bytes_remaining_in_current_chunk: usize::try_from(chunk_size).unwrap(),
974-
num_chunks_written: 0,
975-
inner,
976-
filename_iterator,
977-
})
978-
}
979-
}
980-
981-
impl Write for LineBytesChunkWriter<'_> {
982-
/// Write as many lines to a chunk as possible without
983-
/// exceeding the byte limit. If a single line has more bytes
984-
/// than the limit, then fill an entire single chunk with those
985-
/// bytes and handle the remainder of the line as if it were
986-
/// its own distinct line.
987-
///
988-
/// For example: if the `chunk_size` is 8 and the input is:
989-
///
990-
/// ```text
991-
/// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
992-
/// ```
993-
///
994-
/// then the output gets broken into chunks like this:
995-
///
996-
/// ```text
997-
/// chunk 0 chunk 1 chunk 2 chunk 3
998-
///
999-
/// 0 1 2
1000-
/// 01234567 89 01234 56789 012 345 6
1001-
/// |------| |-------| |--------| |---|
1002-
/// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
1003-
/// ```
1004-
///
1005-
/// Implements `--line-bytes=SIZE`
1006-
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
1007-
// The total number of bytes written during the loop below.
1008-
//
1009-
// It is necessary to keep this running total because we may
1010-
// be making multiple calls to `write()` on multiple different
1011-
// underlying writers and we want the final reported number of
1012-
// bytes written to reflect the total number of bytes written
1013-
// to all of the underlying writers.
1014-
let mut total_bytes_written = 0;
1015-
1016-
// Loop until we have written all bytes in the input buffer
1017-
// (or an IO error occurs).
1018-
loop {
1019-
// If the buffer is empty, then we are done writing.
1020-
if buf.is_empty() {
1021-
return Ok(total_bytes_written);
1022-
}
1023-
1024-
// If we have filled the current chunk with bytes, then
1025-
// start a new chunk and initialize its corresponding
1026-
// writer.
1027-
if self.num_bytes_remaining_in_current_chunk == 0 {
1028-
self.num_chunks_written += 1;
1029-
let filename = self.filename_iterator.next().ok_or_else(|| {
1030-
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
1031-
})?;
1032-
if self.settings.verbose {
1033-
println!("creating file {}", filename.quote());
1034-
}
1035-
self.inner = self.settings.instantiate_current_writer(&filename, true)?;
1036-
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
1037-
}
1038-
1039-
// Find the first separator (default - newline character) in the buffer.
1040-
let sep = self.settings.separator;
1041-
match memchr::memchr(sep, buf) {
1042-
// If there is no separator character and the buffer is
1043-
// not empty, then write as many bytes as we can and
1044-
// then move on to the next chunk if necessary.
1045-
None => {
1046-
let end = self.num_bytes_remaining_in_current_chunk;
1047-
1048-
// This is ugly but here to match GNU behavior. If the input
1049-
// doesn't end with a separator, pretend that it does for handling
1050-
// the second to last segment chunk. See `line-bytes.sh`.
1051-
if end == buf.len()
1052-
&& self.num_bytes_remaining_in_current_chunk
1053-
< self.chunk_size.try_into().unwrap()
1054-
&& buf[buf.len() - 1] != sep
1055-
{
1056-
self.num_bytes_remaining_in_current_chunk = 0;
1057-
} else {
1058-
let num_bytes_written = custom_write(
1059-
&buf[..end.min(buf.len())],
1060-
&mut self.inner,
1061-
self.settings,
1062-
)?;
1063-
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
1064-
total_bytes_written += num_bytes_written;
1065-
buf = &buf[num_bytes_written..];
1066-
}
1067-
}
1068-
1069-
// If there is a separator character and the line
1070-
// (including the separator character) will fit in the
1071-
// current chunk, then write the entire line and
1072-
// continue to the next iteration. (See chunk 1 in the
1073-
// example comment above.)
1074-
Some(i) if i < self.num_bytes_remaining_in_current_chunk => {
1075-
let num_bytes_written =
1076-
custom_write(&buf[..=i], &mut self.inner, self.settings)?;
1077-
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
1078-
total_bytes_written += num_bytes_written;
1079-
buf = &buf[num_bytes_written..];
1080-
}
1081-
1082-
// If there is a separator character, the line
1083-
// (including the separator character) will not fit in
1084-
// the current chunk, *and* no other lines have been
1085-
// written to the current chunk, then write as many
1086-
// bytes as we can and continue to the next
1087-
// iteration. (See chunk 0 in the example comment
1088-
// above.)
1089-
Some(_)
1090-
if self.num_bytes_remaining_in_current_chunk
1091-
== self.chunk_size.try_into().unwrap() =>
1092-
{
1093-
let end = self.num_bytes_remaining_in_current_chunk;
1094-
let num_bytes_written =
1095-
custom_write(&buf[..end], &mut self.inner, self.settings)?;
1096-
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
1097-
total_bytes_written += num_bytes_written;
1098-
buf = &buf[num_bytes_written..];
1099-
}
1100-
1101-
// If there is a separator character, the line
1102-
// (including the separator character) will not fit in
1103-
// the current chunk, and at least one other line has
1104-
// been written to the current chunk, then signal to
1105-
// the next iteration that a new chunk needs to be
1106-
// created and continue to the next iteration of the
1107-
// loop to try writing the line there.
1108-
Some(_) => {
1109-
self.num_bytes_remaining_in_current_chunk = 0;
1110-
}
1111-
}
1112-
}
1113-
}
1114-
1115-
fn flush(&mut self) -> std::io::Result<()> {
1116-
self.inner.flush()
1117-
}
1118-
}
1119-
1120922
/// Output file parameters
1121923
struct OutFile {
1122924
filename: String,
@@ -1629,6 +1431,114 @@ where
16291431
Ok(())
16301432
}
16311433

1434+
/// Like `std::io::Lines`, but includes the line ending character.
1435+
///
1436+
/// This struct is generally created by calling `lines_with_sep` on a
1437+
/// reader.
1438+
pub struct LinesWithSep<R> {
1439+
inner: R,
1440+
separator: u8,
1441+
}
1442+
1443+
impl<R> Iterator for LinesWithSep<R>
1444+
where
1445+
R: BufRead,
1446+
{
1447+
type Item = std::io::Result<Vec<u8>>;
1448+
1449+
/// Read bytes from a buffer up to the requested number of lines.
1450+
fn next(&mut self) -> Option<Self::Item> {
1451+
let mut buf = vec![];
1452+
match self.inner.read_until(self.separator, &mut buf) {
1453+
Ok(0) => None,
1454+
Ok(_) => Some(Ok(buf)),
1455+
Err(e) => Some(Err(e)),
1456+
}
1457+
}
1458+
}
1459+
1460+
/// Like `std::str::lines` but includes the line ending character.
1461+
///
1462+
/// The `separator` defines the character to interpret as the line
1463+
/// ending. For the usual notion of "line", set this to `b'\n'`.
1464+
pub fn lines_with_sep<R>(reader: R, separator: u8) -> LinesWithSep<R>
1465+
where
1466+
R: BufRead,
1467+
{
1468+
LinesWithSep {
1469+
inner: reader,
1470+
separator,
1471+
}
1472+
}
1473+
1474+
fn line_bytes<R>(settings: &Settings, reader: &mut R, chunk_size: usize) -> UResult<()>
1475+
where
1476+
R: BufRead,
1477+
{
1478+
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;
1479+
1480+
// Initialize the writer just to satisfy the compiler. It is going
1481+
// to be overwritten for sure at the beginning of the loop below
1482+
// because we start with `remaining == 0`, indicating that a new
1483+
// chunk should start.
1484+
let mut writer: BufWriter<Box<dyn Write>> =
1485+
BufWriter::new(Box::new(std::io::Cursor::new(vec![])));
1486+
1487+
let mut remaining = 0;
1488+
for line in lines_with_sep(reader, settings.separator) {
1489+
let line = line?;
1490+
let mut line = &line[..];
1491+
loop {
1492+
if remaining == 0 {
1493+
let filename = filename_iterator
1494+
.next()
1495+
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
1496+
if settings.verbose {
1497+
println!("creating file {}", filename.quote());
1498+
}
1499+
writer = settings.instantiate_current_writer(&filename, true)?;
1500+
remaining = chunk_size;
1501+
}
1502+
1503+
// Special case: if this is the last line and it doesn't end
1504+
// with a newline character, then count its length as though
1505+
// it did end with a newline. If that puts it over the edge
1506+
// of this chunk, continue to the next chunk.
1507+
if line.len() == remaining
1508+
&& remaining < chunk_size
1509+
&& line[line.len() - 1] != settings.separator
1510+
{
1511+
remaining = 0;
1512+
continue;
1513+
}
1514+
1515+
// If the entire line fits in this chunk, write it and
1516+
// continue to the next line.
1517+
if line.len() <= remaining {
1518+
custom_write_all(line, &mut writer, settings)?;
1519+
remaining -= line.len();
1520+
break;
1521+
}
1522+
1523+
// If the line is too large to fit in *any* chunk and we are
1524+
// at the start of a new chunk, write as much as we can of
1525+
// it and pass the remainder along to the next chunk.
1526+
if line.len() > chunk_size && remaining == chunk_size {
1527+
custom_write_all(&line[..chunk_size], &mut writer, settings)?;
1528+
line = &line[chunk_size..];
1529+
remaining = 0;
1530+
continue;
1531+
}
1532+
1533+
// If the line is too large to fit in *this* chunk, but
1534+
// might otherwise fit in the next chunk, then just continue
1535+
// to the next chunk and let it be handled there.
1536+
remaining = 0;
1537+
}
1538+
}
1539+
Ok(())
1540+
}
1541+
16321542
#[allow(clippy::cognitive_complexity)]
16331543
fn split(settings: &Settings) -> UResult<()> {
16341544
let r_box = if settings.input == "-" {
@@ -1701,23 +1611,6 @@ fn split(settings: &Settings) -> UResult<()> {
17011611
},
17021612
}
17031613
}
1704-
Strategy::LineBytes(chunk_size) => {
1705-
let mut writer = LineBytesChunkWriter::new(chunk_size, settings)?;
1706-
match std::io::copy(&mut reader, &mut writer) {
1707-
Ok(_) => Ok(()),
1708-
Err(e) => match e.kind() {
1709-
// TODO Since the writer object controls the creation of
1710-
// new files, we need to rely on the `std::io::Result`
1711-
// returned by its `write()` method to communicate any
1712-
// errors to this calling scope. If a new file cannot be
1713-
// created because we have exceeded the number of
1714-
// allowable filenames, we use `ErrorKind::Other` to
1715-
// indicate that. A special error message needs to be
1716-
// printed in that case.
1717-
ErrorKind::Other => Err(USimpleError::new(1, format!("{e}"))),
1718-
_ => Err(uio_error!(e, "input/output error")),
1719-
},
1720-
}
1721-
}
1614+
Strategy::LineBytes(chunk_size) => line_bytes(settings, &mut reader, chunk_size as usize),
17221615
}
17231616
}

tests/by-util/test_split.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,3 +1973,20 @@ fn test_split_separator_same_multiple() {
19731973
.args(&["-t:", "-t:", "-t,", "fivelines.txt"])
19741974
.fails();
19751975
}
1976+
1977+
#[test]
1978+
fn test_long_lines() {
1979+
let (at, mut ucmd) = at_and_ucmd!();
1980+
let line1 = format!("{:131070}\n", "");
1981+
let line2 = format!("{:1}\n", "");
1982+
let line3 = format!("{:131071}\n", "");
1983+
let infile = [line1, line2, line3].concat();
1984+
ucmd.args(&["-C", "131072"])
1985+
.pipe_in(infile)
1986+
.succeeds()
1987+
.no_output();
1988+
assert_eq!(at.read("xaa").len(), 131_071);
1989+
assert_eq!(at.read("xab").len(), 2);
1990+
assert_eq!(at.read("xac").len(), 131_072);
1991+
assert!(!at.plus("xad").exists());
1992+
}

0 commit comments

Comments
 (0)