Skip to content

Commit c2b55d7

Browse files
committed
split: pass GNU test l-chunk
1 parent a0ac3dd commit c2b55d7

File tree

3 files changed

+259
-171
lines changed

3 files changed

+259
-171
lines changed

src/uu/split/src/split.rs

Lines changed: 124 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,14 +1130,68 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
11301130
}
11311131
}
11321132

1133+
/// Output file parameters
1134+
struct OutFile {
1135+
filename: String,
1136+
maybe_writer: Option<BufWriter<Box<dyn Write>>>,
1137+
}
1138+
1139+
impl OutFile {
1140+
/// Get the writer for the output file
1141+
/// Instantiate the writer if it has not been instantiated upfront
1142+
fn get_writer(&mut self, settings: &Settings) -> UResult<&mut BufWriter<Box<dyn Write>>> {
1143+
if self.maybe_writer.is_some() {
1144+
Ok(self.maybe_writer.as_mut().unwrap())
1145+
} else {
1146+
// Writer was not instantiated upfront
1147+
// Instantiate it and record for future use
1148+
self.maybe_writer = Some(settings.instantiate_current_writer(self.filename.as_str())?);
1149+
Ok(self.maybe_writer.as_mut().unwrap())
1150+
}
1151+
}
1152+
}
1153+
1154+
/// Generate a set of Output Files
1155+
/// This is a helper function to [`n_chunks_by_byte`], [`n_chunks_by_line`]
1156+
/// and [`n_chunks_by_line_round_robin`].
1157+
/// Each OutFile is generated with filename, while the writer for it could be
1158+
/// optional, to be instantiated later by the calling function as needed.
1159+
/// Optional writers could happen in [`n_chunks_by_line`]
1160+
/// if `elide_empty_files` parameter is set to `true`.
1161+
fn get_out_files(
1162+
num_files: u64,
1163+
settings: &Settings,
1164+
is_writer_optional: bool,
1165+
) -> UResult<Vec<OutFile>> {
1166+
// This object is responsible for creating the filename for each chunk
1167+
let mut filename_iterator: FilenameIterator<'_> =
1168+
FilenameIterator::new(&settings.prefix, &settings.suffix)
1169+
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
1170+
let mut out_files: Vec<OutFile> = Vec::new();
1171+
for _ in 0..num_files {
1172+
let filename = filename_iterator
1173+
.next()
1174+
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
1175+
let maybe_writer = if is_writer_optional {
1176+
None
1177+
} else {
1178+
Some(settings.instantiate_current_writer(filename.as_str())?)
1179+
};
1180+
out_files.push(OutFile {
1181+
filename,
1182+
maybe_writer,
1183+
});
1184+
}
1185+
Ok(out_files)
1186+
}
1187+
11331188
/// Split a file or STDIN into a specific number of chunks by byte.
1134-
/// If in Kth chunk of N mode - print the k-th chunk to STDOUT.
11351189
///
11361190
/// When file size cannot be evenly divided into the number of chunks of the same size,
11371191
/// the first X chunks are 1 byte longer than the rest,
11381192
/// where X is a modulus reminder of (file size % number of chunks)
11391193
///
1140-
/// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
1194+
/// In Kth chunk of N mode - writes to STDOUT the contents of the chunk identified by `kth_chunk`
11411195
///
11421196
/// In N chunks mode - this function always creates one output file for each chunk, even
11431197
/// if there is an error reading or writing one of the chunks or if
@@ -1207,7 +1261,7 @@ where
12071261
// In Kth chunk of N mode - we will write to stdout instead of to a file.
12081262
let mut stdout_writer = std::io::stdout().lock();
12091263
// In N chunks mode - we will write to `num_chunks` files
1210-
let mut writers = vec![];
1264+
let mut out_files: Vec<OutFile> = Vec::new();
12111265

12121266
// Calculate chunk size base and modulo reminder
12131267
// to be used in calculating chunk_size later on
@@ -1219,16 +1273,7 @@ where
12191273
// This will create each of the underlying files
12201274
// or stdin pipes to child shell/command processes if in `--filter` mode
12211275
if kth_chunk.is_none() {
1222-
// This object is responsible for creating the filename for each chunk.
1223-
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
1224-
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
1225-
for _ in 0..num_chunks {
1226-
let filename = filename_iterator
1227-
.next()
1228-
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
1229-
let writer = settings.instantiate_current_writer(filename.as_str())?;
1230-
writers.push(writer);
1231-
}
1276+
out_files = get_out_files(num_chunks, settings, false)?;
12321277
}
12331278

12341279
for i in 1_u64..=num_chunks {
@@ -1272,7 +1317,7 @@ where
12721317
}
12731318
None => {
12741319
let idx = (i - 1) as usize;
1275-
let writer = writers.get_mut(idx).unwrap();
1320+
let writer = out_files[idx].get_writer(settings)?;
12761321
writer.write_all(buf)?;
12771322
}
12781323
}
@@ -1284,9 +1329,14 @@ where
12841329
}
12851330

12861331
/// Split a file or STDIN into a specific number of chunks by line.
1287-
/// If in Kth chunk of N mode - print the k-th chunk to STDOUT.
12881332
///
1289-
/// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
1333+
/// It is most likely that input cannot be evenly divided into the number of chunks
1334+
/// of the same size in bytes or number of lines, since we cannot break lines.
1335+
/// It is also likely that there could be empty files (having `elide_empty_files` is disabled)
1336+
/// when a long line overlaps one or more chunks.
1337+
///
1338+
/// In Kth chunk of N mode - writes to STDOUT the contents of the chunk identified by `kth_chunk`
1339+
/// Note: the `elide_empty_files` flag is ignored in this mode
12901340
///
12911341
/// In N chunks mode - this function always creates one output file for each chunk, even
12921342
/// if there is an error reading or writing one of the chunks or if
@@ -1322,76 +1372,97 @@ where
13221372
let initial_buf = &mut Vec::new();
13231373
let num_bytes = get_input_size(&settings.input, reader, initial_buf, &settings.io_blksize)?;
13241374
let reader = initial_buf.chain(reader);
1325-
let chunk_size = (num_bytes / num_chunks) as usize;
13261375

13271376
// If input file is empty and we would not have determined the Kth chunk
13281377
// in the Kth chunk of N chunk mode, then terminate immediately.
13291378
// This happens on `split -n l/3/10 /dev/null`, for example.
1330-
if kth_chunk.is_some() && num_bytes == 0 {
1379+
// Similarly, if input file is empty and `elide_empty_files` parameter is enabled,
1380+
// then we would have written zero chunks of output,
1381+
// so terminate immediately as well.
1382+
// This happens on `split -e -n l/3 /dev/null`, for example.
1383+
if num_bytes == 0 && (kth_chunk.is_some() || settings.elide_empty_files) {
13311384
return Ok(());
13321385
}
13331386

13341387
// In Kth chunk of N mode - we will write to stdout instead of to a file.
13351388
let mut stdout_writer = std::io::stdout().lock();
13361389
// In N chunks mode - we will write to `num_chunks` files
1337-
let mut writers = vec![];
1390+
let mut out_files: Vec<OutFile> = Vec::new();
1391+
1392+
// Calculate chunk size base and modulo reminder
1393+
// to be used in calculating `num_bytes_should_be_written` later on
1394+
let chunk_size_base = num_bytes / num_chunks;
1395+
let chunk_size_reminder = num_bytes % num_chunks;
13381396

13391397
// If in N chunks mode
1340-
// Create one writer for each chunk.
1341-
// This will create each of the underlying files
1342-
// or stdin pipes to child shell/command processes if in `--filter` mode
1398+
// Generate filenames for each file and
1399+
// if `elide_empty_files` parameter is NOT enabled - instantiate the writer
1400+
// which will create each of the underlying files or stdin pipes
1401+
// to child shell/command processes if in `--filter` mode.
1402+
// Otherwise keep writer optional, to be instantiated later if there is data
1403+
// to write for the associated chunk.
13431404
if kth_chunk.is_none() {
1344-
// This object is responsible for creating the filename for each chunk.
1345-
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
1346-
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
1347-
for _ in 0..num_chunks {
1348-
let filename = filename_iterator
1349-
.next()
1350-
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
1351-
let writer = settings.instantiate_current_writer(filename.as_str())?;
1352-
writers.push(writer);
1353-
}
1405+
out_files = get_out_files(num_chunks, settings, settings.elide_empty_files)?;
13541406
}
13551407

1356-
let mut num_bytes_remaining_in_current_chunk = chunk_size;
1357-
let mut i = 1;
1408+
let mut chunk_number = 1;
13581409
let sep = settings.separator;
1410+
let mut num_bytes_should_be_written = chunk_size_base + (chunk_size_reminder > 0) as u64;
1411+
let mut num_bytes_written = 0;
13591412

13601413
for line_result in reader.split(sep) {
1361-
// add separator back in at the end of the line
13621414
let mut line = line_result?;
1363-
line.push(sep);
1415+
// add separator back in at the end of the line,
1416+
// since `reader.split(sep)` removes it,
1417+
// except if the last line did not end with separator character
1418+
if (num_bytes_written + line.len() as u64) < num_bytes {
1419+
line.push(sep);
1420+
}
13641421
let bytes = line.as_slice();
13651422

13661423
match kth_chunk {
1367-
Some(chunk_number) => {
1368-
if i == chunk_number {
1424+
Some(kth) => {
1425+
if chunk_number == kth {
13691426
stdout_writer.write_all(bytes)?;
13701427
}
13711428
}
13721429
None => {
1373-
let idx = (i - 1) as usize;
1374-
let maybe_writer = writers.get_mut(idx);
1375-
let writer = maybe_writer.unwrap();
1430+
// Should write into a file
1431+
let idx = (chunk_number - 1) as usize;
1432+
let writer = out_files[idx].get_writer(settings)?;
13761433
custom_write_all(bytes, writer, settings)?;
13771434
}
13781435
}
13791436

1380-
let num_bytes = bytes.len();
1381-
if num_bytes >= num_bytes_remaining_in_current_chunk {
1382-
num_bytes_remaining_in_current_chunk = chunk_size;
1383-
i += 1;
1384-
} else {
1385-
num_bytes_remaining_in_current_chunk -= num_bytes;
1437+
// Advance to the next chunk if the current one is filled.
1438+
// There could be a situation when a long line, which started in current chunk,
1439+
// would overlap the next chunk (or even several next chunks),
1440+
// and since we cannot break lines for this split strategy, we could end up with
1441+
// empty files in place(s) of skipped chunk(s)
1442+
let num_line_bytes = bytes.len() as u64;
1443+
num_bytes_written += num_line_bytes;
1444+
let mut skipped = -1;
1445+
while num_bytes_should_be_written <= num_bytes_written {
1446+
num_bytes_should_be_written +=
1447+
chunk_size_base + (chunk_size_reminder > chunk_number) as u64;
1448+
chunk_number += 1;
1449+
skipped += 1;
13861450
}
13871451

1388-
if let Some(chunk_number) = kth_chunk {
1389-
if i > chunk_number {
1452+
// If a chunk was skipped and `elide_empty_files` flag is set,
1453+
// roll chunk_number back to preserve sequential continuity
1454+
// of file names for files written to,
1455+
// except for Kth chunk of N mode
1456+
if settings.elide_empty_files && skipped > 0 && kth_chunk.is_none() {
1457+
chunk_number -= skipped as u64;
1458+
}
1459+
1460+
if let Some(kth) = kth_chunk {
1461+
if chunk_number > kth {
13901462
break;
13911463
}
13921464
}
13931465
}
1394-
13951466
Ok(())
13961467
}
13971468

@@ -1432,23 +1503,14 @@ where
14321503
// In Kth chunk of N mode - we will write to stdout instead of to a file.
14331504
let mut stdout_writer = std::io::stdout().lock();
14341505
// In N chunks mode - we will write to `num_chunks` files
1435-
let mut writers = vec![];
1506+
let mut out_files: Vec<OutFile> = Vec::new();
14361507

14371508
// If in N chunks mode
14381509
// Create one writer for each chunk.
14391510
// This will create each of the underlying files
14401511
// or stdin pipes to child shell/command processes if in `--filter` mode
14411512
if kth_chunk.is_none() {
1442-
// This object is responsible for creating the filename for each chunk.
1443-
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
1444-
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
1445-
for _ in 0..num_chunks {
1446-
let filename = filename_iterator
1447-
.next()
1448-
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
1449-
let writer = settings.instantiate_current_writer(filename.as_str())?;
1450-
writers.push(writer);
1451-
}
1513+
out_files = get_out_files(num_chunks, settings, false)?;
14521514
}
14531515

14541516
let num_chunks: usize = num_chunks.try_into().unwrap();
@@ -1470,9 +1532,7 @@ where
14701532
}
14711533
}
14721534
None => {
1473-
let maybe_writer = writers.get_mut(i % num_chunks);
1474-
let writer = maybe_writer.unwrap();
1475-
1535+
let writer = out_files[i % num_chunks].get_writer(settings)?;
14761536
let writer_stdin_open = custom_write_all(bytes, writer, settings)?;
14771537
if !writer_stdin_open {
14781538
closed_writers += 1;

src/uu/split/src/strategy.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
use crate::{OPT_BYTES, OPT_LINES, OPT_LINE_BYTES, OPT_NUMBER};
99
use clap::{parser::ValueSource, ArgMatches};
1010
use std::fmt;
11-
use uucore::parse_size::{parse_size_u64, parse_size_u64_max, ParseSizeError};
11+
use uucore::{
12+
display::Quotable,
13+
parse_size::{parse_size_u64, parse_size_u64_max, ParseSizeError},
14+
};
1215

1316
/// Sub-strategy of the [`Strategy::Number`]
1417
/// Splitting a file into a specific number of chunks.
@@ -208,10 +211,10 @@ impl fmt::Display for StrategyError {
208211
Self::Lines(e) => write!(f, "invalid number of lines: {e}"),
209212
Self::Bytes(e) => write!(f, "invalid number of bytes: {e}"),
210213
Self::NumberType(NumberTypeError::NumberOfChunks(s)) => {
211-
write!(f, "invalid number of chunks: {s}")
214+
write!(f, "invalid number of chunks: {}", s.quote())
212215
}
213216
Self::NumberType(NumberTypeError::ChunkNumber(s)) => {
214-
write!(f, "invalid chunk number: {s}")
217+
write!(f, "invalid chunk number: {}", s.quote())
215218
}
216219
Self::MultipleWays => write!(f, "cannot split in more than one way"),
217220
}

0 commit comments

Comments
 (0)