@@ -1130,14 +1130,68 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
1130
1130
}
1131
1131
}
1132
1132
1133
+ /// Output file parameters
1134
+ struct OutFile {
1135
+ filename : String ,
1136
+ maybe_writer : Option < BufWriter < Box < dyn Write > > > ,
1137
+ }
1138
+
1139
+ impl OutFile {
1140
+ /// Get the writer for the output file
1141
+ /// Instantiate the writer if it has not been instantiated upfront
1142
+ fn get_writer ( & mut self , settings : & Settings ) -> UResult < & mut BufWriter < Box < dyn Write > > > {
1143
+ if self . maybe_writer . is_some ( ) {
1144
+ Ok ( self . maybe_writer . as_mut ( ) . unwrap ( ) )
1145
+ } else {
1146
+ // Writer was not instantiated upfront
1147
+ // Instantiate it and record for future use
1148
+ self . maybe_writer = Some ( settings. instantiate_current_writer ( self . filename . as_str ( ) ) ?) ;
1149
+ Ok ( self . maybe_writer . as_mut ( ) . unwrap ( ) )
1150
+ }
1151
+ }
1152
+ }
1153
+
1154
+ /// Generate a set of Output Files
1155
+ /// This is a helper function to [`n_chunks_by_byte`], [`n_chunks_by_line`]
1156
+ /// and [`n_chunks_by_line_round_robin`].
1157
+ /// Each OutFile is generated with filename, while the writer for it could be
1158
+ /// optional, to be instantiated later by the calling function as needed.
1159
+ /// Optional writers could happen in [`n_chunks_by_line`]
1160
+ /// if `elide_empty_files` parameter is set to `true`.
1161
+ fn get_out_files (
1162
+ num_files : u64 ,
1163
+ settings : & Settings ,
1164
+ is_writer_optional : bool ,
1165
+ ) -> UResult < Vec < OutFile > > {
1166
+ // This object is responsible for creating the filename for each chunk
1167
+ let mut filename_iterator: FilenameIterator < ' _ > =
1168
+ FilenameIterator :: new ( & settings. prefix , & settings. suffix )
1169
+ . map_err ( |e| io:: Error :: new ( ErrorKind :: Other , format ! ( "{e}" ) ) ) ?;
1170
+ let mut out_files: Vec < OutFile > = Vec :: new ( ) ;
1171
+ for _ in 0 ..num_files {
1172
+ let filename = filename_iterator
1173
+ . next ( )
1174
+ . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
1175
+ let maybe_writer = if is_writer_optional {
1176
+ None
1177
+ } else {
1178
+ Some ( settings. instantiate_current_writer ( filename. as_str ( ) ) ?)
1179
+ } ;
1180
+ out_files. push ( OutFile {
1181
+ filename,
1182
+ maybe_writer,
1183
+ } ) ;
1184
+ }
1185
+ Ok ( out_files)
1186
+ }
1187
+
1133
1188
/// Split a file or STDIN into a specific number of chunks by byte.
1134
- /// If in Kth chunk of N mode - print the k-th chunk to STDOUT.
1135
1189
///
1136
1190
/// When file size cannot be evenly divided into the number of chunks of the same size,
1137
1191
/// the first X chunks are 1 byte longer than the rest,
1138
1192
/// where X is a modulus reminder of (file size % number of chunks)
1139
1193
///
1140
- /// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
1194
+ /// In Kth chunk of N mode - writes to STDOUT the contents of the chunk identified by `kth_chunk`
1141
1195
///
1142
1196
/// In N chunks mode - this function always creates one output file for each chunk, even
1143
1197
/// if there is an error reading or writing one of the chunks or if
@@ -1207,7 +1261,7 @@ where
1207
1261
// In Kth chunk of N mode - we will write to stdout instead of to a file.
1208
1262
let mut stdout_writer = std:: io:: stdout ( ) . lock ( ) ;
1209
1263
// In N chunks mode - we will write to `num_chunks` files
1210
- let mut writers = vec ! [ ] ;
1264
+ let mut out_files : Vec < OutFile > = Vec :: new ( ) ;
1211
1265
1212
1266
// Calculate chunk size base and modulo reminder
1213
1267
// to be used in calculating chunk_size later on
@@ -1219,16 +1273,7 @@ where
1219
1273
// This will create each of the underlying files
1220
1274
// or stdin pipes to child shell/command processes if in `--filter` mode
1221
1275
if kth_chunk. is_none ( ) {
1222
- // This object is responsible for creating the filename for each chunk.
1223
- let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix )
1224
- . map_err ( |e| io:: Error :: new ( ErrorKind :: Other , format ! ( "{e}" ) ) ) ?;
1225
- for _ in 0 ..num_chunks {
1226
- let filename = filename_iterator
1227
- . next ( )
1228
- . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
1229
- let writer = settings. instantiate_current_writer ( filename. as_str ( ) ) ?;
1230
- writers. push ( writer) ;
1231
- }
1276
+ out_files = get_out_files ( num_chunks, settings, false ) ?;
1232
1277
}
1233
1278
1234
1279
for i in 1_u64 ..=num_chunks {
@@ -1272,7 +1317,7 @@ where
1272
1317
}
1273
1318
None => {
1274
1319
let idx = ( i - 1 ) as usize ;
1275
- let writer = writers . get_mut ( idx) . unwrap ( ) ;
1320
+ let writer = out_files [ idx] . get_writer ( settings ) ? ;
1276
1321
writer. write_all ( buf) ?;
1277
1322
}
1278
1323
}
@@ -1284,9 +1329,14 @@ where
1284
1329
}
1285
1330
1286
1331
/// Split a file or STDIN into a specific number of chunks by line.
1287
- /// If in Kth chunk of N mode - print the k-th chunk to STDOUT.
1288
1332
///
1289
- /// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
1333
+ /// It is most likely that input cannot be evenly divided into the number of chunks
1334
+ /// of the same size in bytes or number of lines, since we cannot break lines.
1335
+ /// It is also likely that there could be empty files (having `elide_empty_files` is disabled)
1336
+ /// when a long line overlaps one or more chunks.
1337
+ ///
1338
+ /// In Kth chunk of N mode - writes to STDOUT the contents of the chunk identified by `kth_chunk`
1339
+ /// Note: the `elide_empty_files` flag is ignored in this mode
1290
1340
///
1291
1341
/// In N chunks mode - this function always creates one output file for each chunk, even
1292
1342
/// if there is an error reading or writing one of the chunks or if
@@ -1322,76 +1372,97 @@ where
1322
1372
let initial_buf = & mut Vec :: new ( ) ;
1323
1373
let num_bytes = get_input_size ( & settings. input , reader, initial_buf, & settings. io_blksize ) ?;
1324
1374
let reader = initial_buf. chain ( reader) ;
1325
- let chunk_size = ( num_bytes / num_chunks) as usize ;
1326
1375
1327
1376
// If input file is empty and we would not have determined the Kth chunk
1328
1377
// in the Kth chunk of N chunk mode, then terminate immediately.
1329
1378
// This happens on `split -n l/3/10 /dev/null`, for example.
1330
- if kth_chunk. is_some ( ) && num_bytes == 0 {
1379
+ // Similarly, if input file is empty and `elide_empty_files` parameter is enabled,
1380
+ // then we would have written zero chunks of output,
1381
+ // so terminate immediately as well.
1382
+ // This happens on `split -e -n l/3 /dev/null`, for example.
1383
+ if num_bytes == 0 && ( kth_chunk. is_some ( ) || settings. elide_empty_files ) {
1331
1384
return Ok ( ( ) ) ;
1332
1385
}
1333
1386
1334
1387
// In Kth chunk of N mode - we will write to stdout instead of to a file.
1335
1388
let mut stdout_writer = std:: io:: stdout ( ) . lock ( ) ;
1336
1389
// In N chunks mode - we will write to `num_chunks` files
1337
- let mut writers = vec ! [ ] ;
1390
+ let mut out_files: Vec < OutFile > = Vec :: new ( ) ;
1391
+
1392
+ // Calculate chunk size base and modulo reminder
1393
+ // to be used in calculating `num_bytes_should_be_written` later on
1394
+ let chunk_size_base = num_bytes / num_chunks;
1395
+ let chunk_size_reminder = num_bytes % num_chunks;
1338
1396
1339
1397
// If in N chunks mode
1340
- // Create one writer for each chunk.
1341
- // This will create each of the underlying files
1342
- // or stdin pipes to child shell/command processes if in `--filter` mode
1398
+ // Generate filenames for each file and
1399
+ // if `elide_empty_files` parameter is NOT enabled - instantiate the writer
1400
+ // which will create each of the underlying files or stdin pipes
1401
+ // to child shell/command processes if in `--filter` mode.
1402
+ // Otherwise keep writer optional, to be instantiated later if there is data
1403
+ // to write for the associated chunk.
1343
1404
if kth_chunk. is_none ( ) {
1344
- // This object is responsible for creating the filename for each chunk.
1345
- let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix )
1346
- . map_err ( |e| io:: Error :: new ( ErrorKind :: Other , format ! ( "{e}" ) ) ) ?;
1347
- for _ in 0 ..num_chunks {
1348
- let filename = filename_iterator
1349
- . next ( )
1350
- . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
1351
- let writer = settings. instantiate_current_writer ( filename. as_str ( ) ) ?;
1352
- writers. push ( writer) ;
1353
- }
1405
+ out_files = get_out_files ( num_chunks, settings, settings. elide_empty_files ) ?;
1354
1406
}
1355
1407
1356
- let mut num_bytes_remaining_in_current_chunk = chunk_size;
1357
- let mut i = 1 ;
1408
+ let mut chunk_number = 1 ;
1358
1409
let sep = settings. separator ;
1410
+ let mut num_bytes_should_be_written = chunk_size_base + ( chunk_size_reminder > 0 ) as u64 ;
1411
+ let mut num_bytes_written = 0 ;
1359
1412
1360
1413
for line_result in reader. split ( sep) {
1361
- // add separator back in at the end of the line
1362
1414
let mut line = line_result?;
1363
- line. push ( sep) ;
1415
+ // add separator back in at the end of the line,
1416
+ // since `reader.split(sep)` removes it,
1417
+ // except if the last line did not end with separator character
1418
+ if ( num_bytes_written + line. len ( ) as u64 ) < num_bytes {
1419
+ line. push ( sep) ;
1420
+ }
1364
1421
let bytes = line. as_slice ( ) ;
1365
1422
1366
1423
match kth_chunk {
1367
- Some ( chunk_number ) => {
1368
- if i == chunk_number {
1424
+ Some ( kth ) => {
1425
+ if chunk_number == kth {
1369
1426
stdout_writer. write_all ( bytes) ?;
1370
1427
}
1371
1428
}
1372
1429
None => {
1373
- let idx = ( i - 1 ) as usize ;
1374
- let maybe_writer = writers . get_mut ( idx ) ;
1375
- let writer = maybe_writer . unwrap ( ) ;
1430
+ // Should write into a file
1431
+ let idx = ( chunk_number - 1 ) as usize ;
1432
+ let writer = out_files [ idx ] . get_writer ( settings ) ? ;
1376
1433
custom_write_all ( bytes, writer, settings) ?;
1377
1434
}
1378
1435
}
1379
1436
1380
- let num_bytes = bytes. len ( ) ;
1381
- if num_bytes >= num_bytes_remaining_in_current_chunk {
1382
- num_bytes_remaining_in_current_chunk = chunk_size;
1383
- i += 1 ;
1384
- } else {
1385
- num_bytes_remaining_in_current_chunk -= num_bytes;
1437
+ // Advance to the next chunk if the current one is filled.
1438
+ // There could be a situation when a long line, which started in current chunk,
1439
+ // would overlap the next chunk (or even several next chunks),
1440
+ // and since we cannot break lines for this split strategy, we could end up with
1441
+ // empty files in place(s) of skipped chunk(s)
1442
+ let num_line_bytes = bytes. len ( ) as u64 ;
1443
+ num_bytes_written += num_line_bytes;
1444
+ let mut skipped = -1 ;
1445
+ while num_bytes_should_be_written <= num_bytes_written {
1446
+ num_bytes_should_be_written +=
1447
+ chunk_size_base + ( chunk_size_reminder > chunk_number) as u64 ;
1448
+ chunk_number += 1 ;
1449
+ skipped += 1 ;
1386
1450
}
1387
1451
1388
- if let Some ( chunk_number) = kth_chunk {
1389
- if i > chunk_number {
1452
+ // If a chunk was skipped and `elide_empty_files` flag is set,
1453
+ // roll chunk_number back to preserve sequential continuity
1454
+ // of file names for files written to,
1455
+ // except for Kth chunk of N mode
1456
+ if settings. elide_empty_files && skipped > 0 && kth_chunk. is_none ( ) {
1457
+ chunk_number -= skipped as u64 ;
1458
+ }
1459
+
1460
+ if let Some ( kth) = kth_chunk {
1461
+ if chunk_number > kth {
1390
1462
break ;
1391
1463
}
1392
1464
}
1393
1465
}
1394
-
1395
1466
Ok ( ( ) )
1396
1467
}
1397
1468
@@ -1432,23 +1503,14 @@ where
1432
1503
// In Kth chunk of N mode - we will write to stdout instead of to a file.
1433
1504
let mut stdout_writer = std:: io:: stdout ( ) . lock ( ) ;
1434
1505
// In N chunks mode - we will write to `num_chunks` files
1435
- let mut writers = vec ! [ ] ;
1506
+ let mut out_files : Vec < OutFile > = Vec :: new ( ) ;
1436
1507
1437
1508
// If in N chunks mode
1438
1509
// Create one writer for each chunk.
1439
1510
// This will create each of the underlying files
1440
1511
// or stdin pipes to child shell/command processes if in `--filter` mode
1441
1512
if kth_chunk. is_none ( ) {
1442
- // This object is responsible for creating the filename for each chunk.
1443
- let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix )
1444
- . map_err ( |e| io:: Error :: new ( ErrorKind :: Other , format ! ( "{e}" ) ) ) ?;
1445
- for _ in 0 ..num_chunks {
1446
- let filename = filename_iterator
1447
- . next ( )
1448
- . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
1449
- let writer = settings. instantiate_current_writer ( filename. as_str ( ) ) ?;
1450
- writers. push ( writer) ;
1451
- }
1513
+ out_files = get_out_files ( num_chunks, settings, false ) ?;
1452
1514
}
1453
1515
1454
1516
let num_chunks: usize = num_chunks. try_into ( ) . unwrap ( ) ;
@@ -1470,9 +1532,7 @@ where
1470
1532
}
1471
1533
}
1472
1534
None => {
1473
- let maybe_writer = writers. get_mut ( i % num_chunks) ;
1474
- let writer = maybe_writer. unwrap ( ) ;
1475
-
1535
+ let writer = out_files[ i % num_chunks] . get_writer ( settings) ?;
1476
1536
let writer_stdin_open = custom_write_all ( bytes, writer, settings) ?;
1477
1537
if !writer_stdin_open {
1478
1538
closed_writers += 1 ;
0 commit comments