@@ -919,204 +919,6 @@ impl Write for LineChunkWriter<'_> {
919
919
}
920
920
}
921
921
922
- /// Write lines to each sequential output files, limited by bytes.
923
- ///
924
- /// This struct maintains an underlying writer representing the
925
- /// current chunk of the output. On each call to [`write`], it writes
926
- /// as many lines as possible to the current chunk without exceeding
927
- /// the specified byte limit. If a single line has more bytes than the
928
- /// limit, then fill an entire single chunk with those bytes and
929
- /// handle the remainder of the line as if it were its own distinct
930
- /// line. As many new underlying writers are created as needed to
931
- /// write all the data in the input buffer.
932
- struct LineBytesChunkWriter < ' a > {
933
- /// Parameters for creating the underlying writer for each new chunk.
934
- settings : & ' a Settings ,
935
-
936
- /// The maximum number of bytes allowed for a single chunk of output.
937
- chunk_size : u64 ,
938
-
939
- /// Running total of number of chunks that have been completed.
940
- num_chunks_written : usize ,
941
-
942
- /// Remaining capacity in number of bytes in the current chunk.
943
- ///
944
- /// This number starts at `chunk_size` and decreases as lines are
945
- /// written. Once it reaches zero, a writer for a new chunk is
946
- /// initialized and this number gets reset to `chunk_size`.
947
- num_bytes_remaining_in_current_chunk : usize ,
948
-
949
- /// The underlying writer for the current chunk.
950
- ///
951
- /// Once the number of bytes written to this writer exceeds
952
- /// `chunk_size`, a new writer is initialized and assigned to this
953
- /// field.
954
- inner : BufWriter < Box < dyn Write > > ,
955
-
956
- /// Iterator that yields filenames for each chunk.
957
- filename_iterator : FilenameIterator < ' a > ,
958
- }
959
-
960
- impl < ' a > LineBytesChunkWriter < ' a > {
961
- fn new ( chunk_size : u64 , settings : & ' a Settings ) -> UResult < Self > {
962
- let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix ) ?;
963
- let filename = filename_iterator
964
- . next ( )
965
- . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
966
- if settings. verbose {
967
- println ! ( "creating file {}" , filename. quote( ) ) ;
968
- }
969
- let inner = settings. instantiate_current_writer ( & filename, true ) ?;
970
- Ok ( LineBytesChunkWriter {
971
- settings,
972
- chunk_size,
973
- num_bytes_remaining_in_current_chunk : usize:: try_from ( chunk_size) . unwrap ( ) ,
974
- num_chunks_written : 0 ,
975
- inner,
976
- filename_iterator,
977
- } )
978
- }
979
- }
980
-
981
- impl Write for LineBytesChunkWriter < ' _ > {
982
- /// Write as many lines to a chunk as possible without
983
- /// exceeding the byte limit. If a single line has more bytes
984
- /// than the limit, then fill an entire single chunk with those
985
- /// bytes and handle the remainder of the line as if it were
986
- /// its own distinct line.
987
- ///
988
- /// For example: if the `chunk_size` is 8 and the input is:
989
- ///
990
- /// ```text
991
- /// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
992
- /// ```
993
- ///
994
- /// then the output gets broken into chunks like this:
995
- ///
996
- /// ```text
997
- /// chunk 0 chunk 1 chunk 2 chunk 3
998
- ///
999
- /// 0 1 2
1000
- /// 01234567 89 01234 56789 012 345 6
1001
- /// |------| |-------| |--------| |---|
1002
- /// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
1003
- /// ```
1004
- ///
1005
- /// Implements `--line-bytes=SIZE`
1006
- fn write ( & mut self , mut buf : & [ u8 ] ) -> std:: io:: Result < usize > {
1007
- // The total number of bytes written during the loop below.
1008
- //
1009
- // It is necessary to keep this running total because we may
1010
- // be making multiple calls to `write()` on multiple different
1011
- // underlying writers and we want the final reported number of
1012
- // bytes written to reflect the total number of bytes written
1013
- // to all of the underlying writers.
1014
- let mut total_bytes_written = 0 ;
1015
-
1016
- // Loop until we have written all bytes in the input buffer
1017
- // (or an IO error occurs).
1018
- loop {
1019
- // If the buffer is empty, then we are done writing.
1020
- if buf. is_empty ( ) {
1021
- return Ok ( total_bytes_written) ;
1022
- }
1023
-
1024
- // If we have filled the current chunk with bytes, then
1025
- // start a new chunk and initialize its corresponding
1026
- // writer.
1027
- if self . num_bytes_remaining_in_current_chunk == 0 {
1028
- self . num_chunks_written += 1 ;
1029
- let filename = self . filename_iterator . next ( ) . ok_or_else ( || {
1030
- std:: io:: Error :: new ( ErrorKind :: Other , "output file suffixes exhausted" )
1031
- } ) ?;
1032
- if self . settings . verbose {
1033
- println ! ( "creating file {}" , filename. quote( ) ) ;
1034
- }
1035
- self . inner = self . settings . instantiate_current_writer ( & filename, true ) ?;
1036
- self . num_bytes_remaining_in_current_chunk = self . chunk_size . try_into ( ) . unwrap ( ) ;
1037
- }
1038
-
1039
- // Find the first separator (default - newline character) in the buffer.
1040
- let sep = self . settings . separator ;
1041
- match memchr:: memchr ( sep, buf) {
1042
- // If there is no separator character and the buffer is
1043
- // not empty, then write as many bytes as we can and
1044
- // then move on to the next chunk if necessary.
1045
- None => {
1046
- let end = self . num_bytes_remaining_in_current_chunk ;
1047
-
1048
- // This is ugly but here to match GNU behavior. If the input
1049
- // doesn't end with a separator, pretend that it does for handling
1050
- // the second to last segment chunk. See `line-bytes.sh`.
1051
- if end == buf. len ( )
1052
- && self . num_bytes_remaining_in_current_chunk
1053
- < self . chunk_size . try_into ( ) . unwrap ( )
1054
- && buf[ buf. len ( ) - 1 ] != sep
1055
- {
1056
- self . num_bytes_remaining_in_current_chunk = 0 ;
1057
- } else {
1058
- let num_bytes_written = custom_write (
1059
- & buf[ ..end. min ( buf. len ( ) ) ] ,
1060
- & mut self . inner ,
1061
- self . settings ,
1062
- ) ?;
1063
- self . num_bytes_remaining_in_current_chunk -= num_bytes_written;
1064
- total_bytes_written += num_bytes_written;
1065
- buf = & buf[ num_bytes_written..] ;
1066
- }
1067
- }
1068
-
1069
- // If there is a separator character and the line
1070
- // (including the separator character) will fit in the
1071
- // current chunk, then write the entire line and
1072
- // continue to the next iteration. (See chunk 1 in the
1073
- // example comment above.)
1074
- Some ( i) if i < self . num_bytes_remaining_in_current_chunk => {
1075
- let num_bytes_written =
1076
- custom_write ( & buf[ ..=i] , & mut self . inner , self . settings ) ?;
1077
- self . num_bytes_remaining_in_current_chunk -= num_bytes_written;
1078
- total_bytes_written += num_bytes_written;
1079
- buf = & buf[ num_bytes_written..] ;
1080
- }
1081
-
1082
- // If there is a separator character, the line
1083
- // (including the separator character) will not fit in
1084
- // the current chunk, *and* no other lines have been
1085
- // written to the current chunk, then write as many
1086
- // bytes as we can and continue to the next
1087
- // iteration. (See chunk 0 in the example comment
1088
- // above.)
1089
- Some ( _)
1090
- if self . num_bytes_remaining_in_current_chunk
1091
- == self . chunk_size . try_into ( ) . unwrap ( ) =>
1092
- {
1093
- let end = self . num_bytes_remaining_in_current_chunk ;
1094
- let num_bytes_written =
1095
- custom_write ( & buf[ ..end] , & mut self . inner , self . settings ) ?;
1096
- self . num_bytes_remaining_in_current_chunk -= num_bytes_written;
1097
- total_bytes_written += num_bytes_written;
1098
- buf = & buf[ num_bytes_written..] ;
1099
- }
1100
-
1101
- // If there is a separator character, the line
1102
- // (including the separator character) will not fit in
1103
- // the current chunk, and at least one other line has
1104
- // been written to the current chunk, then signal to
1105
- // the next iteration that a new chunk needs to be
1106
- // created and continue to the next iteration of the
1107
- // loop to try writing the line there.
1108
- Some ( _) => {
1109
- self . num_bytes_remaining_in_current_chunk = 0 ;
1110
- }
1111
- }
1112
- }
1113
- }
1114
-
1115
- fn flush ( & mut self ) -> std:: io:: Result < ( ) > {
1116
- self . inner . flush ( )
1117
- }
1118
- }
1119
-
1120
922
/// Output file parameters
1121
923
struct OutFile {
1122
924
filename : String ,
@@ -1629,6 +1431,114 @@ where
1629
1431
Ok ( ( ) )
1630
1432
}
1631
1433
1434
+ /// Like `std::io::Lines`, but includes the line ending character.
1435
+ ///
1436
+ /// This struct is generally created by calling `lines_with_sep` on a
1437
+ /// reader.
1438
+ pub struct LinesWithSep < R > {
1439
+ inner : R ,
1440
+ separator : u8 ,
1441
+ }
1442
+
1443
+ impl < R > Iterator for LinesWithSep < R >
1444
+ where
1445
+ R : BufRead ,
1446
+ {
1447
+ type Item = std:: io:: Result < Vec < u8 > > ;
1448
+
1449
+ /// Read bytes from a buffer up to the requested number of lines.
1450
+ fn next ( & mut self ) -> Option < Self :: Item > {
1451
+ let mut buf = vec ! [ ] ;
1452
+ match self . inner . read_until ( self . separator , & mut buf) {
1453
+ Ok ( 0 ) => None ,
1454
+ Ok ( _) => Some ( Ok ( buf) ) ,
1455
+ Err ( e) => Some ( Err ( e) ) ,
1456
+ }
1457
+ }
1458
+ }
1459
+
1460
+ /// Like `std::str::lines` but includes the line ending character.
1461
+ ///
1462
+ /// The `separator` defines the character to interpret as the line
1463
+ /// ending. For the usual notion of "line", set this to `b'\n'`.
1464
+ pub fn lines_with_sep < R > ( reader : R , separator : u8 ) -> LinesWithSep < R >
1465
+ where
1466
+ R : BufRead ,
1467
+ {
1468
+ LinesWithSep {
1469
+ inner : reader,
1470
+ separator,
1471
+ }
1472
+ }
1473
+
1474
+ fn line_bytes < R > ( settings : & Settings , reader : & mut R , chunk_size : usize ) -> UResult < ( ) >
1475
+ where
1476
+ R : BufRead ,
1477
+ {
1478
+ let mut filename_iterator = FilenameIterator :: new ( & settings. prefix , & settings. suffix ) ?;
1479
+
1480
+ // Initialize the writer just to satisfy the compiler. It is going
1481
+ // to be overwritten for sure at the beginning of the loop below
1482
+ // because we start with `remaining == 0`, indicating that a new
1483
+ // chunk should start.
1484
+ let mut writer: BufWriter < Box < dyn Write > > =
1485
+ BufWriter :: new ( Box :: new ( std:: io:: Cursor :: new ( vec ! [ ] ) ) ) ;
1486
+
1487
+ let mut remaining = 0 ;
1488
+ for line in lines_with_sep ( reader, settings. separator ) {
1489
+ let line = line?;
1490
+ let mut line = & line[ ..] ;
1491
+ loop {
1492
+ if remaining == 0 {
1493
+ let filename = filename_iterator
1494
+ . next ( )
1495
+ . ok_or_else ( || USimpleError :: new ( 1 , "output file suffixes exhausted" ) ) ?;
1496
+ if settings. verbose {
1497
+ println ! ( "creating file {}" , filename. quote( ) ) ;
1498
+ }
1499
+ writer = settings. instantiate_current_writer ( & filename, true ) ?;
1500
+ remaining = chunk_size;
1501
+ }
1502
+
1503
+ // Special case: if this is the last line and it doesn't end
1504
+ // with a newline character, then count its length as though
1505
+ // it did end with a newline. If that puts it over the edge
1506
+ // of this chunk, continue to the next chunk.
1507
+ if line. len ( ) == remaining
1508
+ && remaining < chunk_size
1509
+ && line[ line. len ( ) - 1 ] != settings. separator
1510
+ {
1511
+ remaining = 0 ;
1512
+ continue ;
1513
+ }
1514
+
1515
+ // If the entire line fits in this chunk, write it and
1516
+ // continue to the next line.
1517
+ if line. len ( ) <= remaining {
1518
+ custom_write_all ( line, & mut writer, settings) ?;
1519
+ remaining -= line. len ( ) ;
1520
+ break ;
1521
+ }
1522
+
1523
+ // If the line is too large to fit in *any* chunk and we are
1524
+ // at the start of a new chunk, write as much as we can of
1525
+ // it and pass the remainder along to the next chunk.
1526
+ if line. len ( ) > chunk_size && remaining == chunk_size {
1527
+ custom_write_all ( & line[ ..chunk_size] , & mut writer, settings) ?;
1528
+ line = & line[ chunk_size..] ;
1529
+ remaining = 0 ;
1530
+ continue ;
1531
+ }
1532
+
1533
+ // If the line is too large to fit in *this* chunk, but
1534
+ // might otherwise fit in the next chunk, then just continue
1535
+ // to the next chunk and let it be handled there.
1536
+ remaining = 0 ;
1537
+ }
1538
+ }
1539
+ Ok ( ( ) )
1540
+ }
1541
+
1632
1542
#[ allow( clippy:: cognitive_complexity) ]
1633
1543
fn split ( settings : & Settings ) -> UResult < ( ) > {
1634
1544
let r_box = if settings. input == "-" {
@@ -1701,23 +1611,6 @@ fn split(settings: &Settings) -> UResult<()> {
1701
1611
} ,
1702
1612
}
1703
1613
}
1704
- Strategy :: LineBytes ( chunk_size) => {
1705
- let mut writer = LineBytesChunkWriter :: new ( chunk_size, settings) ?;
1706
- match std:: io:: copy ( & mut reader, & mut writer) {
1707
- Ok ( _) => Ok ( ( ) ) ,
1708
- Err ( e) => match e. kind ( ) {
1709
- // TODO Since the writer object controls the creation of
1710
- // new files, we need to rely on the `std::io::Result`
1711
- // returned by its `write()` method to communicate any
1712
- // errors to this calling scope. If a new file cannot be
1713
- // created because we have exceeded the number of
1714
- // allowable filenames, we use `ErrorKind::Other` to
1715
- // indicate that. A special error message needs to be
1716
- // printed in that case.
1717
- ErrorKind :: Other => Err ( USimpleError :: new ( 1 , format ! ( "{e}" ) ) ) ,
1718
- _ => Err ( uio_error ! ( e, "input/output error" ) ) ,
1719
- } ,
1720
- }
1721
- }
1614
+ Strategy :: LineBytes ( chunk_size) => line_bytes ( settings, & mut reader, chunk_size as usize ) ,
1722
1615
}
1723
1616
}
0 commit comments