1
- // spell-checker:ignore nonrepeating
2
-
3
- // TODO: this iterator is not compatible with GNU when --random-source is used
4
-
5
- use std:: { collections:: HashSet , ops:: RangeInclusive } ;
1
+ use std:: collections:: HashMap ;
2
+ use std:: ops:: RangeInclusive ;
6
3
7
4
use uucore:: error:: UResult ;
8
5
9
6
use crate :: WrappedRng ;
10
7
11
- enum NumberSet {
12
- AlreadyListed ( HashSet < u64 > ) ,
13
- Remaining ( Vec < u64 > ) ,
14
- }
15
-
8
+ /// An iterator that samples from an integer range without repetition.
9
+ ///
10
+ /// This is based on Fisher-Yates, and it's required for backward compatibility
11
+ /// that it behaves exactly like Fisher-Yates if --random-source or --random-seed
12
+ /// is used. But we have a few tricks:
13
+ ///
14
+ /// - In the beginning we use a hash table instead of an array. This way we lazily
15
+ /// keep track of swaps without allocating the entire range upfront.
16
+ ///
17
+ /// - When the hash table starts to get big relative to the remaining items
18
+ /// we switch over to an array.
19
+ ///
20
+ /// - We store the array backwards so that we can shrink it as we go and free excess
21
+ /// memory every now and then.
22
+ ///
23
+ /// Both the hash table and the array give the same output.
24
+ ///
25
+ /// There's room for optimization:
26
+ ///
27
+ /// - Switching over from the hash table to the array is costly. If we happen to know
28
+ /// (through --head-count) that only few draws remain then it would be better not
29
+ /// to switch.
30
+ ///
31
+ /// - If the entire range gets used then we might as well allocate an array to start
32
+ /// with. But if the user e.g. pipes through `head` rather than using --head-count
33
+ /// we can't know whether that's the case, so there's a tradeoff.
34
+ ///
35
+ /// GNU decides the other way: --head-count is noticeably faster than | head.
16
36
pub ( crate ) struct NonrepeatingIterator < ' a > {
17
- range : RangeInclusive < u64 > ,
18
37
rng : & ' a mut WrappedRng ,
19
- remaining_count : u64 ,
20
- buf : NumberSet ,
38
+ values : Values ,
39
+ }
40
+
41
+ enum Values {
42
+ Full ( Vec < u64 > ) ,
43
+ Sparse ( RangeInclusive < u64 > , HashMap < u64 , u64 > ) ,
21
44
}
22
45
23
46
impl < ' a > NonrepeatingIterator < ' a > {
24
- pub ( crate ) fn new ( range : RangeInclusive < u64 > , rng : & ' a mut WrappedRng , amount : u64 ) -> Self {
25
- let capped_amount = if range. start ( ) > range. end ( ) {
26
- 0
27
- } else if range == ( 0 ..=u64:: MAX ) {
28
- amount
29
- } else {
30
- amount. min ( range. end ( ) - range. start ( ) + 1 )
31
- } ;
32
- NonrepeatingIterator {
33
- range,
34
- rng,
35
- remaining_count : capped_amount,
36
- buf : NumberSet :: AlreadyListed ( HashSet :: default ( ) ) ,
37
- }
47
+ pub ( crate ) fn new ( range : RangeInclusive < u64 > , rng : & ' a mut WrappedRng ) -> Self {
48
+ let values = Values :: Sparse ( range, HashMap :: default ( ) ) ;
49
+ NonrepeatingIterator { rng, values }
38
50
}
39
51
40
52
fn produce ( & mut self ) -> UResult < u64 > {
41
- debug_assert ! ( self . range. start( ) <= self . range. end( ) ) ;
42
- match & mut self . buf {
43
- NumberSet :: AlreadyListed ( already_listed) => {
44
- let chosen = loop {
45
- let guess = self . rng . choose_from_range ( self . range . clone ( ) ) ?;
46
- let newly_inserted = already_listed. insert ( guess) ;
47
- if newly_inserted {
48
- break guess;
49
- }
50
- } ;
51
- // Once a significant fraction of the interval has already been enumerated,
52
- // the number of attempts to find a number that hasn't been chosen yet increases.
53
- // Therefore, we need to switch at some point from "set of already returned values" to "list of remaining values".
54
- let range_size = ( self . range . end ( ) - self . range . start ( ) ) . saturating_add ( 1 ) ;
55
- if number_set_should_list_remaining ( already_listed. len ( ) as u64 , range_size) {
56
- let mut remaining = self
57
- . range
58
- . clone ( )
59
- . filter ( |n| !already_listed. contains ( n) )
60
- . collect :: < Vec < _ > > ( ) ;
61
- assert ! ( remaining. len( ) as u64 >= self . remaining_count) ;
62
- remaining. truncate ( self . remaining_count as usize ) ;
63
- self . rng . shuffle ( & mut remaining, usize:: MAX ) ?;
64
- self . buf = NumberSet :: Remaining ( remaining) ;
53
+ match & mut self . values {
54
+ Values :: Full ( items) => {
55
+ let this_idx = items. len ( ) - 1 ;
56
+
57
+ let other_idx = self . rng . choose_from_range ( 0 ..=items. len ( ) as u64 - 1 ) ? as usize ;
58
+ // Flip the index to pretend we're going left-to-right
59
+ let other_idx = items. len ( ) - other_idx - 1 ;
60
+
61
+ items. swap ( this_idx, other_idx) ;
62
+
63
+ let val = items. pop ( ) . unwrap ( ) ;
64
+ if items. len ( ) . is_power_of_two ( ) && items. len ( ) >= 512 {
65
+ items. shrink_to_fit ( ) ;
65
66
}
66
- Ok ( chosen )
67
+ Ok ( val )
67
68
}
68
- NumberSet :: Remaining ( remaining_numbers) => {
69
- debug_assert ! ( !remaining_numbers. is_empty( ) ) ;
70
- // We only enter produce() when there is at least one actual element remaining, so popping must always return an element.
71
- Ok ( remaining_numbers. pop ( ) . unwrap ( ) )
69
+ Values :: Sparse ( range, items) => {
70
+ let this_idx = * range. start ( ) ;
71
+ let this_val = items. remove ( & this_idx) . unwrap_or ( this_idx) ;
72
+
73
+ let other_idx = self . rng . choose_from_range ( range. clone ( ) ) ?;
74
+
75
+ let val = if this_idx != other_idx {
76
+ items. insert ( other_idx, this_val) . unwrap_or ( other_idx)
77
+ } else {
78
+ this_val
79
+ } ;
80
+ * range = * range. start ( ) + 1 ..=* range. end ( ) ;
81
+
82
+ Ok ( val)
72
83
}
73
84
}
74
85
}
@@ -77,101 +88,24 @@ impl<'a> NonrepeatingIterator<'a> {
77
88
impl Iterator for NonrepeatingIterator < ' _ > {
78
89
type Item = UResult < u64 > ;
79
90
80
- fn next ( & mut self ) -> Option < UResult < u64 > > {
81
- if self . range . is_empty ( ) || self . remaining_count == 0 {
82
- return None ;
91
+ fn next ( & mut self ) -> Option < Self :: Item > {
92
+ match & self . values {
93
+ Values :: Full ( items) if items. is_empty ( ) => return None ,
94
+ Values :: Full ( _) => ( ) ,
95
+ Values :: Sparse ( range, _) if range. is_empty ( ) => return None ,
96
+ Values :: Sparse ( range, items) => {
97
+ let range_len = range. size_hint ( ) . 0 as u64 ;
98
+ if range_len > 16 && items. len ( ) as u64 >= range_len / 8 {
99
+ self . values = Values :: Full ( hashmap_to_vec ( range. clone ( ) , items) ) ;
100
+ }
101
+ }
83
102
}
84
- self . remaining_count -= 1 ;
103
+
85
104
Some ( self . produce ( ) )
86
105
}
87
106
}
88
107
89
- // This could be a method, but it is much easier to test as a stand-alone function.
90
- fn number_set_should_list_remaining ( listed_count : u64 , range_size : u64 ) -> bool {
91
- // Arbitrarily determine the switchover point to be around 25%. This is because:
92
- // - HashSet has a large space overhead for the hash table load factor.
93
- // - This means that somewhere between 25-40%, the memory required for a "positive" HashSet and a "negative" Vec should be the same.
94
- // - HashSet has a small but non-negligible overhead for each lookup, so we have a slight preference for Vec anyway.
95
- // - At 25%, on average 1.33 attempts are needed to find a number that hasn't been taken yet.
96
- // - Finally, "24%" is computationally the simplest:
97
- listed_count >= range_size / 4
98
- }
99
-
100
- #[ cfg( test) ]
101
- // Since the computed value is a bool, it is more readable to write the expected value out:
102
- #[ allow( clippy:: bool_assert_comparison) ]
103
- mod test_number_set_decision {
104
- use super :: number_set_should_list_remaining;
105
-
106
- #[ test]
107
- fn test_stay_positive_large_remaining_first ( ) {
108
- assert_eq ! ( false , number_set_should_list_remaining( 0 , u64 :: MAX ) ) ;
109
- }
110
-
111
- #[ test]
112
- fn test_stay_positive_large_remaining_second ( ) {
113
- assert_eq ! ( false , number_set_should_list_remaining( 1 , u64 :: MAX ) ) ;
114
- }
115
-
116
- #[ test]
117
- fn test_stay_positive_large_remaining_tenth ( ) {
118
- assert_eq ! ( false , number_set_should_list_remaining( 9 , u64 :: MAX ) ) ;
119
- }
120
-
121
- #[ test]
122
- fn test_stay_positive_smallish_range_first ( ) {
123
- assert_eq ! ( false , number_set_should_list_remaining( 0 , 12345 ) ) ;
124
- }
125
-
126
- #[ test]
127
- fn test_stay_positive_smallish_range_second ( ) {
128
- assert_eq ! ( false , number_set_should_list_remaining( 1 , 12345 ) ) ;
129
- }
130
-
131
- #[ test]
132
- fn test_stay_positive_smallish_range_tenth ( ) {
133
- assert_eq ! ( false , number_set_should_list_remaining( 9 , 12345 ) ) ;
134
- }
135
-
136
- #[ test]
137
- fn test_stay_positive_small_range_not_too_early ( ) {
138
- assert_eq ! ( false , number_set_should_list_remaining( 1 , 10 ) ) ;
139
- }
140
-
141
- // Don't want to test close to the border, in case we decide to change the threshold.
142
- // However, at 50% coverage, we absolutely should switch:
143
- #[ test]
144
- fn test_switch_half ( ) {
145
- assert_eq ! ( true , number_set_should_list_remaining( 1234 , 2468 ) ) ;
146
- }
147
-
148
- // Ensure that the decision is monotonous:
149
- #[ test]
150
- fn test_switch_late1 ( ) {
151
- assert_eq ! ( true , number_set_should_list_remaining( 12340 , 12345 ) ) ;
152
- }
153
-
154
- #[ test]
155
- fn test_switch_late2 ( ) {
156
- assert_eq ! ( true , number_set_should_list_remaining( 12344 , 12345 ) ) ;
157
- }
158
-
159
- // Ensure that we are overflow-free:
160
- #[ test]
161
- fn test_no_crash_exceed_max_size1 ( ) {
162
- assert_eq ! ( false , number_set_should_list_remaining( 12345 , u64 :: MAX ) ) ;
163
- }
164
-
165
- #[ test]
166
- fn test_no_crash_exceed_max_size2 ( ) {
167
- assert_eq ! (
168
- true ,
169
- number_set_should_list_remaining( u64 :: MAX - 1 , u64 :: MAX )
170
- ) ;
171
- }
172
-
173
- #[ test]
174
- fn test_no_crash_exceed_max_size3 ( ) {
175
- assert_eq ! ( true , number_set_should_list_remaining( u64 :: MAX , u64 :: MAX ) ) ;
176
- }
108
+ fn hashmap_to_vec ( range : RangeInclusive < u64 > , map : & HashMap < u64 , u64 > ) -> Vec < u64 > {
109
+ let lookup = |idx| * map. get ( & idx) . unwrap_or ( & idx) ;
110
+ range. rev ( ) . map ( lookup) . collect ( )
177
111
}
0 commit comments