  * menu.c - the menu idle governor
  *
  * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
+ * Copyright (C) 2009 Intel Corporation
+ * Author:
+ *        Arjan van de Ven <arjan@linux.intel.com>
  *
- * This code is licenced under the GPL.
+ * This code is licensed under the GPL version 2 as described
+ * in the COPYING file that accompanies the Linux Kernel.
  */
 
 #include <linux/kernel.h>
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
+#include <linux/sched.h>
 
-#define BREAK_FUZZ	4	/* 4 us */
-#define PRED_HISTORY_PCT	50
+#define BUCKETS 12
+#define RESOLUTION 1024
+#define DECAY 4
+#define MAX_INTERESTING 50000
+
+/*
+ * Concepts and ideas behind the menu governor
+ *
+ * For the menu governor, there are 3 decision factors for picking a C
+ * state:
+ * 1) Energy break even point
+ * 2) Performance impact
+ * 3) Latency tolerance (from pmqos infrastructure)
+ * These three factors are treated independently.
+ *
+ * Energy break even point
+ * -----------------------
+ * C state entry and exit have an energy cost, and a certain amount of time in
+ * the C state is required to actually break even on this cost. CPUIDLE
+ * provides us this duration in the "target_residency" field. So all that we
+ * need is a good prediction of how long we'll be idle. Like the traditional
+ * menu governor, we start with the actual known "next timer event" time.
+ *
+ * Since there are other sources of wakeups (interrupts for example) than
+ * the next timer event, this estimation is rather optimistic. To get a
+ * more realistic estimate, a correction factor is applied to the estimate,
+ * that is based on historic behavior. For example, if in the past the actual
+ * duration always was 50% of the next timer tick, the correction factor will
+ * be 0.5.
+ *
+ * menu uses a running average for this correction factor, however it uses a
+ * set of factors, not just a single factor. This stems from the realization
+ * that the ratio is dependent on the order of magnitude of the expected
+ * duration; if we expect 500 milliseconds of idle time the likelihood of
+ * getting an interrupt very early is much higher than if we expect 50
+ * microseconds of idle time. A second independent factor that has a big
+ * impact on the actual factor is whether there is (disk) IO outstanding or
+ * not.
+ * (as a special twist, we consider every sleep longer than 50 milliseconds
+ * as perfect; there are no power gains for sleeping longer than this)
+ *
+ * For these two reasons we keep an array of 12 independent factors, that gets
+ * indexed based on the magnitude of the expected duration as well as the
+ * "is IO outstanding" property.
+ *
+ * Limiting Performance Impact
+ * ---------------------------
+ * C states, especially those with large exit latencies, can have a real
+ * noticeable impact on workloads, which is not acceptable for most sysadmins,
+ * and in addition, less performance has a power price of its own.
+ *
+ * As a general rule of thumb, menu assumes that the following heuristic
+ * holds:
+ *     The busier the system, the less impact of C states is acceptable
+ *
+ * This rule-of-thumb is implemented using a performance-multiplier:
+ * If the exit latency times the performance multiplier is longer than
+ * the predicted duration, the C state is not considered a candidate
+ * for selection due to a too high performance impact. So the higher
+ * this multiplier is, the longer we need to be idle to pick a deep C
+ * state, and thus the less likely a busy CPU will hit such a deep
+ * C state.
+ *
+ * Two factors are used in determining this multiplier:
+ * a value of 10 is added for each point of "per cpu load average" we have.
+ * a value of 5 points is added for each process that is waiting for
+ * IO on this CPU.
+ * (these values are experimentally determined)
+ *
+ * The load average factor gives a longer term (few seconds) input to the
+ * decision, while the iowait value gives a cpu-local instantaneous input.
+ * The iowait factor may look low, but realize that this is also already
+ * represented in the system load average.
+ *
+ */
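The correction factor described above is stored scaled by RESOLUTION * DECAY, so the prediction itself is plain fixed-point arithmetic. A minimal userspace sketch of that calculation (illustrative only, not part of this patch; the constants mirror the ones defined above, and the "50% of the timer estimate" history value is made up):

#include <stdio.h>

#define RESOLUTION 1024
#define DECAY 4

int main(void)
{
	/* Hypothetical history: we usually sleep about half of the timer
	 * estimate, so the stored factor is ~0.5 * RESOLUTION * DECAY. */
	unsigned long long correction_factor = RESOLUTION * DECAY / 2;
	unsigned int expected_us = 1000;	/* next timer event in 1 ms */

	/* The same scaling menu_select() applies, with round-to-nearest: */
	unsigned long long predicted_us =
		(expected_us * correction_factor + (RESOLUTION * DECAY) / 2) /
		(RESOLUTION * DECAY);

	printf("predicted idle: %llu us\n", predicted_us);	/* ~500 us */
	return 0;
}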
 
 struct menu_device {
 	int		last_state_idx;
 
 	unsigned int	expected_us;
-	unsigned int	predicted_us;
-	unsigned int	current_predicted_us;
-	unsigned int	last_measured_us;
-	unsigned int	elapsed_us;
+	u64		predicted_us;
+	unsigned int	measured_us;
+	unsigned int	exit_us;
+	unsigned int	bucket;
+	u64		correction_factor[BUCKETS];
 };
 
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int get_loadavg(void)
+{
+	unsigned long this = this_cpu_load();
+
+
+	return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
+}
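LOAD_INT()/LOAD_FRAC() split the kernel's fixed-point load average (FIXED_1 == 1 << FSHIFT, with FSHIFT == 11) into an integer part and two fractional digits, so get_loadavg() yields roughly ten "points" per unit of load. A standalone sketch with a made-up load of 2.75 (the macros are copied here only so the example runs in userspace):

#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1 << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	unsigned long avg = (unsigned long)(2.75 * FIXED_1);	/* load of 2.75 */

	/* Mirrors get_loadavg(): 2.75 -> 2 * 10 + 75 / 10 = 27 points. */
	int points = (int)(LOAD_INT(avg) * 10 + LOAD_FRAC(avg) / 10);

	printf("%d\n", points);
	return 0;
}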
+
+static inline int which_bucket(unsigned int duration)
+{
+	int bucket = 0;
+
+	/*
+	 * We keep two groups of stats; one with IO pending,
+	 * one without.
+	 * This allows us to calculate
+	 * E(duration)|iowait
+	 */
+	if (nr_iowait_cpu())
+		bucket = BUCKETS/2;
+
+	if (duration < 10)
+		return bucket;
+	if (duration < 100)
+		return bucket + 1;
+	if (duration < 1000)
+		return bucket + 2;
+	if (duration < 10000)
+		return bucket + 3;
+	if (duration < 100000)
+		return bucket + 4;
+	return bucket + 5;
+}
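In other words, the bucket index is the decimal order of magnitude of the expected duration (in microseconds), moved into the upper half of the array when this CPU has tasks blocked on IO. A userspace restatement of the same mapping, with nr_iowait_cpu() replaced by a plain flag purely for illustration:

#include <stdio.h>

#define BUCKETS 12

static int example_bucket(unsigned int duration_us, int io_pending)
{
	int bucket = io_pending ? BUCKETS / 2 : 0;	/* stand-in for nr_iowait_cpu() */

	if (duration_us < 10)
		return bucket;
	if (duration_us < 100)
		return bucket + 1;
	if (duration_us < 1000)
		return bucket + 2;
	if (duration_us < 10000)
		return bucket + 3;
	if (duration_us < 100000)
		return bucket + 4;
	return bucket + 5;
}

int main(void)
{
	printf("%d\n", example_bucket(250, 0));	/* 2: 100-1000 us, no IO pending */
	printf("%d\n", example_bucket(250, 1));	/* 8: same range, IO pending */
	return 0;
}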
+
+/*
+ * Return a multiplier for the exit latency that is intended
+ * to take performance requirements into account.
+ * The more performance critical we estimate the system
+ * to be, the higher this multiplier, and thus the higher
+ * the barrier to go to an expensive C state.
+ */
+static inline int performance_multiplier(void)
+{
+	int mult = 1;
+
+	/* for higher loadavg, we are more reluctant */
+
+	mult += 2 * get_loadavg();
+
+	/* for IO wait tasks (per cpu!) we add 5x each */
+	mult += 10 * nr_iowait_cpu();
+
+	return mult;
+}
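To see how this gates state selection in menu_select() below: with a hypothetical load average of about 1.00 (roughly 10 points from get_loadavg()) and two tasks in iowait on this CPU, the multiplier becomes 1 + 2*10 + 10*2 = 41, so a state with a 100 us exit latency is only considered when the predicted idle time exceeds 4100 us. A sketch of that arithmetic (all input values invented for the example):

#include <stdio.h>

int main(void)
{
	int loadavg_points = 10;	/* get_loadavg() for a load of ~1.00 */
	int iowaiters = 2;		/* stand-in for nr_iowait_cpu() */

	int mult = 1 + 2 * loadavg_points + 10 * iowaiters;	/* = 41 */

	/* menu_select() skips states whose exit latency times this
	 * multiplier exceeds the predicted idle time. */
	unsigned int exit_latency_us = 100;
	printf("needs predicted idle > %u us\n", exit_latency_us * mult);
	return 0;
}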
+
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
 /**
@@ -38,37 +175,59 @@ static int menu_select(struct cpuidle_device *dev)
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
 	int i;
+	int multiplier;
+
+	data->last_state_idx = 0;
+	data->exit_us = 0;
 
 	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0)) {
-		data->last_state_idx = 0;
+	if (unlikely(latency_req == 0))
 		return 0;
-	}
 
-	/* determine the expected residency time */
+	/* determine the expected residency time, round up */
 	data->expected_us =
-	    (u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000;
+	    DIV_ROUND_UP((u32)ktime_to_ns(tick_nohz_get_sleep_length()), 1000);
+
+
+	data->bucket = which_bucket(data->expected_us);
+
+	multiplier = performance_multiplier();
+
+	/*
+	 * if the correction factor is 0 (eg first time init or cpu hotplug
+	 * etc), we actually want to start out with a unity factor.
+	 */
+	if (data->correction_factor[data->bucket] == 0)
+		data->correction_factor[data->bucket] = RESOLUTION * DECAY;
+
+	/* Make sure to round up for half microseconds */
+	data->predicted_us = DIV_ROUND_CLOSEST(
+		data->expected_us * data->correction_factor[data->bucket],
+		RESOLUTION * DECAY);
+
+	/*
+	 * We want to default to C1 (hlt), not to busy polling
+	 * unless the timer is happening really really soon.
+	 */
+	if (data->expected_us > 5)
+		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 
-	/* Recalculate predicted_us based on prediction_history_pct */
-	data->predicted_us *= PRED_HISTORY_PCT;
-	data->predicted_us += (100 - PRED_HISTORY_PCT) *
-				data->current_predicted_us;
-	data->predicted_us /= 100;
 
 	/* find the deepest idle state that satisfies our constraints */
-	for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) {
+	for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
 		struct cpuidle_state *s = &dev->states[i];
 
-		if (s->target_residency > data->expected_us)
-			break;
 		if (s->target_residency > data->predicted_us)
 			break;
 		if (s->exit_latency > latency_req)
 			break;
+		if (s->exit_latency * multiplier > data->predicted_us)
+			break;
+		data->exit_us = s->exit_latency;
+		data->last_state_idx = i;
 	}
 
-	data->last_state_idx = i - 1;
-	return i - 1;
+	return data->last_state_idx;
 }
 
 /**
@@ -85,35 +244,49 @@ static void menu_reflect(struct cpuidle_device *dev)
 	unsigned int last_idle_us = cpuidle_get_last_residency(dev);
 	struct cpuidle_state *target = &dev->states[last_idx];
 	unsigned int measured_us;
+	u64 new_factor;
 
 	/*
 	 * Ugh, this idle state doesn't support residency measurements, so we
 	 * are basically lost in the dark.  As a compromise, assume we slept
-	 * for one full standard timer tick.  However, be aware that this
-	 * could potentially result in a suboptimal state transition.
+	 * for the whole expected time.
 	 */
 	if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID)))
-		last_idle_us = USEC_PER_SEC / HZ;
+		last_idle_us = data->expected_us;
+
+
+	measured_us = last_idle_us;
 
 	/*
-	 * measured_us and elapsed_us are the cumulative idle time, since the
-	 * last time we were woken out of idle by an interrupt.
+	 * We correct for the exit latency; we are assuming here that the
+	 * exit latency happens after the event that we're interested in.
 	 */
-	if (data->elapsed_us <= data->elapsed_us + last_idle_us)
-		measured_us = data->elapsed_us + last_idle_us;
+	if (measured_us > data->exit_us)
+		measured_us -= data->exit_us;
+
+
+	/* update our correction ratio */
+
+	new_factor = data->correction_factor[data->bucket]
+			* (DECAY - 1) / DECAY;
+
+	if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING)
+		new_factor += RESOLUTION * measured_us / data->expected_us;
 	else
-		measured_us = -1;
+		/*
+		 * we were idle so long that we count it as a perfect
+		 * prediction
+		 */
+		new_factor += RESOLUTION;
 
-	/* Predict time until next break event */
-	data->current_predicted_us = max(measured_us, data->last_measured_us);
+	/*
+	 * We don't want 0 as factor; we always want at least
+	 * a tiny bit of estimated time.
+	 */
+	if (new_factor == 0)
+		new_factor = 1;
 
-	if (last_idle_us + BREAK_FUZZ <
-	    data->expected_us - target->exit_latency) {
-		data->last_measured_us = measured_us;
-		data->elapsed_us = 0;
-	} else {
-		data->elapsed_us = measured_us;
-	}
+	data->correction_factor[data->bucket] = new_factor;
 }
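The update above is an exponentially decaying average kept in the same RESOLUTION * DECAY fixed point that menu_select() divides out: three quarters of the old factor is kept and the new measured/expected ratio is mixed in. One update step as a userspace sketch (the 1000 us / 500 us figures are invented for the example):

#include <stdio.h>

#define RESOLUTION 1024
#define DECAY 4

int main(void)
{
	/* Start from the "unity" factor menu_select() seeds on first use. */
	unsigned long long factor = RESOLUTION * DECAY;		/* 4096 */

	/* One reflect step: we predicted 1000 us but only slept 500 us. */
	unsigned int expected_us = 1000, measured_us = 500;

	factor = factor * (DECAY - 1) / DECAY;			/* keep 3/4 of history */
	factor += (unsigned long long)RESOLUTION * measured_us / expected_us;

	/* 3072 + 512 = 3584, i.e. the factor drifts toward 0.875 of unity. */
	printf("%llu\n", factor);
	return 0;
}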
 
 /**