
Commit 59eaef7

Peter Zijlstra authored and Ingo Molnar committed
x86/tsc: Remodel cyc2ns to use seqcount_latch()
Replace the custom multi-value scheme with the more regular
seqcount_latch() scheme. Along with scrapping a lot of lines, the
latch scheme is better documented and used in more places.

The immediate benefit, however, is not being limited on the update
side: the current code has a limit where the writers block, and that
limit is hit by future changes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 8309f86 commit 59eaef7
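The latch scheme the commit adopts is worth a quick illustration. Below is a minimal user-space sketch of the idea, with hypothetical names; C11 atomics stand in for the kernel's barriers, and the real code uses raw_write_seqcount_latch() on per-CPU data and is NMI-safe, which this sketch is not:

	/*
	 * Minimal user-space sketch of the seqcount-latch idea
	 * (hypothetical names; illustrative only, not kernel code).
	 */
	#include <stdatomic.h>

	struct payload {
		unsigned int mul, shift;
		unsigned long long offset;
	};

	struct latch {
		_Atomic unsigned int seq;
		struct payload data[2];
	};

	/* Writer: seq is odd while data[0] is updated, even while data[1] is. */
	static void latch_write(struct latch *l, const struct payload *val)
	{
		atomic_fetch_add_explicit(&l->seq, 1, memory_order_release);
		l->data[0] = *val;
		atomic_fetch_add_explicit(&l->seq, 1, memory_order_release);
		l->data[1] = *val;
	}

	/* Reader: idx = seq & 1 selects the copy the writer is NOT touching;
	 * retry only if the writer lapped us mid-copy.
	 */
	static void latch_read(struct latch *l, struct payload *out)
	{
		unsigned int seq;

		do {
			seq = atomic_load_explicit(&l->seq, memory_order_acquire);
			*out = l->data[seq & 1];
		} while (seq != atomic_load_explicit(&l->seq, memory_order_acquire));
	}

The reader never spins on the writer and the writer never waits on readers, which is the "not being limited on the update side" property the changelog calls out.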

File tree

4 files changed: +53 / -154 lines

arch/x86/events/core.c

Lines changed: 6 additions & 6 deletions

@@ -2255,7 +2255,7 @@ static struct pmu pmu = {
 void arch_perf_update_userpage(struct perf_event *event,
 			       struct perf_event_mmap_page *userpg, u64 now)
 {
-	struct cyc2ns_data *data;
+	struct cyc2ns_data data;
 	u64 offset;
 
 	userpg->cap_user_time = 0;
@@ -2267,17 +2267,17 @@ void arch_perf_update_userpage(struct perf_event *event,
 	if (!using_native_sched_clock() || !sched_clock_stable())
 		return;
 
-	data = cyc2ns_read_begin();
+	cyc2ns_read_begin(&data);
 
-	offset = data->cyc2ns_offset + __sched_clock_offset;
+	offset = data.cyc2ns_offset + __sched_clock_offset;
 
 	/*
 	 * Internal timekeeping for enabled/running/stopped times
 	 * is always in the local_clock domain.
 	 */
 	userpg->cap_user_time = 1;
-	userpg->time_mult = data->cyc2ns_mul;
-	userpg->time_shift = data->cyc2ns_shift;
+	userpg->time_mult = data.cyc2ns_mul;
+	userpg->time_shift = data.cyc2ns_shift;
 	userpg->time_offset = offset - now;
 
 	/*
@@ -2289,7 +2289,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 		userpg->time_zero = offset;
 	}
 
-	cyc2ns_read_end(data);
+	cyc2ns_read_end();
 }
 
 void
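For context on what arch_perf_update_userpage() feeds: user space combines these fields to turn a raw cycle count into sched_clock() nanoseconds without a syscall, following the recipe documented for perf_event_mmap_page in perf_event.h. A sketch of that reader-side computation (the split multiply keeps the intermediate product from overflowing 64 bits):

	#include <stdint.h>

	/* Reader-side recipe from the perf_event_mmap_page documentation:
	 * convert a raw cycle count to nanoseconds using the time_shift,
	 * time_mult and time_offset fields the kernel fills in above.
	 */
	static uint64_t cyc_to_ns(uint64_t cyc, uint16_t time_shift,
				  uint32_t time_mult, uint64_t time_offset)
	{
		uint64_t quot = cyc >> time_shift;
		uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

		/* Split multiply so rem * time_mult cannot overflow. */
		return time_offset + quot * time_mult +
		       ((rem * time_mult) >> time_shift);
	}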

arch/x86/include/asm/timer.h

Lines changed: 3 additions & 5 deletions

@@ -29,11 +29,9 @@ struct cyc2ns_data {
 	u32 cyc2ns_mul;
 	u32 cyc2ns_shift;
 	u64 cyc2ns_offset;
-	u32 __count;
-	/* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
 
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
 
 #endif /* _ASM_X86_TIMER_H */
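A side note on the size comments: dropping __count also drops the 4-byte padding hole in front of the u64, which is why the struct shrinks from 24 to 16 bytes, and why two copies plus a seqcount_t now fit comfortably in one 64-byte cacheline. A user-space sketch of the arithmetic, assuming the usual layout where u32 is 4 bytes and u64 is 8-byte aligned:

	#include <assert.h>
	#include <stdint.h>

	typedef uint32_t u32;
	typedef uint64_t u64;

	struct cyc2ns_data_old {
		u32 cyc2ns_mul;
		u32 cyc2ns_shift;
		u64 cyc2ns_offset;
		u32 __count;
		/* u32 hole: the compiler pads up to u64 alignment */
	};

	struct cyc2ns_data_new {
		u32 cyc2ns_mul;
		u32 cyc2ns_shift;
		u64 cyc2ns_offset;
	};

	static_assert(sizeof(struct cyc2ns_data_old) == 24, "24 bytes -- do not grow");
	static_assert(sizeof(struct cyc2ns_data_new) == 16, "16 bytes");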

arch/x86/kernel/tsc.c

Lines changed: 36 additions & 137 deletions

@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
 static u64 art_to_tsc_offset;
 struct clocksource *art_related_clocksource;
 
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
 struct cyc2ns {
-	struct cyc2ns_data data[2];	/*  0 + 2*24 = 48 */
-	struct cyc2ns_data *head;	/* 48 + 8    = 56 */
-	struct cyc2ns_data *tail;	/* 56 + 8    = 64 */
-}; /* exactly fits one cacheline */
+	struct cyc2ns_data data[2];	/*  0 + 2*16 = 32 */
+	seqcount_t	   seq;		/* 32 + 4    = 36 */
 
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
-	struct cyc2ns_data *head;
+}; /* fits one cacheline */
 
-	preempt_disable();
-
-	head = this_cpu_read(cyc2ns.head);
-	/*
-	 * Ensure we observe the entry when we observe the pointer to it.
-	 * matches the wmb from cyc2ns_write_end().
-	 */
-	smp_read_barrier_depends();
-	head->__count++;
-	barrier();
-
-	return head;
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
-void cyc2ns_read_end(struct cyc2ns_data *head)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
 {
-	barrier();
-	/*
-	 * If we're the outer most nested read; update the tail pointer
-	 * when we're done. This notifies possible pending writers
-	 * that we've observed the head pointer and that the other
-	 * entry is now free.
-	 */
-	if (!--head->__count) {
-		/*
-		 * x86-TSO does not reorder writes with older reads;
-		 * therefore once this write becomes visible to another
-		 * cpu, we must be finished reading the cyc2ns_data.
-		 *
-		 * matches with cyc2ns_write_begin().
-		 */
-		this_cpu_write(cyc2ns.tail, head);
-	}
-	preempt_enable();
-}
+	int seq, idx;
 
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
-{
-	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-	struct cyc2ns_data *data = c2n->data;
-
-	if (data == c2n->head)
-		data++;
+	preempt_disable_notrace();
 
-	/* XXX send an IPI to @cpu in order to guarantee a read? */
+	do {
+		seq = this_cpu_read(cyc2ns.seq.sequence);
+		idx = seq & 1;
 
-	/*
-	 * When we observe the tail write from cyc2ns_read_end(),
-	 * the cpu must be done with that entry and its safe
-	 * to start writing to it.
-	 */
-	while (c2n->tail == data)
-		cpu_relax();
+		data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+		data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+		data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
 
-	return data;
+	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
 }
 
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
 {
-	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
-	/*
-	 * Ensure the @data writes are visible before we publish the
-	 * entry. Matches the data-depencency in cyc2ns_read_begin().
-	 */
-	smp_wmb();
-
-	ACCESS_ONCE(c2n->head) = data;
+	preempt_enable_notrace();
 }
 
 /*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
 	data->cyc2ns_mul = 0;
 	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
-	data->__count = 0;
 }
 
 static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
 	cyc2ns_data_init(&c2n->data[0]);
 	cyc2ns_data_init(&c2n->data[1]);
 
-	c2n->head = c2n->data;
-	c2n->tail = c2n->data;
+	seqcount_init(&c2n->seq);
 }
 
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
-	struct cyc2ns_data *data, *tail;
+	struct cyc2ns_data data;
 	unsigned long long ns;
 
-	/*
-	 * See cyc2ns_read_*() for details; replicated in order to avoid
-	 * an extra few instructions that came with the abstraction.
-	 * Notable, it allows us to only do the __count and tail update
-	 * dance when its actually needed.
-	 */
-
-	preempt_disable_notrace();
-	data = this_cpu_read(cyc2ns.head);
-	tail = this_cpu_read(cyc2ns.tail);
-
-	if (likely(data == tail)) {
-		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
-	} else {
-		data->__count++;
+	cyc2ns_read_begin(&data);
 
-		barrier();
+	ns = data.cyc2ns_offset;
+	ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
 
-		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
-
-		barrier();
-
-		if (!--data->__count)
-			this_cpu_write(cyc2ns.tail, data);
-	}
-	preempt_enable_notrace();
+	cyc2ns_read_end();
 
 	return ns;
 }
 
 static void set_cyc2ns_scale(unsigned long khz, int cpu)
 {
 	unsigned long long tsc_now, ns_now;
-	struct cyc2ns_data *data;
+	struct cyc2ns_data data;
+	struct cyc2ns *c2n;
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -254,8 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
 	if (!khz)
 		goto done;
 
-	data = cyc2ns_write_begin(cpu);
-
 	tsc_now = rdtsc();
 	ns_now = cycles_2_ns(tsc_now);
 
@@ -264,7 +158,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
	 */
-	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+	clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
 			       NSEC_PER_MSEC, 0);
 
 	/*
@@ -273,15 +167,20 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
 	 * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
 	 * value) - refer perf_event_mmap_page documentation in perf_event.h.
 	 */
-	if (data->cyc2ns_shift == 32) {
-		data->cyc2ns_shift = 31;
-		data->cyc2ns_mul >>= 1;
+	if (data.cyc2ns_shift == 32) {
+		data.cyc2ns_shift = 31;
+		data.cyc2ns_mul >>= 1;
 	}
 
-	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+	data.cyc2ns_offset = ns_now -
+		mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+	c2n = per_cpu_ptr(&cyc2ns, cpu);
 
-	cyc2ns_write_end(cpu, data);
+	raw_write_seqcount_latch(&c2n->seq);
+	c2n->data[0] = data;
+	raw_write_seqcount_latch(&c2n->seq);
+	c2n->data[1] = data;
 
 done:
 	sched_clock_idle_wakeup_event(0);
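The conversion math itself is unchanged by this commit: ns = offset + (cyc * mul) >> shift, evaluated with mul_u64_u32_shr(). A user-space sketch with assumed values for a 3 GHz TSC, where mul / 2^shift approximates 1/3 ns per cycle; unsigned __int128 is a GCC/Clang extension standing in for the kernel helper:

	#include <stdint.h>
	#include <stdio.h>

	/* Same arithmetic as the kernel's mul_u64_u32_shr() helper. */
	static uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mul, unsigned int shift)
	{
		return (uint64_t)(((unsigned __int128)cyc * mul) >> shift);
	}

	int main(void)
	{
		/* Assumed values in the style set_cyc2ns_scale() computes for
		 * a 3 GHz TSC: mul / 2^shift ~= 1e6 / khz = 1/3 ns per cycle.
		 */
		uint32_t mul = 715827883;	/* ~(1 << 31) / 3 */
		uint32_t shift = 31;
		uint64_t cyc = 3000000000ULL;	/* one second at 3 GHz */

		/* Prints 1000000000: one second's worth of nanoseconds. */
		printf("%llu ns\n", (unsigned long long)mul_u64_u32_shr(cyc, mul, shift));
		return 0;
	}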

arch/x86/platform/uv/tlb_uv.c

Lines changed: 8 additions & 6 deletions

@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
  */
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
-	struct cyc2ns_data *data = cyc2ns_read_begin();
+	struct cyc2ns_data data;
 	unsigned long long ns;
 
-	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+	cyc2ns_read_begin(&data);
+	ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+	cyc2ns_read_end();
 
-	cyc2ns_read_end(data);
 	return ns;
 }
 
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
  */
 static inline unsigned long long ns_2_cycles(unsigned long long ns)
 {
-	struct cyc2ns_data *data = cyc2ns_read_begin();
+	struct cyc2ns_data data;
 	unsigned long long cyc;
 
-	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+	cyc2ns_read_begin(&data);
+	cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+	cyc2ns_read_end();
 
-	cyc2ns_read_end(data);
 	return cyc;
 }
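ns_2_cycles() is the exact inverse: cyc = (ns << shift) / mul. A quick user-space check with the same assumed 3 GHz values as above; note that the 64-bit ns << 31 limits the usable input range to ns < 2^33 (about 8.6 seconds), ample for the short timeouts this path computes:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t mul = 715827883, shift = 31;	/* ~1/3 ns/cycle, 3 GHz */
		uint64_t ns = 1000;			/* one microsecond */

		/* Inverse of cycles_2_ns(); prints 2999, since integer
		 * division truncates the ~3000-cycle result.
		 */
		uint64_t cyc = (ns << shift) / mul;

		printf("%llu cycles\n", (unsigned long long)cyc);
		return 0;
	}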
