Skip to content

Commit 1018016

Browse files
Jason Low authored and Ingo Molnar committed
sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability
While running a database workload, we found a scalability issue with itimers. Much of the problem was caused by the thread_group_cputimer spinlock. Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16 socket machine), this caused more than 30% of total time spent trying to obtain this kernel lock to update these group timer stats. This patch converts the timers to 64-bit atomic variables and uses atomic add to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%. Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit ARM system using the generic atomics and did not find the overhead to be much of an issue. An explanation for why this isn't an issue is that 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks called periodically is not really apparent on smaller systems. Signed-off-by: Jason Low <jason.low2@hp.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Mike Galbraith <umgwanakikbuti@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: Scott J Norton <scott.norton@hp.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Waiman Long <Waiman.Long@hp.com> Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 7e5a2c1 commit 1018016

File tree

5 files changed

+62
-52
lines changed

5 files changed

+62
-52
lines changed

include/linux/init_task.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
5050
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
5151
.rlim = INIT_RLIMITS, \
5252
.cputimer = { \
53-
.cputime = INIT_CPUTIME, \
54-
.running = 0, \
55-
.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
53+
.utime = ATOMIC64_INIT(0), \
54+
.stime = ATOMIC64_INIT(0), \
55+
.sum_exec_runtime = ATOMIC64_INIT(0), \
56+
.running = 0 \
5657
}, \
5758
.cred_guard_mutex = \
5859
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \

include/linux/sched.h

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -598,9 +598,10 @@ struct task_cputime {
598598
* used for thread group CPU timer calculations.
599599
*/
600600
struct thread_group_cputimer {
601-
struct task_cputime cputime;
601+
atomic64_t utime;
602+
atomic64_t stime;
603+
atomic64_t sum_exec_runtime;
602604
int running;
603-
raw_spinlock_t lock;
604605
};
605606

606607
#include <linux/rwsem.h>
@@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void)
29672968
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
29682969
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
29692970

2970-
static inline void thread_group_cputime_init(struct signal_struct *sig)
2971-
{
2972-
raw_spin_lock_init(&sig->cputimer.lock);
2973-
}
2974-
29752971
/*
29762972
* Reevaluate whether the task has signals pending delivery.
29772973
* Wake the task if so.

kernel/fork.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
10911091
{
10921092
unsigned long cpu_limit;
10931093

1094-
/* Thread group counters. */
1095-
thread_group_cputime_init(sig);
1096-
10971094
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
10981095
if (cpu_limit != RLIM_INFINITY) {
10991096
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);

kernel/sched/stats.h

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
174174
{
175175
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
176176

177-
if (!cputimer->running)
177+
/* Check if cputimer isn't running. This is accessed without locking. */
178+
if (!READ_ONCE(cputimer->running))
178179
return false;
179180

180181
/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
215216
if (!cputimer_running(tsk))
216217
return;
217218

218-
raw_spin_lock(&cputimer->lock);
219-
cputimer->cputime.utime += cputime;
220-
raw_spin_unlock(&cputimer->lock);
219+
atomic64_add(cputime, &cputimer->utime);
221220
}
222221

223222
/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
238237
if (!cputimer_running(tsk))
239238
return;
240239

241-
raw_spin_lock(&cputimer->lock);
242-
cputimer->cputime.stime += cputime;
243-
raw_spin_unlock(&cputimer->lock);
240+
atomic64_add(cputime, &cputimer->stime);
244241
}
245242

246243
/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
261258
if (!cputimer_running(tsk))
262259
return;
263260

264-
raw_spin_lock(&cputimer->lock);
265-
cputimer->cputime.sum_exec_runtime += ns;
266-
raw_spin_unlock(&cputimer->lock);
261+
atomic64_add(ns, &cputimer->sum_exec_runtime);
267262
}

kernel/time/posix-cpu-timers.c

Lines changed: 50 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
196196
return 0;
197197
}
198198

199-
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
199+
/*
200+
* Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
201+
* to avoid race conditions with concurrent updates to cputime.
202+
*/
203+
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
200204
{
201-
if (b->utime > a->utime)
202-
a->utime = b->utime;
205+
u64 curr_cputime;
206+
retry:
207+
curr_cputime = atomic64_read(cputime);
208+
if (sum_cputime > curr_cputime) {
209+
if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
210+
goto retry;
211+
}
212+
}
203213

204-
if (b->stime > a->stime)
205-
a->stime = b->stime;
214+
static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
215+
{
216+
__update_gt_cputime(&cputimer->utime, sum->utime);
217+
__update_gt_cputime(&cputimer->stime, sum->stime);
218+
__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
219+
}
206220

207-
if (b->sum_exec_runtime > a->sum_exec_runtime)
208-
a->sum_exec_runtime = b->sum_exec_runtime;
221+
/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
222+
static inline void sample_group_cputimer(struct task_cputime *times,
223+
struct thread_group_cputimer *cputimer)
224+
{
225+
times->utime = atomic64_read(&cputimer->utime);
226+
times->stime = atomic64_read(&cputimer->stime);
227+
times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
209228
}
210229

211230
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212231
{
213232
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214233
struct task_cputime sum;
215-
unsigned long flags;
216234

217-
if (!cputimer->running) {
235+
/* Check if cputimer isn't running. This is accessed without locking. */
236+
if (!READ_ONCE(cputimer->running)) {
218237
/*
219238
* The POSIX timer interface allows for absolute time expiry
220239
* values through the TIMER_ABSTIME flag, therefore we have
221-
* to synchronize the timer to the clock every time we start
222-
* it.
240+
* to synchronize the timer to the clock every time we start it.
223241
*/
224242
thread_group_cputime(tsk, &sum);
225-
raw_spin_lock_irqsave(&cputimer->lock, flags);
226-
cputimer->running = 1;
227-
update_gt_cputime(&cputimer->cputime, &sum);
228-
} else
229-
raw_spin_lock_irqsave(&cputimer->lock, flags);
230-
*times = cputimer->cputime;
231-
raw_spin_unlock_irqrestore(&cputimer->lock, flags);
243+
update_gt_cputime(cputimer, &sum);
244+
245+
/*
246+
* We're setting cputimer->running without a lock. Ensure
247+
* this only gets written to in one operation. We set
248+
* running after update_gt_cputime() as a small optimization,
249+
* but barriers are not required because update_gt_cputime()
250+
* can handle concurrent updates.
251+
*/
252+
WRITE_ONCE(cputimer->running, 1);
253+
}
254+
sample_group_cputimer(times, cputimer);
232255
}
233256

234257
/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
582605
if (!task_cputime_zero(&tsk->cputime_expires))
583606
return false;
584607

585-
if (tsk->signal->cputimer.running)
608+
/* Check if cputimer is running. This is accessed without locking. */
609+
if (READ_ONCE(tsk->signal->cputimer.running))
586610
return false;
587611

588612
return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
882906
}
883907
}
884908

885-
static void stop_process_timers(struct signal_struct *sig)
909+
static inline void stop_process_timers(struct signal_struct *sig)
886910
{
887911
struct thread_group_cputimer *cputimer = &sig->cputimer;
888-
unsigned long flags;
889912

890-
raw_spin_lock_irqsave(&cputimer->lock, flags);
891-
cputimer->running = 0;
892-
raw_spin_unlock_irqrestore(&cputimer->lock, flags);
913+
/* Turn off cputimer->running. This is done without locking. */
914+
WRITE_ONCE(cputimer->running, 0);
893915
}
894916

895917
static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
11111133
}
11121134

11131135
sig = tsk->signal;
1114-
if (sig->cputimer.running) {
1136+
/* Check if cputimer is running. This is accessed without locking. */
1137+
if (READ_ONCE(sig->cputimer.running)) {
11151138
struct task_cputime group_sample;
11161139

1117-
raw_spin_lock(&sig->cputimer.lock);
1118-
group_sample = sig->cputimer.cputime;
1119-
raw_spin_unlock(&sig->cputimer.lock);
1140+
sample_group_cputimer(&group_sample, &sig->cputimer);
11201141

11211142
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
11221143
return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
11571178
* If there are any active process wide timers (POSIX 1.b, itimers,
11581179
* RLIMIT_CPU) cputimer must be running.
11591180
*/
1160-
if (tsk->signal->cputimer.running)
1181+
if (READ_ONCE(tsk->signal->cputimer.running))
11611182
check_process_timers(tsk, &firing);
11621183

11631184
/*

0 commit comments

Comments
 (0)