Commit 9d7fb04

Peter Zijlstra authored and Ingo Molnar committed
sched/cputime: Guarantee stime + utime == rtime
While the current code guarantees monotonicity for stime and utime independently
of one another, it does not guarantee that the sum of both is equal to the total
time we started out with. This confuses things (and people) who look at this sum,
like top, and will report >100% usage followed by a matching period of 0%.

Rework the code to provide both individual monotonicity and a coherent sum.

Suggested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Reported-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Tested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jason.low2@hp.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 781b020 commit 9d7fb04

4 files changed, 97 insertions(+), 61 deletions(-)
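To make the guarantee concrete before the per-file diffs, here is a minimal userspace model of the adjustment logic this patch introduces in cputime_adjust(). It is only a sketch: it uses plain uint64_t instead of cputime_t, drops the prev->lock serialization and scale_stime()'s overflow handling, and the names (struct prev_times, adjust_cputime) are invented for the example.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct prev_cputime (no lock, plain integers). */
struct prev_times {
	uint64_t utime;
	uint64_t stime;
};

/*
 * Model of the reworked cputime_adjust(): scale the tick-based stime/utime
 * samples onto rtime, then clamp so neither value goes backwards while
 * keeping stime + utime == rtime.
 */
static void adjust_cputime(struct prev_times *prev,
			   uint64_t utime, uint64_t stime, uint64_t rtime)
{
	if (prev->stime + prev->utime >= rtime)
		return;				/* nothing new to hand out */

	if (utime == 0) {
		stime = rtime;
	} else if (stime == 0) {
		utime = rtime;
	} else {
		/* naive version of scale_stime(): stime * rtime / total */
		stime = stime * rtime / (stime + utime);
		if (stime < prev->stime)	/* keep stime monotonic */
			stime = prev->stime;
		utime = rtime - stime;
		if (utime < prev->utime) {	/* keep utime monotonic */
			utime = prev->utime;
			stime = rtime - utime;
		}
	}
	prev->stime = stime;
	prev->utime = utime;
}

int main(void)
{
	struct prev_times prev = { 0, 0 };

	/* First sample: 40 user + 50 system ticks against rtime = 100. */
	adjust_cputime(&prev, 40, 50, 100);
	assert(prev.utime == 45 && prev.stime == 55);	/* 50*100/90 = 55 */

	/*
	 * Second sample: the scaled stime would now be 50*110/108 = 50, which
	 * is below the 55 already reported, so stime is clamped and utime
	 * absorbs the remainder; the sum still equals the new rtime.
	 */
	adjust_cputime(&prev, 58, 50, 110);
	assert(prev.utime == 55 && prev.stime == 55);
	assert(prev.utime + prev.stime == 110);

	printf("utime=%" PRIu64 " stime=%" PRIu64 "\n", prev.utime, prev.stime);
	return 0;
}

Compiled with any C99 compiler the asserts pass; the second call shows the new behaviour: the reported pair stays monotonic while still summing exactly to rtime.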

include/linux/init_task.h

Lines changed: 10 additions & 0 deletions
@@ -32,6 +32,14 @@ extern struct fs_struct init_fs;
 #define INIT_CPUSET_SEQ(tsk)
 #endif
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#define INIT_PREV_CPUTIME(x)	.prev_cputime = {			\
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock),		\
+},
+#else
+#define INIT_PREV_CPUTIME(x)
+#endif
+
 #define INIT_SIGNALS(sig) {						\
 	.nr_threads	= 1,						\
 	.thread_head	= LIST_HEAD_INIT(init_task.thread_node),	\
@@ -46,6 +54,7 @@ extern struct fs_struct init_fs;
 		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
 		.running	= 0,					\
 	},								\
+	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
 		__MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
 }
@@ -246,6 +255,7 @@ extern struct task_group root_task_group;
 	INIT_TASK_RCU_TASKS(tsk)					\
 	INIT_CPUSET_SEQ(tsk)						\
 	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
 	INIT_VTIME(tsk)							\
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
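For reference, with CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unset, the INIT_PREV_CPUTIME(sig) line added to INIT_SIGNALS() above expands to a designated initializer for the new field (whitespace approximate), mirroring at compile time what prev_cputime_init() does at run time for dynamically created tasks:

	.prev_cputime = {
		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.prev_cputime.lock),
	},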

include/linux/sched.h

Lines changed: 23 additions & 17 deletions
@@ -530,39 +530,49 @@ struct cpu_itimer {
 };
 
 /**
- * struct cputime - snaphsot of system and user cputime
+ * struct prev_cputime - snaphsot of system and user cputime
  * @utime: time spent in user mode
  * @stime: time spent in system mode
+ * @lock: protects the above two fields
  *
- * Gathers a generic snapshot of user and system time.
+ * Stores previous user/system time values such that we can guarantee
+ * monotonicity.
  */
-struct cputime {
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	cputime_t utime;
 	cputime_t stime;
+	raw_spinlock_t lock;
+#endif
 };
 
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	prev->utime = prev->stime = 0;
+	raw_spin_lock_init(&prev->lock);
+#endif
+}
+
 /**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
  *
- * This is an extension of struct cputime that includes the total runtime
- * spent by the task from the scheduler point of view.
- *
- * As a result, this structure groups together three kinds of CPU time
- * that are tracked for threads and thread groups. Most things considering
- * CPU time want to group these counts together and treat all three
- * of them in parallel.
+ * This structure groups together three kinds of CPU time that are tracked for
+ * threads and thread groups. Most things considering CPU time want to group
+ * these counts together and treat all three of them in parallel.
  */
 struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
 };
+
 /* Alternate field names when used to cache expirations. */
-#define prof_exp	stime
 #define virt_exp	utime
+#define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
 #define INIT_CPUTIME	\
@@ -715,9 +725,7 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	struct cputime prev_cputime;
-#endif
+	struct prev_cputime prev_cputime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1481,9 +1489,7 @@ struct task_struct {
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	struct cputime prev_cputime;
-#endif
+	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqlock_t vtime_seqlock;
 	unsigned long long vtime_snap;

kernel/fork.c

Lines changed: 4 additions & 3 deletions
@@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
+
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 	seqlock_init(&sig->stats_lock);
+	prev_cputime_init(&sig->prev_cputime);
 
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
@@ -1335,9 +1337,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->utime = p->stime = p->gtime = 0;
 	p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+	prev_cputime_init(&p->prev_cputime);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqlock_init(&p->vtime_seqlock);
 	p->vtime_snap = 0;

kernel/sched/cputime.c

Lines changed: 60 additions & 41 deletions
@@ -555,48 +555,43 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
  *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-	cputime_t old;
-
-	while (new > (old = READ_ONCE(*counter)))
-		cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime;
+	unsigned long flags;
 
-	/*
-	 * Tick based cputime accounting depend on random scheduling
-	 * timeslices of a task to be interrupted or not by the timer.
-	 * Depending on these circumstances, the number of these interrupts
-	 * may be over or under-optimistic, matching the real user and system
-	 * cputime with a variable precision.
-	 *
-	 * Fix this by scaling these tick based values against the total
-	 * runtime accounted by the CFS scheduler.
-	 */
+	/* Serialize concurrent callers such that we can honour our guarantees */
+	raw_spin_lock_irqsave(&prev->lock, flags);
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 * This is possible under two circumstances:
+	 *   - rtime isn't monotonic after all (a bug);
+	 *   - we got reordered by the lock.
+	 *
+	 * In both cases this acts as a filter such that the rest of the code
+	 * can assume it is monotonic regardless of anything else.
 	 */
 	if (prev->stime + prev->utime >= rtime)
 		goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
 
 	if (utime == 0) {
 		stime = rtime;
-	} else if (stime == 0) {
-		utime = rtime;
-	} else {
-		cputime_t total = stime + utime;
+		goto update;
+	}
 
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
-		utime = rtime - stime;
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
-	cputime_advance(&prev->stime, stime);
-	cputime_advance(&prev->utime, utime);
+	stime = scale_stime((__force u64)stime, (__force u64)rtime,
+			    (__force u64)(stime + utime));
+
+	/*
+	 * Make sure stime doesn't go backwards; this preserves monotonicity
+	 * for utime because rtime is monotonic.
+	 *
+	 *  utime_i+1 = rtime_i+1 - stime_i
+	 *            = rtime_i+1 - (rtime_i - utime_i)
+	 *            = (rtime_i+1 - rtime_i) + utime_i
+	 *            >= utime_i
+	 */
+	if (stime < prev->stime)
+		stime = prev->stime;
+	utime = rtime - stime;
+
+	/*
+	 * Make sure utime doesn't go backwards; this still preserves
+	 * monotonicity for stime, analogous argument to above.
+	 */
+	if (utime < prev->utime) {
+		utime = prev->utime;
+		stime = rtime - utime;
+	}
 
+update:
+	prev->stime = stime;
+	prev->utime = utime;
 out:
 	*ut = prev->utime;
 	*st = prev->stime;
+	raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
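As a concrete (hypothetical) illustration of the clamping above: suppose prev holds stime = 60 and utime = 40 from a call where rtime was 100, and the next call sees tick samples stime = 55 and utime = 55 with rtime = 110. scale_stime(55, 110, 110) yields 55, which is below prev->stime, so stime is clamped to 60 and utime becomes 110 - 60 = 50. Both values are monotonic with respect to the previous call and their sum is exactly the new rtime, which is the pair of guarantees stated in the comment block above cputime_adjust().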
