
Commit d09d8df

Josef Bacik authored and axboe committed
blkcg: add generic throttling mechanism
Since IO can be issued from literally anywhere, it's almost impossible to do throttling without having some sort of adverse effect somewhere else in the system because of locking or other dependencies. The best way to solve this is to do the throttling when we know we aren't holding any other kernel resources. Do this by tracking throttling on a per-blkg basis, and if we require throttling, flag the task so that it checks before it returns to user space and possibly sleeps there.

This addresses the case where a process is doing work that generates IO that can't be throttled, whether that is directly with a lot of REQ_META IO, or indirectly by allocating so much memory that it is swamping the disk with REQ_SWAP. We can't use task_work_add() as we don't want to induce a memory allocation in the IO path, so simply saving the request queue in the task and flagging it to do the notify_resume thing achieves the same result without the overhead of a memory allocation.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 0d3bd88 commit d09d8df
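
To make the mechanism described in the commit message concrete, here is a minimal, hypothetical sketch of how an IO controller could drive the new interface. Only blkcg_use_delay(), blkcg_add_delay() and blkcg_schedule_throttle() come from this commit; the helper name and latency target below are made up for illustration and are not part of the patch.

#include <linux/blk-cgroup.h>
#include <linux/ktime.h>

#define MY_LAT_TARGET_NSEC      (2 * NSEC_PER_MSEC)     /* assumed target, illustrative */

/* Illustrative only -- not part of this commit. */
static void my_iolat_check_latency(struct blkcg_gq *blkg,
                                   struct request_queue *q, u64 lat_nsec)
{
        u64 now = ktime_to_ns(ktime_get());

        if (lat_nsec <= MY_LAT_TARGET_NSEC)
                return;

        /* Mark the blkg as throttling; the first user bumps the cgroup congestion count. */
        blkcg_use_delay(blkg);

        /* Charge the overage to the blkg's accumulated delay. */
        blkcg_add_delay(blkg, now, lat_nsec - MY_LAT_TARGET_NSEC);

        /*
         * Flag the submitting task; it will check and, if needed, sleep in
         * blkcg_maybe_throttle_current() on its way back to user space.
         */
        blkcg_schedule_throttle(q, false);
}

A real controller would also drop the throttle again, pairing blkcg_use_delay() with blkcg_unuse_delay() or blkcg_clear_delay() once the cgroup is back under its target.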

File tree

5 files changed, +332 -0 lines changed


block/blk-cgroup.c

Lines changed: 220 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
                 if (!blkcg_debug_stats)
                         goto next;
 
+                if (atomic_read(&blkg->use_delay)) {
+                        has_stats = true;
+                        off += scnprintf(buf+off, size-off,
+                                         " use_delay=%d delay_nsec=%llu",
+                                         atomic_read(&blkg->use_delay),
+                                         (unsigned long long)atomic64_read(&blkg->delay_nsec));
+                }
+
                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
                         struct blkcg_policy *pol = blkcg_policy[i];
                         size_t written;
@@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
         mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+        if (tsk->throttle_queue)
+                blk_put_queue(tsk->throttle_queue);
+        tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
         .css_alloc = blkcg_css_alloc,
         .css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
         .dfl_cftypes = blkcg_files,
         .legacy_cftypes = blkcg_legacy_files,
         .legacy_name = "blkio",
+        .exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
         /*
          * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay. We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+        u64 old = atomic64_read(&blkg->delay_start);
+
+        /*
+         * We only want to scale down every second. The idea here is that we
+         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+         * time window. We only want to throttle tasks for recent delay that
+         * has occurred, in 1 second time windows since that's the maximum
+         * things can be throttled. We save the current delay window in
+         * blkg->last_delay so we know what amount is still left to be charged
+         * to the blkg from this point onward. blkg->last_use keeps track of
+         * the use_delay counter. The idea is if we're unthrottling the blkg we
+         * are ok with whatever is happening now, and we can take away more of
+         * the accumulated delay as we've already throttled enough that
+         * everybody is happy with their IO latencies.
+         */
+        if (time_before64(old + NSEC_PER_SEC, now) &&
+            atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+                u64 cur = atomic64_read(&blkg->delay_nsec);
+                u64 sub = min_t(u64, blkg->last_delay, now - old);
+                int cur_use = atomic_read(&blkg->use_delay);
+
+                /*
+                 * We've been unthrottled, subtract a larger chunk of our
+                 * accumulated delay.
+                 */
+                if (cur_use < blkg->last_use)
+                        sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+                /*
+                 * This shouldn't happen, but handle it anyway. Our delay_nsec
+                 * should only ever be growing except here where we subtract out
+                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+                 * rather not end up with negative numbers.
+                 */
+                if (unlikely(cur < sub)) {
+                        atomic64_set(&blkg->delay_nsec, 0);
+                        blkg->last_delay = 0;
+                } else {
+                        atomic64_sub(sub, &blkg->delay_nsec);
+                        blkg->last_delay = cur - sub;
+                }
+                blkg->last_use = cur_use;
+        }
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay. This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+        u64 now = ktime_to_ns(ktime_get());
+        u64 exp;
+        u64 delay_nsec = 0;
+        int tok;
+
+        while (blkg->parent) {
+                if (atomic_read(&blkg->use_delay)) {
+                        blkcg_scale_delay(blkg, now);
+                        delay_nsec = max_t(u64, delay_nsec,
+                                           atomic64_read(&blkg->delay_nsec));
+                }
+                blkg = blkg->parent;
+        }
+
+        if (!delay_nsec)
+                return;
+
+        /*
+         * Let's not sleep for all eternity if we've amassed a huge delay.
+         * Swapping or metadata IO can accumulate 10's of seconds worth of
+         * delay, and we want userspace to be able to do _something_ so cap the
+         * delays at 1 second. If there's 10's of seconds worth of delay then
+         * the tasks will be delayed for 1 second for every syscall.
+         */
+        delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+        /*
+         * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+         * that hasn't landed upstream yet. Once that stuff is in place we need
+         * to do a psi_memstall_enter/leave if memdelay is set.
+         */
+
+        exp = ktime_add_ns(now, delay_nsec);
+        tok = io_schedule_prepare();
+        do {
+                __set_current_state(TASK_KILLABLE);
+                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+                        break;
+        } while (!fatal_signal_pending(current));
+        io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume(). Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything. This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+        struct request_queue *q = current->throttle_queue;
+        struct cgroup_subsys_state *css;
+        struct blkcg *blkcg;
+        struct blkcg_gq *blkg;
+        bool use_memdelay = current->use_memdelay;
+
+        if (!q)
+                return;
+
+        current->throttle_queue = NULL;
+        current->use_memdelay = false;
+
+        rcu_read_lock();
+        css = kthread_blkcg();
+        if (css)
+                blkcg = css_to_blkcg(css);
+        else
+                blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+        if (!blkcg)
+                goto out;
+        blkg = blkg_lookup(blkcg, q);
+        if (!blkg)
+                goto out;
+        blkg = blkg_try_get(blkg);
+        if (!blkg)
+                goto out;
+        rcu_read_unlock();
+        blk_put_queue(q);
+
+        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+        blkg_put(blkg);
+        return;
+out:
+        rcu_read_unlock();
+        blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task. We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point. This set's the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall. You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once. If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+        if (unlikely(current->flags & PF_KTHREAD))
+                return;
+
+        if (!blk_get_queue(q))
+                return;
+
+        if (current->throttle_queue)
+                blk_put_queue(current->throttle_queue);
+        current->throttle_queue = q;
+        if (use_memdelay)
+                current->use_memdelay = use_memdelay;
+        set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+        blkcg_scale_delay(blkg, now);
+        atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
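
With the blkcg_debug_stats module parameter enabled, the two new fields above are appended to each per-device line of the cgroup's io.stat file. An illustrative line might look like the following; the surrounding counters and all values are made up here, the commit itself only adds the use_delay and delay_nsec fields:

8:0 rbytes=4096 wbytes=0 rios=1 wios=0 use_delay=1 delay_nsec=2500000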

include/linux/blk-cgroup.h

Lines changed: 99 additions & 0 deletions
@@ -136,6 +136,12 @@ struct blkcg_gq {
         struct blkg_policy_data *pd[BLKCG_MAX_POLS];
 
         struct rcu_head rcu_head;
+
+        atomic_t use_delay;
+        atomic64_t delay_nsec;
+        atomic64_t delay_start;
+        u64 last_delay;
+        int last_use;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -241,6 +247,26 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
         return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
+static inline bool blk_cgroup_congested(void)
+{
+        struct cgroup_subsys_state *css;
+        bool ret = false;
+
+        rcu_read_lock();
+        css = kthread_blkcg();
+        if (!css)
+                css = task_css(current, io_cgrp_id);
+        while (css) {
+                if (atomic_read(&css->cgroup->congestion_count)) {
+                        ret = true;
+                        break;
+                }
+                css = css->parent;
+        }
+        rcu_read_unlock();
+        return ret;
+}
+
 /**
  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
  * @return: true if this bio needs to be submitted with the root blkg context.
@@ -374,6 +400,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
         atomic_inc(&blkg->refcnt);
 }
 
+/**
+ * blkg_try_get - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg. We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+{
+        if (atomic_inc_not_zero(&blkg->refcnt))
+                return blkg;
+        return NULL;
+}
+
+
 void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
@@ -734,6 +775,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
         return !throtl;
 }
 
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+        if (atomic_add_return(1, &blkg->use_delay) == 1)
+                atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+        int old = atomic_read(&blkg->use_delay);
+
+        if (old == 0)
+                return 0;
+
+        /*
+         * We do this song and dance because we can race with somebody else
+         * adding or removing delay. If we just did an atomic_dec we'd end up
+         * negative and we'd already be in trouble. We need to subtract 1 and
+         * then check to see if we were the last delay so we can drop the
+         * congestion count on the cgroup.
+         */
+        while (old) {
+                int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+                if (cur == old)
+                        break;
+                old = cur;
+        }
+
+        if (old == 0)
+                return 0;
+        if (old == 1)
+                atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+        return 1;
+}
+
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+        int old = atomic_read(&blkg->use_delay);
+        if (!old)
+                return;
+        /* We only want 1 person clearing the congestion count for this blkg. */
+        while (old) {
+                int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
+                if (cur == old) {
+                        atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+                        break;
+                }
+                old = cur;
+        }
+}
+
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
 #else /* CONFIG_BLK_CGROUP */
 
 struct blkcg {
@@ -753,8 +847,13 @@ struct blkcg_policy {
 
 #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline void blkcg_maybe_throttle_current(void) { }
+static inline bool blk_cgroup_congested(void) { return false; }
+
 #ifdef CONFIG_BLOCK
 
+static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
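
The blk_cgroup_congested() helper added above gives submitters a cheap way to check whether any cgroup in the current task's blkcg hierarchy currently has use_delay set. A minimal, hypothetical caller is sketched below; the function name is made up, and the swap path mentioned in the commit message is the kind of consumer this is aimed at, not something added by this patch:

#include <linux/blk-cgroup.h>

/*
 * Illustrative only -- not part of this commit. A submitter of optional,
 * speculative IO (readahead-style work) could bail out early while the
 * current task's blkcg hierarchy is marked congested.
 */
static bool my_should_skip_speculative_io(void)
{
        /* True if any ancestor cgroup currently has use_delay set. */
        return blk_cgroup_congested();
}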

include/linux/cgroup-defs.h

Lines changed: 3 additions & 0 deletions
@@ -438,6 +438,9 @@ struct cgroup {
         /* used to store eBPF programs */
         struct cgroup_bpf bpf;
 
+        /* If there is block congestion on this cgroup. */
+        atomic_t congestion_count;
+
         /* ids of the ancestors at each level including self */
         int ancestor_ids[];
 };
