 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		if (!blkcg_debug_stats)
 			goto next;
 
+		if (atomic_read(&blkg->use_delay)) {
+			has_stats = true;
+			off += scnprintf(buf + off, size - off,
+					 " use_delay=%d delay_nsec=%llu",
+					 atomic_read(&blkg->use_delay),
+					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
+		}
+
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 			size_t written;
@@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
 	mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+	if (tsk->throttle_queue)
+		blk_put_queue(tsk->throttle_queue);
+	tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.dfl_cftypes = blkcg_files,
 	.legacy_cftypes = blkcg_legacy_files,
 	.legacy_name = "blkio",
+	.exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
 	/*
 	 * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay.  We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+	u64 old = atomic64_read(&blkg->delay_start);
+
+	/*
+	 * We only want to scale down every second.  The idea here is that we
+	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+	 * time window.  We only want to throttle tasks for recent delay that
+	 * has occurred, in 1 second time windows since that's the maximum
+	 * things can be throttled.  We save the current delay window in
+	 * blkg->last_delay so we know what amount is still left to be charged
+	 * to the blkg from this point onward.  blkg->last_use keeps track of
+	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
+	 * are ok with whatever is happening now, and we can take away more of
+	 * the accumulated delay as we've already throttled enough that
+	 * everybody is happy with their IO latencies.
+	 */
+	if (time_before64(old + NSEC_PER_SEC, now) &&
+	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+		u64 cur = atomic64_read(&blkg->delay_nsec);
+		u64 sub = min_t(u64, blkg->last_delay, now - old);
+		int cur_use = atomic_read(&blkg->use_delay);
+
+		/*
+		 * We've been unthrottled, subtract a larger chunk of our
+		 * accumulated delay.
+		 */
+		if (cur_use < blkg->last_use)
+			sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+		/*
+		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
+		 * should only ever be growing except here where we subtract out
+		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+		 * rather not end up with negative numbers.
+		 */
+		if (unlikely(cur < sub)) {
+			atomic64_set(&blkg->delay_nsec, 0);
+			blkg->last_delay = 0;
+		} else {
+			atomic64_sub(sub, &blkg->delay_nsec);
+			blkg->last_delay = cur - sub;
+		}
+		blkg->last_use = cur_use;
+	}
+}
+
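To make the decay arithmetic concrete, here is a small standalone model of the subtraction step above (illustrative only; the names mirror the kernel fields, but this is plain userspace C with assumed starting values, not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	/* Assumed state: 3s of accumulated delay, a 1s last_delay window,
	 * 2s elapsed since delay_start, and use_delay has dropped since
	 * last_use (the blkg was unthrottled). */
	uint64_t delay_nsec = 3 * NSEC_PER_SEC;
	uint64_t last_delay = 1 * NSEC_PER_SEC;
	uint64_t elapsed    = 2 * NSEC_PER_SEC;
	int unthrottled     = 1;

	/* sub = min(last_delay, now - old) */
	uint64_t sub = last_delay < elapsed ? last_delay : elapsed;

	/* an unthrottled blkg gives back at least half of last_delay */
	if (unthrottled && sub < (last_delay >> 1))
		sub = last_delay >> 1;

	if (delay_nsec < sub) {
		delay_nsec = 0;
		last_delay = 0;
	} else {
		delay_nsec -= sub;
		last_delay = delay_nsec;	/* i.e. cur - sub */
	}

	/* prints: delay_nsec=2000000000 last_delay=2000000000 */
	printf("delay_nsec=%llu last_delay=%llu\n",
	       (unsigned long long)delay_nsec,
	       (unsigned long long)last_delay);
	return 0;
}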
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay.  This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+	u64 now = ktime_to_ns(ktime_get());
+	u64 exp;
+	u64 delay_nsec = 0;
+	int tok;
+
+	while (blkg->parent) {
+		if (atomic_read(&blkg->use_delay)) {
+			blkcg_scale_delay(blkg, now);
+			delay_nsec = max_t(u64, delay_nsec,
+					   atomic64_read(&blkg->delay_nsec));
+		}
+		blkg = blkg->parent;
+	}
+
+	if (!delay_nsec)
+		return;
+
+	/*
+	 * Let's not sleep for all eternity if we've amassed a huge delay.
+	 * Swapping or metadata IO can accumulate 10's of seconds worth of
+	 * delay, and we want userspace to be able to do _something_ so cap the
+	 * delays at 0.25s.  If there's 10's of seconds worth of delay then the
+	 * tasks will be delayed for 0.25 seconds for every syscall.
+	 */
+	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+	/*
+	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+	 * that hasn't landed upstream yet.  Once that stuff is in place we need
+	 * to do a psi_memstall_enter/leave if memdelay is set.
+	 */
+
+	exp = ktime_add_ns(now, delay_nsec);
+	tok = io_schedule_prepare();
+	do {
+		__set_current_state(TASK_KILLABLE);
+		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+			break;
+	} while (!fatal_signal_pending(current));
+	io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume().  Obviously
+ * we can be marked with set_notify_resume() for reasons other than blkcg
+ * throttling, so we check to see if current->throttle_queue is set and if not
+ * this doesn't do anything.  This should only ever be called by the resume
+ * code; it's not meant to be called willy-nilly, as it will actually do the
+ * work to throttle the task if it is set up for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+	struct request_queue *q = current->throttle_queue;
+	struct cgroup_subsys_state *css;
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	bool use_memdelay = current->use_memdelay;
+
+	if (!q)
+		return;
+
+	current->throttle_queue = NULL;
+	current->use_memdelay = false;
+
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (css)
+		blkcg = css_to_blkcg(css);
+	else
+		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+	if (!blkcg)
+		goto out;
+	blkg = blkg_lookup(blkcg, q);
+	if (!blkg)
+		goto out;
+	blkg = blkg_try_get(blkg);
+	if (!blkg)
+		goto out;
+	rcu_read_unlock();
+	blk_put_queue(q);
+
+	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+	blkg_put(blkg);
+	return;
+out:
+	rcu_read_unlock();
+	blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
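For context, blkcg_maybe_throttle_current() only matters if something calls it on the return-to-userspace path; that is why <linux/tracehook.h> is pulled in above for set_notify_resume(). Presumably a companion hunk in this series (not shown here) wires the call into tracehook_notify_resume(); under that assumption, the hookup would look roughly like this sketch:

/* include/linux/tracehook.h -- sketch only, not part of this hunk */
static inline void tracehook_notify_resume(struct pt_regs *regs)
{
	/*
	 * The arch entry code has already cleared the TIF_NOTIFY_RESUME
	 * flag that set_notify_resume() raised before calling here.
	 */
	if (unlikely(current->task_works))
		task_work_run();

	/* Sleep off any blkcg delay charged to this task. */
	blkcg_maybe_throttle_current();
}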
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task.  We do not pass the blkg because there are places
+ * we call this that may not have that information; the swapping code, for
+ * instance, only has a request_queue at that point.  This sets notify_resume
+ * for the task to check and see if it requires throttling before returning to
+ * user space.
+ *
+ * We will only schedule once per syscall.  You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once.  If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+	if (unlikely(current->flags & PF_KTHREAD))
+		return;
+
+	if (!blk_get_queue(q))
+		return;
+
+	if (current->throttle_queue)
+		blk_put_queue(current->throttle_queue);
+	current->throttle_queue = q;
+	if (use_memdelay)
+		current->use_memdelay = use_memdelay;
+	set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @blkg - the blkg to charge
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation.  This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+	blkcg_scale_delay(blkg, now);
+	atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
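Taken together, the intended calling convention for an IO controller looks roughly like the sketch below. The policy, the function name and the overage computation are invented for illustration; the real consumers are the controllers built on top of this series, and they are also expected to raise blkg->use_delay, since blkcg_maybe_throttle_blkg() only charges blkgs whose use_delay is non-zero.

/* Hypothetical policy code -- not part of this patch. */
static void my_policy_missed_latency_target(struct blkcg_gq *blkg,
					    struct request_queue *q,
					    u64 now, u64 overage_nsec)
{
	/* Charge the overage so returning tasks will sleep it off. */
	blkcg_add_delay(blkg, now, overage_nsec);

	/*
	 * Mark the current task; on its next return to user space it
	 * will walk blkg's ancestors and sleep for the accumulated
	 * (capped) delay.
	 */
	blkcg_schedule_throttle(q, false);
}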
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");