Skip to content

Commit da55f2c

Browse files
osandovaxboe
authored andcommitted
blk-mq: use sbq wait queues instead of restart for driver tags
Commit 50e1dab ("blk-mq-sched: fix starvation for multiple hardware queues and shared tags") fixed one starvation issue for shared tags. However, we can still get into a situation where we fail to allocate a tag because all tags are allocated but we don't have any pending requests on any hardware queue. One solution for this would be to restart all queues that share a tag map, but that really sucks. Ideally, we could just block and wait for a tag, but that isn't always possible from blk_mq_dispatch_rq_list(). However, we can still use the struct sbitmap_queue wait queues with a custom callback instead of blocking. This has a few benefits: 1. It avoids iterating over all hardware queues when completing an I/O, which the current restart code has to do. 2. It benefits from the existing rolling wakeup code. 3. It avoids punting to another thread just to have it block. Signed-off-by: Omar Sandoval <osandov@fb.com> Signed-off-by: Jens Axboe <axboe@fb.com>
1 parent 2d19020 commit da55f2c

File tree

2 files changed

+57
-9
lines changed

2 files changed

+57
-9
lines changed

block/blk-mq.c

Lines changed: 55 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
904904
return first != NULL;
905905
}
906906

907+
static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
908+
void *key)
909+
{
910+
struct blk_mq_hw_ctx *hctx;
911+
912+
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
913+
914+
list_del(&wait->task_list);
915+
clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
916+
blk_mq_run_hw_queue(hctx, true);
917+
return 1;
918+
}
919+
920+
static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
921+
{
922+
struct sbq_wait_state *ws;
923+
924+
/*
925+
* The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
926+
* The thread which wins the race to grab this bit adds the hardware
927+
* queue to the wait queue.
928+
*/
929+
if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
930+
test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
931+
return false;
932+
933+
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
934+
ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
935+
936+
/*
937+
* As soon as this returns, it's no longer safe to fiddle with
938+
* hctx->dispatch_wait, since a completion can wake up the wait queue
939+
* and unlock the bit.
940+
*/
941+
add_wait_queue(&ws->wait, &hctx->dispatch_wait);
942+
return true;
943+
}
944+
907945
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
908946
{
909947
struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
931969
continue;
932970

933971
/*
934-
* We failed getting a driver tag. Mark the queue(s)
935-
* as needing a restart. Retry getting a tag again,
936-
* in case the needed IO completed right before we
937-
* marked the queue as needing a restart.
972+
* The initial allocation attempt failed, so we need to
973+
* rerun the hardware queue when a tag is freed.
938974
*/
939-
blk_mq_sched_mark_restart(hctx);
940-
if (!blk_mq_get_driver_tag(rq, &hctx, false))
975+
if (blk_mq_dispatch_wait_add(hctx)) {
976+
/*
977+
* It's possible that a tag was freed in the
978+
* window between the allocation failure and
979+
* adding the hardware queue to the wait queue.
980+
*/
981+
if (!blk_mq_get_driver_tag(rq, &hctx, false))
982+
break;
983+
} else {
941984
break;
985+
}
942986
}
987+
943988
list_del_init(&rq->queuelist);
944989

945990
bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
9951040
*
9961041
* blk_mq_run_hw_queue() already checks the STOPPED bit
9971042
*
998-
* If RESTART is set, then let completion restart the queue
999-
* instead of potentially looping here.
1043+
* If RESTART or TAG_WAITING is set, then let completion restart
1044+
* the queue instead of potentially looping here.
10001045
*/
1001-
if (!blk_mq_sched_needs_restart(hctx))
1046+
if (!blk_mq_sched_needs_restart(hctx) &&
1047+
!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
10021048
blk_mq_run_hw_queue(hctx, true);
10031049
}
10041050

include/linux/blk-mq.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
3333
struct blk_mq_ctx **ctxs;
3434
unsigned int nr_ctx;
3535

36+
wait_queue_t dispatch_wait;
3637
atomic_t wait_index;
3738

3839
struct blk_mq_tags *tags;
@@ -160,6 +161,7 @@ enum {
160161
BLK_MQ_S_STOPPED = 0,
161162
BLK_MQ_S_TAG_ACTIVE = 1,
162163
BLK_MQ_S_SCHED_RESTART = 2,
164+
BLK_MQ_S_TAG_WAITING = 3,
163165

164166
BLK_MQ_MAX_DEPTH = 10240,
165167

0 commit comments

Comments
 (0)