Skip to content

Commit eb619fd

Browse files
committed
blk-mq: fix issue with shared tag queue re-running
This patch attempts to make the case of hctx re-running on driver tag
failure more robust. Without this patch, it's pretty easy to trigger a
stall condition with shared tags. An example is using null_blk like
this:

    modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

which sets up 4 devices, sharing the same tag set with a depth of 1.
Running a fio job ala:

    [global]
    bs=4k
    rw=randread
    norandommap
    direct=1
    ioengine=libaio
    iodepth=4

    [nullb0]
    filename=/dev/nullb0
    [nullb1]
    filename=/dev/nullb1
    [nullb2]
    filename=/dev/nullb2
    [nullb3]
    filename=/dev/nullb3

will inevitably end with one or more threads being stuck waiting for a
scheduler tag. That IO is then stuck forever, until someone else
triggers a run of the queue.

Ensure that we always re-run the hardware queue, if the driver tag we
were waiting for got freed before we added our leftover request entries
back on the dispatch list.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent e454d12 commit eb619fd

File tree

3 files changed

+50
-41
lines changed

3 files changed

+50
-41
lines changed

block/blk-mq-debugfs.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
179179
HCTX_STATE_NAME(STOPPED),
180180
HCTX_STATE_NAME(TAG_ACTIVE),
181181
HCTX_STATE_NAME(SCHED_RESTART),
182-
HCTX_STATE_NAME(TAG_WAITING),
183182
HCTX_STATE_NAME(START_ON_RUN),
184183
};
185184
#undef HCTX_STATE_NAME

block/blk-mq.c

Lines changed: 48 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
998998
return rq->tag != -1;
999999
}
10001000

1001-
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
1002-
void *key)
1001+
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1002+
int flags, void *key)
10031003
{
10041004
struct blk_mq_hw_ctx *hctx;
10051005

10061006
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10071007

1008-
list_del(&wait->entry);
1009-
clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
1008+
list_del_init(&wait->entry);
10101009
blk_mq_run_hw_queue(hctx, true);
10111010
return 1;
10121011
}
10131012

1014-
static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
1013+
static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
1014+
struct request *rq)
10151015
{
1016+
struct blk_mq_hw_ctx *this_hctx = *hctx;
1017+
wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
10161018
struct sbq_wait_state *ws;
10171019

1020+
if (!list_empty_careful(&wait->entry))
1021+
return false;
1022+
1023+
spin_lock(&this_hctx->lock);
1024+
if (!list_empty(&wait->entry)) {
1025+
spin_unlock(&this_hctx->lock);
1026+
return false;
1027+
}
1028+
1029+
ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
1030+
add_wait_queue(&ws->wait, wait);
1031+
10181032
/*
1019-
* The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
1020-
* The thread which wins the race to grab this bit adds the hardware
1021-
* queue to the wait queue.
1033+
* It's possible that a tag was freed in the window between the
1034+
* allocation failure and adding the hardware queue to the wait
1035+
* queue.
10221036
*/
1023-
if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
1024-
test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
1037+
if (!blk_mq_get_driver_tag(rq, hctx, false)) {
1038+
spin_unlock(&this_hctx->lock);
10251039
return false;
1026-
1027-
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
1028-
ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
1040+
}
10291041

10301042
/*
1031-
* As soon as this returns, it's no longer safe to fiddle with
1032-
* hctx->dispatch_wait, since a completion can wake up the wait queue
1033-
* and unlock the bit.
1043+
* We got a tag, remove ourselves from the wait queue to ensure
1044+
* someone else gets the wakeup.
10341045
*/
1035-
add_wait_queue(&ws->wait, &hctx->dispatch_wait);
1046+
spin_lock_irq(&ws->wait.lock);
1047+
list_del_init(&wait->entry);
1048+
spin_unlock_irq(&ws->wait.lock);
1049+
spin_unlock(&this_hctx->lock);
10361050
return true;
10371051
}
10381052

10391053
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1040-
bool got_budget)
1054+
bool got_budget)
10411055
{
10421056
struct blk_mq_hw_ctx *hctx;
10431057
struct request *rq, *nxt;
1058+
bool no_tag = false;
10441059
int errors, queued;
10451060

10461061
if (list_empty(list))
@@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
10601075
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
10611076
/*
10621077
* The initial allocation attempt failed, so we need to
1063-
* rerun the hardware queue when a tag is freed.
1078+
* rerun the hardware queue when a tag is freed. The
1079+
* waitqueue takes care of that. If the queue is run
1080+
* before we add this entry back on the dispatch list,
1081+
* we'll re-run it below.
10641082
*/
1065-
if (!blk_mq_dispatch_wait_add(hctx)) {
1066-
if (got_budget)
1067-
blk_mq_put_dispatch_budget(hctx);
1068-
break;
1069-
}
1070-
1071-
/*
1072-
* It's possible that a tag was freed in the window
1073-
* between the allocation failure and adding the
1074-
* hardware queue to the wait queue.
1075-
*/
1076-
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1083+
if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
10771084
if (got_budget)
10781085
blk_mq_put_dispatch_budget(hctx);
1086+
no_tag = true;
10791087
break;
10801088
}
10811089
}
@@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
11401148
* it is no longer set that means that it was cleared by another
11411149
* thread and hence that a queue rerun is needed.
11421150
*
1143-
* If TAG_WAITING is set that means that an I/O scheduler has
1144-
* been configured and another thread is waiting for a driver
1145-
* tag. To guarantee fairness, do not rerun this hardware queue
1146-
* but let the other thread grab the driver tag.
1151+
* If 'no_tag' is set, that means that we failed getting
1152+
* a driver tag with an I/O scheduler attached. If our dispatch
1153+
* waitqueue is no longer active, ensure that we run the queue
1154+
* AFTER adding our entries back to the list.
11471155
*
11481156
* If no I/O scheduler has been configured it is possible that
11491157
* the hardware queue got stopped and restarted before requests
@@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
11551163
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
11561164
* and dm-rq.
11571165
*/
1158-
if (!blk_mq_sched_needs_restart(hctx) &&
1159-
!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
1166+
if (!blk_mq_sched_needs_restart(hctx) ||
1167+
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
11601168
blk_mq_run_hw_queue(hctx, true);
11611169
}
11621170

@@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
20202028

20212029
hctx->nr_ctx = 0;
20222030

2031+
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2032+
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2033+
20232034
if (set->ops->init_hctx &&
20242035
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
20252036
goto free_bitmap;

include/linux/blk-mq.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ struct blk_mq_hw_ctx {
3535
struct blk_mq_ctx **ctxs;
3636
unsigned int nr_ctx;
3737

38-
wait_queue_entry_t dispatch_wait;
38+
wait_queue_entry_t dispatch_wait;
3939
atomic_t wait_index;
4040

4141
struct blk_mq_tags *tags;
@@ -181,8 +181,7 @@ enum {
181181
BLK_MQ_S_STOPPED = 0,
182182
BLK_MQ_S_TAG_ACTIVE = 1,
183183
BLK_MQ_S_SCHED_RESTART = 2,
184-
BLK_MQ_S_TAG_WAITING = 3,
185-
BLK_MQ_S_START_ON_RUN = 4,
184+
BLK_MQ_S_START_ON_RUN = 3,
186185

187186
BLK_MQ_MAX_DEPTH = 10240,
188187

0 commit comments

Comments
 (0)