
Commit de14829

Ming Lei authored and axboe committed
blk-mq: introduce .get_budget and .put_budget in blk_mq_ops
For SCSI devices there is often a per-request-queue depth, which needs to be respected before queuing one request.

Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch it to the low-level driver. One obvious issue with this approach is that I/O merging may not be successful: when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and the request then has to stay on the hctx->dispatch list. This means it never gets a chance to be merged with other I/O.

This patch introduces the .get_budget and .put_budget callbacks in blk_mq_ops, so that we can try to reserve budget before dequeuing a request. If the budget for queuing I/O can't be satisfied, we don't need to dequeue the request at all, and it can be left in the I/O scheduler queue for more merging opportunities.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 63ba8e3 commit de14829

File tree

5 files changed: 114 additions & 17 deletions
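To make the motivation concrete, here is a minimal, hypothetical driver-side sketch (not part of this commit) of the two new callbacks for a device with a per-request-queue depth. The foo_* names, the queuedata layout, and the atomic counter are illustrative assumptions:

#include <linux/atomic.h>
#include <linux/blk-mq.h>

/* Hypothetical per-queue driver data; not from this commit. */
struct foo_queue {
	atomic_t budget;	/* remaining per-request-queue depth */
};

static blk_status_t foo_get_budget(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->queue->queuedata;

	/*
	 * Refuse the budget when the depth is exhausted; blk-mq then
	 * leaves the request in the I/O scheduler queue, where it can
	 * still be merged, instead of parking it on hctx->dispatch.
	 */
	if (atomic_dec_if_positive(&fq->budget) < 0)
		return BLK_STS_RESOURCE;
	return BLK_STS_OK;
}

static void foo_put_budget(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->queue->queuedata;

	atomic_inc(&fq->budget);	/* hand the slot back */
}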

block/blk-mq-sched.c

Lines changed: 45 additions & 10 deletions
@@ -89,31 +89,57 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+/* return true if hctx need to run again */
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
 
 	do {
-		struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+		struct request *rq;
+		blk_status_t ret;
 
-		if (!rq)
+		if (e->type->ops.mq.has_work &&
+		    !e->type->ops.mq.has_work(hctx))
 			break;
+
+		ret = blk_mq_get_dispatch_budget(hctx);
+		if (ret == BLK_STS_RESOURCE)
+			return true;
+
+		rq = e->type->ops.mq.dispatch_request(hctx);
+		if (!rq) {
+			blk_mq_put_dispatch_budget(hctx);
+			break;
+		} else if (ret != BLK_STS_OK) {
+			blk_mq_end_request(rq, ret);
+			continue;
+		}
+
+		/*
+		 * Now this rq owns the budget which has to be released
+		 * if this rq won't be queued to driver via .queue_rq()
+		 * in blk_mq_dispatch_rq_list().
+		 */
 		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list));
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return false;
 }
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+/* return true if hw queue need to be run again */
+bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 	LIST_HEAD(rq_list);
+	bool run_queue = false;
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
-		return;
+		return false;
 
 	hctx->run++;
 
@@ -143,14 +169,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
-			blk_mq_do_dispatch_sched(hctx);
+		if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
+				has_sched_dispatch)
+			run_queue = blk_mq_do_dispatch_sched(hctx);
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		run_queue = blk_mq_do_dispatch_sched(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list);
+		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	if (run_queue && !blk_mq_sched_needs_restart(hctx) &&
+			!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
+		blk_mq_sched_mark_restart_hctx(hctx);
+		return true;
+	}
+
+	return false;
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,

block/blk-mq-sched.h

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
 				struct blk_mq_ctx *ctx,
 				struct list_head *list, bool run_queue_async);
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);

block/blk-mq.c

Lines changed: 38 additions & 5 deletions
@@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
 	return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+		bool got_budget)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
@@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 	if (list_empty(list))
 		return false;
 
+	WARN_ON(!list_is_singular(list) && got_budget);
+
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
@@ -1074,16 +1077,30 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed.
 			 */
-			if (!blk_mq_dispatch_wait_add(hctx))
+			if (!blk_mq_dispatch_wait_add(hctx)) {
+				if (got_budget)
+					blk_mq_put_dispatch_budget(hctx);
 				break;
+			}
 
 			/*
 			 * It's possible that a tag was freed in the window
 			 * between the allocation failure and adding the
 			 * hardware queue to the wait queue.
 			 */
-			if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+				if (got_budget)
+					blk_mq_put_dispatch_budget(hctx);
+				break;
+			}
+		}
+
+		if (!got_budget) {
+			ret = blk_mq_get_dispatch_budget(hctx);
+			if (ret == BLK_STS_RESOURCE)
 				break;
+			if (ret != BLK_STS_OK)
+				goto fail_rq;
 		}
 
 		list_del_init(&rq->queuelist);
@@ -1111,6 +1128,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 			break;
 		}
 
+		fail_rq:
 		if (unlikely(ret != BLK_STS_OK)) {
 			errors++;
 			blk_mq_end_request(rq, BLK_STS_IOERR);
@@ -1169,6 +1187,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	int srcu_idx;
+	bool run_queue;
 
 	/*
 	 * We should be running this queue from one of the CPUs that
@@ -1185,15 +1204,18 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		rcu_read_lock();
-		blk_mq_sched_dispatch_requests(hctx);
+		run_queue = blk_mq_sched_dispatch_requests(hctx);
 		rcu_read_unlock();
 	} else {
 		might_sleep();
 
 		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-		blk_mq_sched_dispatch_requests(hctx);
+		run_queue = blk_mq_sched_dispatch_requests(hctx);
 		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
 	}
+
+	if (run_queue)
+		blk_mq_run_hw_queue(hctx, true);
 }
 
 /*
@@ -1582,6 +1604,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (!blk_mq_get_driver_tag(rq, NULL, false))
 		goto insert;
 
+	ret = blk_mq_get_dispatch_budget(hctx);
+	if (ret == BLK_STS_RESOURCE) {
+		blk_mq_put_driver_tag(rq);
+		goto insert;
+	} else if (ret != BLK_STS_OK)
+		goto fail_rq;
+
 	new_cookie = request_to_qc_t(hctx, rq);
 
 	/*
@@ -1598,6 +1627,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		__blk_mq_requeue_request(rq);
 		goto insert;
 	default:
+fail_rq:
 		*cookie = BLK_QC_T_NONE;
 		blk_mq_end_request(rq, ret);
 		return;
@@ -2582,6 +2612,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->ops->queue_rq)
 		return -EINVAL;
 
+	if (!set->ops->get_budget ^ !set->ops->put_budget)
+		return -EINVAL;
+
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
 		pr_info("blk-mq: reduced tag depth to %u\n",
 			BLK_MQ_MAX_DEPTH);
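The check added to blk_mq_alloc_tag_set() rejects a tag set that supplies only one of the two callbacks: since !p evaluates to 0 or 1, the XOR is true exactly when one pointer is NULL and the other is not. A small userspace illustration of the idiom (budget_ops_invalid() is ours, for demonstration only):

#include <assert.h>
#include <stddef.h>

/* !a ^ !b is 1 exactly when one pointer is NULL and the other is not. */
static int budget_ops_invalid(void *get_budget, void *put_budget)
{
	return !get_budget ^ !put_budget;
}

int main(void)
{
	int dummy;

	assert(!budget_ops_invalid(NULL, NULL));	/* neither set: valid */
	assert(!budget_ops_invalid(&dummy, &dummy));	/* both set: valid */
	assert(budget_ops_invalid(&dummy, NULL));	/* only .get_budget: -EINVAL */
	assert(budget_ops_invalid(NULL, &dummy));	/* only .put_budget: -EINVAL */
	return 0;
}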

block/blk-mq.h

Lines changed: 19 additions & 1 deletion
@@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
@@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 			unsigned int inflight[2]);
 
+static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	if (q->mq_ops->put_budget)
+		q->mq_ops->put_budget(hctx);
+}
+
+static inline blk_status_t blk_mq_get_dispatch_budget(
+		struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	if (q->mq_ops->get_budget)
+		return q->mq_ops->get_budget(hctx);
+	return BLK_STS_OK;
+}
+
 #endif

include/linux/blk-mq.h

Lines changed: 11 additions & 0 deletions
@@ -90,6 +90,8 @@ struct blk_mq_queue_data {
 
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
+typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *);
+typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@ -111,6 +113,15 @@ struct blk_mq_ops {
 	 */
 	queue_rq_fn		*queue_rq;
 
+	/*
+	 * Reserve budget before queue request, once .queue_rq is
+	 * run, it is driver's responsibility to release the
+	 * reserved budget. Also we have to handle failure case
+	 * of .get_budget for avoiding I/O deadlock.
+	 */
+	get_budget_fn		*get_budget;
+	put_budget_fn		*put_budget;
+
 	/*
 	 * Called on request timeout
 	 */
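The comment above makes budget release the driver's responsibility once .queue_rq() has run. Continuing the hypothetical foo driver sketched earlier (foo_hw_submit() is an assumed hardware-submission helper, and the wiring below is illustrative, not from this commit):

/* Assumed helper that hands the request to the hardware. */
int foo_hw_submit(struct foo_queue *fq, struct request *rq);

static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	/* From this point the budget belongs to the driver. */
	if (foo_hw_submit(hctx->queue->queuedata, bd->rq) < 0) {
		foo_put_budget(hctx);	/* release on submission failure */
		return BLK_STS_RESOURCE;
	}
	return BLK_STS_OK;	/* released later, on completion */
}

static const struct blk_mq_ops foo_mq_ops = {
	.queue_rq	= foo_queue_rq,
	.get_budget	= foo_get_budget,	/* must be set together with */
	.put_budget	= foo_put_budget,	/* .put_budget, see the check above */
};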
