Skip to content

Commit 14d3a3b

Browse files
author
Christoph Hellwig
committed
IB: add a proper completion queue abstraction
This adds an abstraction that allows ULPs to simply pass a completion object and completion callback with each submitted WR and let the RDMA core handle the nitty gritty details of how to handle completion interrupts and poll the CQ. In detail there is a new ib_cqe structure which just contains the completion callback, and which can be used to get at the containing object using container_of. It is pointed to by the WR and WC as an alternative to the wr_id field, similar to how many ULPs already use the field to store a pointer using casts. A driver using the new completion callbacks allocates its CQs using the new ib_alloc_cq API, which in addition to the number of CQEs and the completion vector also takes a mode on how we poll for CQEs. Three modes are available: direct for drivers that never take CQ interrupts and just poll for them, softirq to poll from softirq context using the to-be-renamed blk-iopoll infrastructure which takes care of rearming and budgeting, or a workqueue for consumers who want to be called from user context. Thanks a lot to Sagi Grimberg who helped review the API, wrote the current version of the workqueue code because my two previous attempts sucked too much and converted the iSER initiator to the new API. Signed-off-by: Christoph Hellwig <hch@lst.de>
1 parent 839a301 commit 14d3a3b

File tree

7 files changed

+264
-9
lines changed

7 files changed

+264
-9
lines changed

drivers/infiniband/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
55
depends on NET
66
depends on INET
77
depends on m || IPV6 != m
8+
select IRQ_POLL
89
---help---
910
Core support for InfiniBand (IB). Make sure to also select
1011
any protocols you wish to use as well as drivers for your

drivers/infiniband/core/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
88
obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
99
$(user_access-y)
1010

11-
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
11+
ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \
1212
device.o fmr_pool.o cache.o netlink.o \
1313
roce_gid_mgmt.o
1414
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o

drivers/infiniband/core/cq.c

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/*
2+
* Copyright (c) 2015 HGST, a Western Digital Company.
3+
*
4+
* This program is free software; you can redistribute it and/or modify it
5+
* under the terms and conditions of the GNU General Public License,
6+
* version 2, as published by the Free Software Foundation.
7+
*
8+
* This program is distributed in the hope it will be useful, but WITHOUT
9+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11+
* more details.
12+
*/
13+
#include <linux/module.h>
14+
#include <linux/err.h>
15+
#include <linux/slab.h>
16+
#include <rdma/ib_verbs.h>
17+
18+
/* # of WCs to poll for with a single call to ib_poll_cq */
19+
#define IB_POLL_BATCH 16
20+
21+
/* # of WCs to iterate over before yielding */
22+
#define IB_POLL_BUDGET_IRQ 256
23+
#define IB_POLL_BUDGET_WORKQUEUE 65536
24+
25+
#define IB_POLL_FLAGS \
26+
(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
27+
28+
static int __ib_process_cq(struct ib_cq *cq, int budget)
29+
{
30+
int i, n, completed = 0;
31+
32+
while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
33+
for (i = 0; i < n; i++) {
34+
struct ib_wc *wc = &cq->wc[i];
35+
36+
if (wc->wr_cqe)
37+
wc->wr_cqe->done(cq, wc);
38+
else
39+
WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
40+
}
41+
42+
completed += n;
43+
44+
if (n != IB_POLL_BATCH ||
45+
(budget != -1 && completed >= budget))
46+
break;
47+
}
48+
49+
return completed;
50+
}
51+
52+
/**
53+
* ib_process_direct_cq - process a CQ in caller context
54+
* @cq: CQ to process
55+
* @budget: number of CQEs to poll for
56+
*
57+
* This function is used to process all outstanding CQ entries on a
58+
* %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different
59+
* context and does not ask for completion interrupts from the HCA.
60+
*
61+
* Note: for compatibility reasons -1 can be passed in %budget for unlimited
62+
* polling. Do not use this feature in new code, it will be removed soon.
63+
*/
64+
int ib_process_cq_direct(struct ib_cq *cq, int budget)
65+
{
66+
WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
67+
68+
return __ib_process_cq(cq, budget);
69+
}
70+
EXPORT_SYMBOL(ib_process_cq_direct);
71+
72+
/*
 * Completion handler installed for IB_POLL_DIRECT CQs.  ib_alloc_cq() never
 * requests a notification for such a CQ, so getting here is unexpected and
 * is only reported (once).
 */
static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
	WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}
76+
77+
static int ib_poll_handler(struct irq_poll *iop, int budget)
78+
{
79+
struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
80+
int completed;
81+
82+
completed = __ib_process_cq(cq, budget);
83+
if (completed < budget) {
84+
irq_poll_complete(&cq->iop);
85+
if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
86+
irq_poll_sched(&cq->iop);
87+
}
88+
89+
return completed;
90+
}
91+
92+
static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
93+
{
94+
irq_poll_sched(&cq->iop);
95+
}
96+
97+
static void ib_cq_poll_work(struct work_struct *work)
98+
{
99+
struct ib_cq *cq = container_of(work, struct ib_cq, work);
100+
int completed;
101+
102+
completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
103+
if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
104+
ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
105+
queue_work(ib_comp_wq, &cq->work);
106+
}
107+
108+
static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
109+
{
110+
queue_work(ib_comp_wq, &cq->work);
111+
}
112+
113+
/**
114+
* ib_alloc_cq - allocate a completion queue
115+
* @dev: device to allocate the CQ for
116+
* @private: driver private data, accessible from cq->cq_context
117+
* @nr_cqe: number of CQEs to allocate
118+
* @comp_vector: HCA completion vectors for this CQ
119+
* @poll_ctx: context to poll the CQ from.
120+
*
121+
* This is the proper interface to allocate a CQ for in-kernel users. A
122+
* CQ allocated with this interface will automatically be polled from the
123+
* specified context. The ULP needs must use wr->wr_cqe instead of wr->wr_id
124+
* to use this CQ abstraction.
125+
*/
126+
struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
127+
int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
128+
{
129+
struct ib_cq_init_attr cq_attr = {
130+
.cqe = nr_cqe,
131+
.comp_vector = comp_vector,
132+
};
133+
struct ib_cq *cq;
134+
int ret = -ENOMEM;
135+
136+
cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
137+
if (IS_ERR(cq))
138+
return cq;
139+
140+
cq->device = dev;
141+
cq->uobject = NULL;
142+
cq->event_handler = NULL;
143+
cq->cq_context = private;
144+
cq->poll_ctx = poll_ctx;
145+
atomic_set(&cq->usecnt, 0);
146+
147+
cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
148+
if (!cq->wc)
149+
goto out_destroy_cq;
150+
151+
switch (cq->poll_ctx) {
152+
case IB_POLL_DIRECT:
153+
cq->comp_handler = ib_cq_completion_direct;
154+
break;
155+
case IB_POLL_SOFTIRQ:
156+
cq->comp_handler = ib_cq_completion_softirq;
157+
158+
irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
159+
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
160+
break;
161+
case IB_POLL_WORKQUEUE:
162+
cq->comp_handler = ib_cq_completion_workqueue;
163+
INIT_WORK(&cq->work, ib_cq_poll_work);
164+
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
165+
break;
166+
default:
167+
ret = -EINVAL;
168+
goto out_free_wc;
169+
}
170+
171+
return cq;
172+
173+
out_free_wc:
174+
kfree(cq->wc);
175+
out_destroy_cq:
176+
cq->device->destroy_cq(cq);
177+
return ERR_PTR(ret);
178+
}
179+
EXPORT_SYMBOL(ib_alloc_cq);
180+
181+
/**
182+
* ib_free_cq - free a completion queue
183+
* @cq: completion queue to free.
184+
*/
185+
void ib_free_cq(struct ib_cq *cq)
186+
{
187+
int ret;
188+
189+
if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
190+
return;
191+
192+
switch (cq->poll_ctx) {
193+
case IB_POLL_DIRECT:
194+
break;
195+
case IB_POLL_SOFTIRQ:
196+
irq_poll_disable(&cq->iop);
197+
break;
198+
case IB_POLL_WORKQUEUE:
199+
flush_work(&cq->work);
200+
break;
201+
default:
202+
WARN_ON_ONCE(1);
203+
}
204+
205+
kfree(cq->wc);
206+
ret = cq->device->destroy_cq(cq);
207+
WARN_ON_ONCE(ret);
208+
}
209+
EXPORT_SYMBOL(ib_free_cq);

drivers/infiniband/core/device.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ struct ib_client_data {
5858
bool going_down;
5959
};
6060

61+
struct workqueue_struct *ib_comp_wq;
6162
struct workqueue_struct *ib_wq;
6263
EXPORT_SYMBOL_GPL(ib_wq);
6364

@@ -954,10 +955,18 @@ static int __init ib_core_init(void)
954955
if (!ib_wq)
955956
return -ENOMEM;
956957

958+
ib_comp_wq = alloc_workqueue("ib-comp-wq",
959+
WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
960+
WQ_UNBOUND_MAX_ACTIVE);
961+
if (!ib_comp_wq) {
962+
ret = -ENOMEM;
963+
goto err;
964+
}
965+
957966
ret = class_register(&ib_class);
958967
if (ret) {
959968
printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
960-
goto err;
969+
goto err_comp;
961970
}
962971

963972
ret = ibnl_init();
@@ -972,7 +981,8 @@ static int __init ib_core_init(void)
972981

973982
err_sysfs:
974983
class_unregister(&ib_class);
975-
984+
err_comp:
985+
destroy_workqueue(ib_comp_wq);
976986
err:
977987
destroy_workqueue(ib_wq);
978988
return ret;
@@ -983,6 +993,7 @@ static void __exit ib_core_cleanup(void)
983993
ib_cache_cleanup();
984994
ibnl_cleanup();
985995
class_unregister(&ib_class);
996+
destroy_workqueue(ib_comp_wq);
986997
/* Make sure that any pending umem accounting work is done. */
987998
destroy_workqueue(ib_wq);
988999
}

drivers/infiniband/ulp/ipoib/ipoib_cm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
7070
#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
7171

7272
static struct ib_send_wr ipoib_cm_rx_drain_wr = {
73-
.wr_id = IPOIB_CM_RX_DRAIN_WRID,
7473
.opcode = IB_WR_SEND,
7574
};
7675

@@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
223222
* error" WC will be immediately generated for each WR we post.
224223
*/
225224
p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
225+
ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
226226
if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
227227
ipoib_warn(priv, "failed to post drain wr\n");
228228

drivers/infiniband/ulp/srp/ib_srp.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -457,10 +457,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
457457
static void srp_destroy_qp(struct srp_rdma_ch *ch)
458458
{
459459
static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
460-
static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
460+
static struct ib_recv_wr wr = { 0 };
461461
struct ib_recv_wr *bad_wr;
462462
int ret;
463463

464+
wr.wr_id = SRP_LAST_WR_ID;
464465
/* Destroying a QP and reusing ch->done is only safe if not connected */
465466
WARN_ON_ONCE(ch->connected);
466467

@@ -1042,13 +1043,14 @@ static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
10421043
struct ib_send_wr *bad_wr;
10431044
struct ib_send_wr wr = {
10441045
.opcode = IB_WR_LOCAL_INV,
1045-
.wr_id = LOCAL_INV_WR_ID_MASK,
10461046
.next = NULL,
10471047
.num_sge = 0,
10481048
.send_flags = 0,
10491049
.ex.invalidate_rkey = rkey,
10501050
};
10511051

1052+
wr.wr_id = LOCAL_INV_WR_ID_MASK;
1053+
10521054
return ib_post_send(ch->qp, &wr, &bad_wr);
10531055
}
10541056

0 commit comments

Comments
 (0)