
Commit 0ad8b48

mstsirkin authored and davem330 committed
vhost: fix ref cnt checking deadlock
vhost checked the counter within the refcnt before decrementing. It really wanted to know that it is the one that has the last reference, as a way to batch freeing resources a bit more efficiently.

Note: we only let refcount go to 0 on device release.

This works well but we now access the ref counter twice so there's a race: all users might see a high count and decide to defer freeing resources. In the end no one initiates freeing resources until the last reference is gone (which is on VM shutdown, so might happen after a looooong time).

Let's do what we probably should have done straight away: switch from kref to plain atomic, documenting the semantics, return the refcount value atomically after decrement, then use that to avoid the deadlock.

Reported-by: Qin Chuanyu <qinchuanyu@huawei.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
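As a rough illustration of the pattern the patch adopts (use the value returned by the atomic decrement itself, rather than reading the counter first and decrementing afterwards), here is a minimal user-space sketch. It uses C11 stdatomic instead of the kernel's atomic_t API, and the names ubuf_ref and ubuf_put are made up for the example; only the decrement-and-test logic mirrors the change below.

        /* Sketch only: user-space analogue of the refcounting pattern. */
        #include <stdatomic.h>
        #include <stdio.h>

        struct ubuf_ref {
                atomic_int refcount;    /* 0: released, 1: no outstanding ubufs, >1: outstanding */
        };

        /* Drop one reference and return the counter value *after* the decrement. */
        static int ubuf_put(struct ubuf_ref *u)
        {
                /* atomic_fetch_sub returns the old value, so subtract 1 for the new one. */
                return atomic_fetch_sub(&u->refcount, 1) - 1;
        }

        int main(void)
        {
                struct ubuf_ref u;
                int cnt;

                atomic_init(&u.refcount, 3);    /* device reference plus two outstanding ubufs */

                cnt = ubuf_put(&u);             /* one zerocopy completion comes back */
                if (cnt <= 1 || !(cnt % 16))    /* same trigger condition the patch uses */
                        printf("kick the polling thread (cnt=%d)\n", cnt);
                else
                        printf("defer, %d references still outstanding\n", cnt);
                return 0;
        }

The point is that cnt is the post-decrement value obtained from the same atomic operation that dropped the reference, so concurrent completions can no longer all observe a stale high count and all decide to defer kicking the polling thread.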
1 parent 208ece1 commit 0ad8b48

File tree

1 file changed: +20 −21 lines

drivers/vhost/net.c

Lines changed: 20 additions & 21 deletions
@@ -70,7 +70,12 @@ enum {
 };
 
 struct vhost_net_ubuf_ref {
-        struct kref kref;
+        /* refcount follows semantics similar to kref:
+         *  0: object is released
+         *  1: no outstanding ubufs
+         * >1: outstanding ubufs
+         */
+        atomic_t refcount;
         wait_queue_head_t wait;
         struct vhost_virtqueue *vq;
 };
@@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int vq)
         vhost_net_zcopy_mask |= 0x1 << vq;
 }
 
-static void vhost_net_zerocopy_done_signal(struct kref *kref)
-{
-        struct vhost_net_ubuf_ref *ubufs;
-
-        ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
-        wake_up(&ubufs->wait);
-}
-
 static struct vhost_net_ubuf_ref *
 vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
 {
@@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
         ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
         if (!ubufs)
                 return ERR_PTR(-ENOMEM);
-        kref_init(&ubufs->kref);
+        atomic_set(&ubufs->refcount, 1);
         init_waitqueue_head(&ubufs->wait);
         ubufs->vq = vq;
         return ubufs;
 }
 
-static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
+static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
 {
-        kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
+        int r = atomic_sub_return(1, &ubufs->refcount);
+        if (unlikely(!r))
+                wake_up(&ubufs->wait);
+        return r;
 }
 
 static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
 {
-        kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
-        wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+        vhost_net_ubuf_put(ubufs);
+        wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
 }
 
 static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
@@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 {
         struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
         struct vhost_virtqueue *vq = ubufs->vq;
-        int cnt = atomic_read(&ubufs->kref.refcount);
+        int cnt;
 
         /* set len to mark this desc buffers done DMA */
         vq->heads[ubuf->desc].len = success ?
                 VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
-        vhost_net_ubuf_put(ubufs);
+        cnt = vhost_net_ubuf_put(ubufs);
 
         /*
          * Trigger polling thread if guest stopped submitting new buffers:
-         * in this case, the refcount after decrement will eventually reach 1
-         * so here it is 2.
+         * in this case, the refcount after decrement will eventually reach 1.
          * We also trigger polling periodically after each 16 packets
          * (the value 16 here is more or less arbitrary, it's tuned to trigger
          * less than 10% of times).
          */
-        if (cnt <= 2 || !(cnt % 16))
+        if (cnt <= 1 || !(cnt % 16))
                 vhost_poll_queue(&vq->poll);
 }
 
@@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *net)
                         msg.msg_control = ubuf;
                         msg.msg_controllen = sizeof(ubuf);
                         ubufs = nvq->ubufs;
-                        kref_get(&ubufs->kref);
+                        atomic_inc(&ubufs->refcount);
                         nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
                 } else {
                         msg.msg_control = NULL;
@@ -780,7 +779,7 @@ static void vhost_net_flush(struct vhost_net *n)
                 vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
                 mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
                 n->tx_flush = false;
-                kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
+                atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
                 mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
         }
 }
