
Commit 39e6c82

edumazet authored and davem330 committed
net: solve a NAPI race
While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by the TCP stack with a ~200 ms delay...

Since the timestamp was provided by the NIC, and my probe was added in
tcp_v4_rcv() while in the BH handler, I was confident it was not a
sender issue, or a drop in the network.

This would happen with a very low probability, but it hurt RPC
workloads.

A NAPI driver normally arms the IRQ after napi_complete_done(), after
NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab it.

The problem is that if another point in the stack grabs the
NAPI_STATE_SCHED bit while IRQs are not disabled, we might later have
an IRQ firing and finding this bit set, right before
napi_complete_done() clears it.

This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.

thread 1                                 thread 2 (could be on same cpu, or not)

// busy polling or napi_watchdog()
napi_schedule();
...
napi->poll()

device polling:
read 2 packets from ring buffer
                                         Additional 3rd packet is
                                         available.

                                         device hard irq

                                         // does nothing because
                                         // NAPI_STATE_SCHED bit is
                                         // owned by thread 1
                                         napi_schedule();

napi_complete_done(napi, 2);
rearm_irq();

Note that rearm_irq() will not force the device to send an additional
IRQ for the packet it already signaled (the 3rd packet in my example).

This patch adds a new NAPI_STATE_MISSED bit that napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED.

napi_complete_done() then properly reschedules the napi to make sure
we do not miss anything.

Since we manipulate multiple bits at once, use cmpxchg(), as in
sk_busy_loop(), to provide proper transactions.

In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep(): there is no need to set NAPI_STATE_MISSED from
that point.

In v3, I added more details to the changelog and cleared
NAPI_STATE_MISSED in busy_poll_stop().

In v4, I applied the ideas given by Alexander Duyck in the v3 review.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent b2d0fe3 commit 39e6c82
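
The cmpxchg() transaction described above can be illustrated outside the
kernel. Below is a minimal userspace sketch of the napi_schedule_prep()
state transition, modelled with C11 atomics instead of the kernel's
READ_ONCE()/cmpxchg(); the STATE_* constants and the program itself are
illustrative assumptions, not code from this patch.

/*
 * Minimal userspace model of the napi_schedule_prep() transition.
 * Assumption: C11 atomics stand in for the kernel's cmpxchg() loop.
 * Build: cc -std=c11 -O2 napi_sketch.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define STATE_SCHED   (1UL << 0)   /* poll is scheduled (owns the NAPI) */
#define STATE_MISSED  (1UL << 1)   /* a schedule raced with SCHED held  */
#define STATE_DISABLE (1UL << 2)   /* disable pending                   */

static _Atomic unsigned long napi_state;

static bool sketch_schedule_prep(void)
{
        unsigned long val = atomic_load(&napi_state);
        unsigned long new;

        do {
                if (val & STATE_DISABLE)
                        return false;
                new = val | STATE_SCHED;
                /*
                 * Branchless form of:
                 *     if (val & STATE_SCHED) new |= STATE_MISSED;
                 * (val & STATE_SCHED) / STATE_SCHED is 0 or 1, so the
                 * multiply sets MISSED exactly when SCHED was already owned.
                 */
                new |= (val & STATE_SCHED) / STATE_SCHED * STATE_MISSED;
        } while (!atomic_compare_exchange_weak(&napi_state, &val, new));

        return !(val & STATE_SCHED);    /* true => caller owns the poll */
}

int main(void)
{
        printf("first : %d\n", sketch_schedule_prep());   /* 1: grabbed SCHED */
        printf("second: %d\n", sketch_schedule_prep());   /* 0: MISSED now set */
        printf("state : 0x%lx\n", atomic_load(&napi_state)); /* 0x3 */
        return 0;
}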

File tree

2 files changed: 81 additions & 24 deletions

include/linux/netdevice.h

Lines changed: 9 additions & 20 deletions
@@ -330,6 +330,7 @@ struct napi_struct {

 enum {
        NAPI_STATE_SCHED,       /* Poll is scheduled */
+       NAPI_STATE_MISSED,      /* reschedule a napi */
        NAPI_STATE_DISABLE,     /* Disable pending */
        NAPI_STATE_NPSVC,       /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_HASHED,      /* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@ enum {
 };

 enum {
-       NAPIF_STATE_SCHED        = (1UL << NAPI_STATE_SCHED),
-       NAPIF_STATE_DISABLE      = (1UL << NAPI_STATE_DISABLE),
-       NAPIF_STATE_NPSVC        = (1UL << NAPI_STATE_NPSVC),
-       NAPIF_STATE_HASHED       = (1UL << NAPI_STATE_HASHED),
-       NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
-       NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+       NAPIF_STATE_SCHED        = BIT(NAPI_STATE_SCHED),
+       NAPIF_STATE_MISSED       = BIT(NAPI_STATE_MISSED),
+       NAPIF_STATE_DISABLE      = BIT(NAPI_STATE_DISABLE),
+       NAPIF_STATE_NPSVC        = BIT(NAPI_STATE_NPSVC),
+       NAPIF_STATE_HASHED       = BIT(NAPI_STATE_HASHED),
+       NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+       NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 };

 enum gro_result {
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n)
        return test_bit(NAPI_STATE_DISABLE, &n->state);
 }

-/**
- *     napi_schedule_prep - check if NAPI can be scheduled
- *     @n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running.  This is used as a condition variable to
- * insure only one NAPI poll instance runs.  We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
-       return !napi_disable_pending(n) &&
-               !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);

 /**
  *     napi_schedule - schedule NAPI poll
net/core/dev.c

Lines changed: 72 additions & 4 deletions
@@ -4883,6 +4883,39 @@ void __napi_schedule(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule);

+/**
+ *     napi_schedule_prep - check if napi can be scheduled
+ *     @n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running.  This is used as a condition variable to
+ * insure only one NAPI poll instance runs.  We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+       unsigned long val, new;
+
+       do {
+               val = READ_ONCE(n->state);
+               if (unlikely(val & NAPIF_STATE_DISABLE))
+                       return false;
+               new = val | NAPIF_STATE_SCHED;
+
+               /* Sets STATE_MISSED bit if STATE_SCHED was already set
+                * This was suggested by Alexander Duyck, as compiler
+                * emits better code than :
+                * if (val & NAPIF_STATE_SCHED)
+                *     new |= NAPIF_STATE_MISSED;
+                */
+               new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+                                                  NAPIF_STATE_MISSED;
+       } while (cmpxchg(&n->state, val, new) != val);
+
+       return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
 /**
  * __napi_schedule_irqoff - schedule for receive
  * @n: entry to schedule
@@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);

 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
-       unsigned long flags;
+       unsigned long flags, val, new;

        /*
         * 1) Don't let napi dequeue from the cpu poll list
@@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
                list_del_init(&n->poll_list);
                local_irq_restore(flags);
        }
-       WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+
+       do {
+               val = READ_ONCE(n->state);
+
+               WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+               new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+               /* If STATE_MISSED was set, leave STATE_SCHED set,
+                * because we will call napi->poll() one more time.
+                * This C code was suggested by Alexander Duyck to help gcc.
+                */
+               new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+                                                   NAPIF_STATE_SCHED;
+       } while (cmpxchg(&n->state, val, new) != val);
+
+       if (unlikely(val & NAPIF_STATE_MISSED)) {
+               __napi_schedule(n);
+               return false;
+       }
+
        return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
@@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 {
        int rc;

+       /* Busy polling means there is a high chance device driver hard irq
+        * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+        * set in napi_schedule_prep().
+        * Since we are about to call napi->poll() once more, we can safely
+        * clear NAPI_STATE_MISSED.
+        *
+        * Note: x86 could use a single "lock and ..." instruction
+        * to perform these two clear_bit()
+        */
+       clear_bit(NAPI_STATE_MISSED, &napi->state);
        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

        local_bh_disable();
@@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
        struct napi_struct *napi;

        napi = container_of(timer, struct napi_struct, timer);
-       if (napi->gro_list)
-               napi_schedule_irqoff(napi);
+
+       /* Note : we use a relaxed variant of napi_schedule_prep() not setting
+        * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+        */
+       if (napi->gro_list && !napi_disable_pending(napi) &&
+           !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+               __napi_schedule_irqoff(napi);

        return HRTIMER_NORESTART;
 }
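
For driver authors, the visible consequence of this patch is
napi_complete_done()'s return value: it now returns false when
NAPI_STATE_MISSED forced a reschedule, so the device IRQ should be
re-armed only when it returns true. A hedged sketch of such a poll
routine follows; every foo_* identifier is hypothetical and not part
of this patch.

#include <linux/netdevice.h>

struct foo_priv {
        struct napi_struct napi;
        /* ... hypothetical device state ... */
};

/* Hypothetical helpers: clean up to @budget RX descriptors; re-arm the IRQ */
int foo_clean_rx_ring(struct foo_priv *priv, int budget);
void foo_enable_rx_irq(struct foo_priv *priv);

static int foo_poll(struct napi_struct *napi, int budget)
{
        struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
        int work_done = foo_clean_rx_ring(priv, budget);

        /*
         * Re-arm the IRQ only if napi_complete_done() really cleared
         * NAPI_STATE_SCHED; if it returned false, the napi was
         * rescheduled because NAPI_STATE_MISSED was set, and poll()
         * will run again shortly.
         */
        if (work_done < budget && napi_complete_done(napi, work_done))
                foo_enable_rx_irq(priv);

        return work_done;
}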
