Skip to content

Commit 2b74e2a

Browse files
apopple authored and mpe committed
powerpc/powernv/npu: Fix deadlock in mmio_invalidate()
When sending TLB invalidates to the NPU we need to send extra flushes due to a hardware issue. The original implementation would lock the all the ATSD MMIO registers sequentially before unlocking and relocking each of them sequentially to do the extra flush. This introduced a deadlock as it is possible for one thread to hold one ATSD register whilst waiting for another register to be freed while the other thread is holding that register waiting for the one in the first thread to be freed. For example if there are two threads and two ATSD registers: Thread A Thread B ---------------------- Acquire 1 Acquire 2 Release 1 Acquire 1 Wait 1 Wait 2 Both threads will be stuck waiting to acquire a register resulting in an RCU stall warning or soft lockup. This patch solves the deadlock by refactoring the code to ensure registers are not released between flushes and to ensure all registers are either acquired or released together and in order. Fixes: bbd5ff5 ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD") Signed-off-by: Alistair Popple <alistair@popple.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent c554ac9 commit 2b74e2a

File tree

1 file changed

+141
-88
lines changed

1 file changed

+141
-88
lines changed

arch/powerpc/platforms/powernv/npu-dma.c

Lines changed: 141 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,11 @@ struct npu_context {
410410
void *priv;
411411
};
412412

413+
struct mmio_atsd_reg {
414+
struct npu *npu;
415+
int reg;
416+
};
417+
413418
/*
414419
* Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
415420
* if none are available.
@@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu)
419424
int i;
420425

421426
for (i = 0; i < npu->mmio_atsd_count; i++) {
422-
if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
427+
if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
423428
return i;
424429
}
425430

@@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu)
428433

429434
static void put_mmio_atsd_reg(struct npu *npu, int reg)
430435
{
431-
clear_bit(reg, &npu->mmio_atsd_usage);
436+
clear_bit_unlock(reg, &npu->mmio_atsd_usage);
432437
}
433438

434439
/* MMIO ATSD register offsets */
435440
#define XTS_ATSD_AVA 1
436441
#define XTS_ATSD_STAT 2
437442

438-
static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
439-
unsigned long va)
443+
static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
444+
unsigned long launch, unsigned long va)
440445
{
441-
int mmio_atsd_reg;
442-
443-
do {
444-
mmio_atsd_reg = get_mmio_atsd_reg(npu);
445-
cpu_relax();
446-
} while (mmio_atsd_reg < 0);
446+
struct npu *npu = mmio_atsd_reg->npu;
447+
int reg = mmio_atsd_reg->reg;
447448

448449
__raw_writeq(cpu_to_be64(va),
449-
npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
450+
npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
450451
eieio();
451-
__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
452-
453-
return mmio_atsd_reg;
452+
__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]);
454453
}
455454

456-
static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
455+
static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
456+
unsigned long pid, bool flush)
457457
{
458+
int i;
458459
unsigned long launch;
459460

460-
/* IS set to invalidate matching PID */
461-
launch = PPC_BIT(12);
461+
for (i = 0; i <= max_npu2_index; i++) {
462+
if (mmio_atsd_reg[i].reg < 0)
463+
continue;
464+
465+
/* IS set to invalidate matching PID */
466+
launch = PPC_BIT(12);
462467

463-
/* PRS set to process-scoped */
464-
launch |= PPC_BIT(13);
468+
/* PRS set to process-scoped */
469+
launch |= PPC_BIT(13);
465470

466-
/* AP */
467-
launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
471+
/* AP */
472+
launch |= (u64)
473+
mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
468474

469-
/* PID */
470-
launch |= pid << PPC_BITLSHIFT(38);
475+
/* PID */
476+
launch |= pid << PPC_BITLSHIFT(38);
471477

472-
/* No flush */
473-
launch |= !flush << PPC_BITLSHIFT(39);
478+
/* No flush */
479+
launch |= !flush << PPC_BITLSHIFT(39);
474480

475-
/* Invalidating the entire process doesn't use a va */
476-
return mmio_launch_invalidate(npu, launch, 0);
481+
/* Invalidating the entire process doesn't use a va */
482+
mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
483+
}
477484
}
478485

479-
static int mmio_invalidate_va(struct npu *npu, unsigned long va,
480-
unsigned long pid, bool flush)
486+
static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
487+
unsigned long va, unsigned long pid, bool flush)
481488
{
489+
int i;
482490
unsigned long launch;
483491

484-
/* IS set to invalidate target VA */
485-
launch = 0;
492+
for (i = 0; i <= max_npu2_index; i++) {
493+
if (mmio_atsd_reg[i].reg < 0)
494+
continue;
495+
496+
/* IS set to invalidate target VA */
497+
launch = 0;
486498

487-
/* PRS set to process scoped */
488-
launch |= PPC_BIT(13);
499+
/* PRS set to process scoped */
500+
launch |= PPC_BIT(13);
489501

490-
/* AP */
491-
launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
502+
/* AP */
503+
launch |= (u64)
504+
mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
492505

493-
/* PID */
494-
launch |= pid << PPC_BITLSHIFT(38);
506+
/* PID */
507+
launch |= pid << PPC_BITLSHIFT(38);
495508

496-
/* No flush */
497-
launch |= !flush << PPC_BITLSHIFT(39);
509+
/* No flush */
510+
launch |= !flush << PPC_BITLSHIFT(39);
498511

499-
return mmio_launch_invalidate(npu, launch, va);
512+
mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
513+
}
500514
}
501515

502516
#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
503517

504-
struct mmio_atsd_reg {
505-
struct npu *npu;
506-
int reg;
507-
};
508-
509518
static void mmio_invalidate_wait(
510-
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
519+
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
511520
{
512521
struct npu *npu;
513522
int i, reg;
@@ -522,16 +531,67 @@ static void mmio_invalidate_wait(
522531
reg = mmio_atsd_reg[i].reg;
523532
while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
524533
cpu_relax();
534+
}
535+
}
536+
537+
/*
538+
* Acquires all the address translation shootdown (ATSD) registers required to
539+
* launch an ATSD on all links this npu_context is active on.
540+
*/
541+
static void acquire_atsd_reg(struct npu_context *npu_context,
542+
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
543+
{
544+
int i, j;
545+
struct npu *npu;
546+
struct pci_dev *npdev;
547+
struct pnv_phb *nphb;
525548

526-
put_mmio_atsd_reg(npu, reg);
549+
for (i = 0; i <= max_npu2_index; i++) {
550+
mmio_atsd_reg[i].reg = -1;
551+
for (j = 0; j < NV_MAX_LINKS; j++) {
552+
/*
553+
* There are no ordering requirements with respect to
554+
* the setup of struct npu_context, but to ensure
555+
* consistent behaviour we need to ensure npdev[][] is
556+
* only read once.
557+
*/
558+
npdev = READ_ONCE(npu_context->npdev[i][j]);
559+
if (!npdev)
560+
continue;
527561

562+
nphb = pci_bus_to_host(npdev->bus)->private_data;
563+
npu = &nphb->npu;
564+
mmio_atsd_reg[i].npu = npu;
565+
mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
566+
while (mmio_atsd_reg[i].reg < 0) {
567+
mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
568+
cpu_relax();
569+
}
570+
break;
571+
}
572+
}
573+
}
574+
575+
/*
576+
* Release previously acquired ATSD registers. To avoid deadlocks the registers
577+
* must be released in the same order they were acquired above in
578+
* acquire_atsd_reg.
579+
*/
580+
static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
581+
{
582+
int i;
583+
584+
for (i = 0; i <= max_npu2_index; i++) {
528585
/*
529-
* The GPU requires two flush ATSDs to ensure all entries have
530-
* been flushed. We use PID 0 as it will never be used for a
531-
* process on the GPU.
586+
* We can't rely on npu_context->npdev[][] being the same here
587+
* as when acquire_atsd_reg() was called, hence we use the
588+
* values stored in mmio_atsd_reg during the acquire phase
589+
* rather than re-reading npdev[][].
532590
*/
533-
if (flush)
534-
mmio_invalidate_pid(npu, 0, true);
591+
if (mmio_atsd_reg[i].reg < 0)
592+
continue;
593+
594+
put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
535595
}
536596
}
537597

@@ -542,10 +602,6 @@ static void mmio_invalidate_wait(
542602
static void mmio_invalidate(struct npu_context *npu_context, int va,
543603
unsigned long address, bool flush)
544604
{
545-
int i, j;
546-
struct npu *npu;
547-
struct pnv_phb *nphb;
548-
struct pci_dev *npdev;
549605
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
550606
unsigned long pid = npu_context->mm->context.id;
551607

@@ -561,37 +617,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
561617
* Loop over all the NPUs this process is active on and launch
562618
* an invalidate.
563619
*/
564-
for (i = 0; i <= max_npu2_index; i++) {
565-
mmio_atsd_reg[i].reg = -1;
566-
for (j = 0; j < NV_MAX_LINKS; j++) {
567-
npdev = npu_context->npdev[i][j];
568-
if (!npdev)
569-
continue;
570-
571-
nphb = pci_bus_to_host(npdev->bus)->private_data;
572-
npu = &nphb->npu;
573-
mmio_atsd_reg[i].npu = npu;
574-
575-
if (va)
576-
mmio_atsd_reg[i].reg =
577-
mmio_invalidate_va(npu, address, pid,
578-
flush);
579-
else
580-
mmio_atsd_reg[i].reg =
581-
mmio_invalidate_pid(npu, pid, flush);
582-
583-
/*
584-
* The NPU hardware forwards the shootdown to all GPUs
585-
* so we only have to launch one shootdown per NPU.
586-
*/
587-
break;
588-
}
620+
acquire_atsd_reg(npu_context, mmio_atsd_reg);
621+
if (va)
622+
mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
623+
else
624+
mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
625+
626+
mmio_invalidate_wait(mmio_atsd_reg);
627+
if (flush) {
628+
/*
629+
* The GPU requires two flush ATSDs to ensure all entries have
630+
* been flushed. We use PID 0 as it will never be used for a
631+
* process on the GPU.
632+
*/
633+
mmio_invalidate_pid(mmio_atsd_reg, 0, true);
634+
mmio_invalidate_wait(mmio_atsd_reg);
635+
mmio_invalidate_pid(mmio_atsd_reg, 0, true);
636+
mmio_invalidate_wait(mmio_atsd_reg);
589637
}
590-
591-
mmio_invalidate_wait(mmio_atsd_reg, flush);
592-
if (flush)
593-
/* Wait for the flush to complete */
594-
mmio_invalidate_wait(mmio_atsd_reg, false);
638+
release_atsd_reg(mmio_atsd_reg);
595639
}
596640

597641
static void pnv_npu2_mn_release(struct mmu_notifier *mn,
@@ -726,7 +770,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
726770
if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
727771
&nvlink_index)))
728772
return ERR_PTR(-ENODEV);
729-
npu_context->npdev[npu->index][nvlink_index] = npdev;
773+
774+
/*
775+
* npdev is a pci_dev pointer setup by the PCI code. We assign it to
776+
* npdev[][] to indicate to the mmu notifiers that an invalidation
777+
* should also be sent over this nvlink. The notifiers don't use any
778+
* other fields in npu_context, so we just need to ensure that when they
779+
* deference npu_context->npdev[][] it is either a valid pointer or
780+
* NULL.
781+
*/
782+
WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
730783

731784
if (!nphb->npu.nmmu_flush) {
732785
/*
@@ -778,7 +831,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
778831
if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
779832
&nvlink_index)))
780833
return;
781-
npu_context->npdev[npu->index][nvlink_index] = NULL;
834+
WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
782835
opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
783836
PCI_DEVID(gpdev->bus->number, gpdev->devfn));
784837
kref_put(&npu_context->kref, pnv_npu2_release_context);

0 commit comments

Comments
 (0)