
Commit 9d53cae

jchu314atgithub authored and davem330 committed
sparc64: Measure receiver forward progress to avoid send mondo timeout
A large sun4v SPARC system may have moments of intensive xcall activities, usually caused by unmapping many pages on many CPUs concurrently. This can flood receivers with CPU mondo interrupts for an extended period, causing some unlucky senders to hit send-mondo timeout. This problem gets worse as cpu count increases, because sometimes mappings must be invalidated on all CPUs, and sometimes all CPUs may gang up on a single CPU.

But a busy system is not a broken system. In the above scenario, as long as the receiver is making forward progress processing mondo interrupts, the sender should continue to retry.

This patch implements the receiver's forward-progress meter by introducing a per-cpu counter, 'cpu_mondo_counter[cpu]', where 'cpu' is in the range of 0..NR_CPUS. The receiver increments its counter as soon as it receives a mondo, and the sender tracks the receiver's counter. If the receiver has stopped making forward progress when the retry limit is reached, the sender declares send-mondo-timeout and panics; otherwise, the receiver is allowed to keep making forward progress.

In addition, it has been observed that PCIe hotplug events generate Correctable Errors that are handled by the hypervisor and then the OS. The hypervisor 'borrows' a guest cpu strand briefly to provide the service. If that cpu strand is simultaneously the only cpu targeted by a mondo, it may not be available for the mondo within 20 msec, causing a SUN4V mondo timeout. Since 1 second appears to be the agreed wait time between hypervisor and guest OS, this patch makes that adjustment.

Orabug: 25476541
Orabug: 26417466

Signed-off-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Anthony Yznaga <anthony.yznaga@oracle.com>
Reviewed-by: Rob Gardner <rob.gardner@oracle.com>
Reviewed-by: Thomas Tai <thomas.tai@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
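The retry rule can be modeled in isolation. Below is a minimal standalone C sketch, not kernel code: send_mondo_once() is a hypothetical stand-in for the sun4v_cpu_mondo_send() hypervisor call, and the receiver's counter activity is simulated. It shows a receiver that needs far more attempts than MONDO_RETRY_LIMIT still succeeding, because its counter keeps advancing:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MONDO_RETRY_LIMIT 500000          /* same value the patch uses */
#define NR_CPUS 4                         /* illustrative only */

/* The receiver bumps its slot for every mondo it handles
 * (see the sun4v_ivec.S hunk below).
 */
static volatile uint64_t cpu_mondo_counter[NR_CPUS];

/* Toy stand-in for sun4v_cpu_mondo_send(): the simulated receiver stays
 * busy with other mondos (its counter advances, ours isn't taken) for a
 * million attempts, then finally accepts.
 */
static bool send_mondo_once(int cpu)
{
        static long attempts;

        cpu_mondo_counter[cpu]++;         /* receiver keeps making progress */
        return ++attempts > 1000000;
}

/* Sender-side rule: reset the retry budget whenever the receiver's
 * counter has advanced, so a busy-but-alive cpu never trips the timeout.
 */
static bool send_mondo_with_progress(int cpu)
{
        uint64_t seen = cpu_mondo_counter[cpu];
        int retries = 0;

        while (!send_mondo_once(cpu)) {
                bool busy = cpu_mondo_counter[cpu] > seen;

                seen = cpu_mondo_counter[cpu];
                if (busy)
                        retries = 0;      /* forward progress: keep retrying */
                else if (++retries > MONDO_RETRY_LIMIT)
                        return false;     /* truly stuck: declare timeout */
        }
        return true;
}

int main(void)
{
        printf("delivered under heavy load: %s\n",
               send_mondo_with_progress(0) ? "yes" : "no");
        return 0;
}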
1 parent 2ad6714 commit 9d53cae

File tree

4 files changed: +132 additions, -70 deletions


arch/sparc/include/asm/trap_block.h

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS];
 void init_cur_cpu_trap(struct thread_info *);
 void setup_tba(void);
 extern int ncpus_probed;
+extern u64 cpu_mondo_counter[NR_CPUS];
 
 unsigned long real_hard_smp_processor_id(void);

arch/sparc/kernel/smp_64.c

Lines changed: 115 additions & 70 deletions
@@ -622,117 +622,162 @@ static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 	}
 }
 
-/* Multi-cpu list version.  */
+#define CPU_MONDO_COUNTER(cpuid)	(cpu_mondo_counter[cpuid])
+#define MONDO_USEC_WAIT_MIN		2
+#define MONDO_USEC_WAIT_MAX		100
+#define MONDO_RETRY_LIMIT		500000
+
+/* Multi-cpu list version.
+ *
+ * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
+ * Sometimes not all cpus receive the mondo, requiring us to re-send
+ * it until all cpus have received it, or until cpus are truly stuck,
+ * unable to receive the mondo, and we time out.
+ * Occasionally a target cpu strand is borrowed briefly by the hypervisor
+ * to perform guest service, such as PCIe error handling. Considering
+ * the service time, an overall wait of 1 second is reasonable for 1 cpu.
+ * Two in-between mondo check wait times are defined here: 2 usec for a
+ * quick single-cpu turnaround and up to 100 usec for large cpu counts.
+ * Delivering mondos to a large number of cpus can take longer; we adjust
+ * the retry count as long as target cpus are making forward progress.
+ */
 static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 {
-	int retries, this_cpu, prev_sent, i, saw_cpu_error;
+	int this_cpu, tot_cpus, prev_sent, i, rem;
+	int usec_wait, retries, tot_retries;
+	u16 first_cpu = 0xffff;
+	unsigned long xc_rcvd = 0;
 	unsigned long status;
+	int ecpuerror_id = 0;
+	int enocpu_id = 0;
 	u16 *cpu_list;
+	u16 cpu;
 
 	this_cpu = smp_processor_id();
-
 	cpu_list = __va(tb->cpu_list_pa);
-
-	saw_cpu_error = 0;
-	retries = 0;
+	usec_wait = cnt * MONDO_USEC_WAIT_MIN;
+	if (usec_wait > MONDO_USEC_WAIT_MAX)
+		usec_wait = MONDO_USEC_WAIT_MAX;
+	retries = tot_retries = 0;
+	tot_cpus = cnt;
 	prev_sent = 0;
+
 	do {
-		int forward_progress, n_sent;
+		int n_sent, mondo_delivered, target_cpu_busy;
 
 		status = sun4v_cpu_mondo_send(cnt,
 					      tb->cpu_list_pa,
 					      tb->cpu_mondo_block_pa);
 
 		/* HV_EOK means all cpus received the xcall, we're done.  */
 		if (likely(status == HV_EOK))
-			break;
+			goto xcall_done;
+
+		/* If not one of these non-fatal errors, panic. */
+		if (unlikely((status != HV_EWOULDBLOCK) &&
+		    (status != HV_ECPUERROR) &&
+		    (status != HV_ENOCPU)))
+			goto fatal_errors;
 
 		/* First, see if we made any forward progress.
+		 *
+		 * Go through the cpu_list, count the target cpus that have
+		 * received our mondo (n_sent), and those that did not (rem).
+		 * Re-pack cpu_list with the cpus that remain to be retried
+		 * at the front - this simplifies tracking the truly stalled
+		 * cpus.
 		 *
 		 * The hypervisor indicates successful sends by setting
 		 * cpu list entries to the value 0xffff.
+		 *
+		 * EWOULDBLOCK means some target cpus did not receive the
+		 * mondo and retrying usually helps.
+		 *
+		 * ECPUERROR means at least one target cpu is in error state;
+		 * it's usually safe to skip the faulty cpu and retry.
+		 *
+		 * ENOCPU means one of the target cpus doesn't belong to the
+		 * domain, perhaps because it was offlined, which is
+		 * unexpected but not fatal; it's okay to skip the offlined
+		 * cpu.
 		 */
+		rem = 0;
 		n_sent = 0;
 		for (i = 0; i < cnt; i++) {
-			if (likely(cpu_list[i] == 0xffff))
+			cpu = cpu_list[i];
+			if (likely(cpu == 0xffff)) {
 				n_sent++;
+			} else if ((status == HV_ECPUERROR) &&
+				   (sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
+				ecpuerror_id = cpu + 1;
+			} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
+				enocpu_id = cpu + 1;
+			} else {
+				cpu_list[rem++] = cpu;
+			}
 		}
 
-		forward_progress = 0;
-		if (n_sent > prev_sent)
-			forward_progress = 1;
+		/* No cpu remained, we're done. */
+		if (rem == 0)
+			break;
 
-		prev_sent = n_sent;
+		/* Otherwise, update the cpu count for retry. */
+		cnt = rem;
 
-		/* If we get a HV_ECPUERROR, then one or more of the cpus
-		 * in the list are in error state.  Use the cpu_state()
-		 * hypervisor call to find out which cpus are in error state.
+		/* Record the overall number of mondos received by the
+		 * first of the remaining cpus.
 		 */
-		if (unlikely(status == HV_ECPUERROR)) {
-			for (i = 0; i < cnt; i++) {
-				long err;
-				u16 cpu;
+		if (first_cpu != cpu_list[0]) {
+			first_cpu = cpu_list[0];
+			xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
+		}
 
-				cpu = cpu_list[i];
-				if (cpu == 0xffff)
-					continue;
+		/* Was any mondo delivered successfully? */
+		mondo_delivered = (n_sent > prev_sent);
+		prev_sent = n_sent;
 
-				err = sun4v_cpu_state(cpu);
-				if (err == HV_CPU_STATE_ERROR) {
-					saw_cpu_error = (cpu + 1);
-					cpu_list[i] = 0xffff;
-				}
-			}
-		} else if (unlikely(status != HV_EWOULDBLOCK))
-			goto fatal_mondo_error;
+		/* Or was any target cpu busy processing other mondos? */
+		target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
+		xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
 
-		/* Don't bother rewriting the CPU list, just leave the
-		 * 0xffff and non-0xffff entries in there and the
-		 * hypervisor will do the right thing.
-		 *
-		 * Only advance timeout state if we didn't make any
-		 * forward progress.
+		/* The retry count measures lack of progress; if we're
+		 * making progress, reset it.
 		 */
-		if (unlikely(!forward_progress)) {
-			if (unlikely(++retries > 10000))
-				goto fatal_mondo_timeout;
-
-			/* Delay a little bit to let other cpus catch up
-			 * on their cpu mondo queue work.
-			 */
-			udelay(2 * cnt);
+		if (likely(mondo_delivered || target_cpu_busy)) {
+			tot_retries += retries;
+			retries = 0;
+		} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
+			goto fatal_mondo_timeout;
 		}
-	} while (1);
 
-	if (unlikely(saw_cpu_error))
-		goto fatal_mondo_cpu_error;
+		/* Delay a little bit to let other cpus catch up on
+		 * their cpu mondo queue work.
+		 */
+		if (!mondo_delivered)
+			udelay(usec_wait);
 
-	return;
+		retries++;
+	} while (1);
 
-fatal_mondo_cpu_error:
-	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
-	       "(including %d) were in error state\n",
-	       this_cpu, saw_cpu_error - 1);
+xcall_done:
+	if (unlikely(ecpuerror_id > 0)) {
+		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
+			this_cpu, ecpuerror_id - 1);
+	} else if (unlikely(enocpu_id > 0)) {
+		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
+			this_cpu, enocpu_id - 1);
+	}
 	return;
 
+fatal_errors:
+	/* Fatal errors include bad alignment, etc. */
+	pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
+		this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+	panic("Unexpected SUN4V mondo error %lu\n", status);
+
 fatal_mondo_timeout:
-	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
-	       " progress after %d retries.\n",
-	       this_cpu, retries);
-	goto dump_cpu_list_and_out;
-
-fatal_mondo_error:
-	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
-	       this_cpu, status);
-	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
-	       "mondo_block_pa(%lx)\n",
-	       this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
-
-dump_cpu_list_and_out:
-	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
-	for (i = 0; i < cnt; i++)
-		printk("%u ", cpu_list[i]);
-	printk("]\n");
+	/* Some cpus were non-responsive to the cpu mondo. */
+	pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
+		this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
+	panic("SUN4V mondo timeout panic\n");
 }
 
 static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
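A quick sanity check on the constants above (a back-of-the-envelope sketch, not kernel code): with a single stalled target cpu, usec_wait comes to 2 usec, and the sender tolerates MONDO_RETRY_LIMIT no-progress retries; the worst case works out to about 1 second, matching the hypervisor/guest wait time cited in the commit message:

#include <stdio.h>

/* Constants from the hunk above. */
#define MONDO_USEC_WAIT_MIN 2
#define MONDO_USEC_WAIT_MAX 100
#define MONDO_RETRY_LIMIT   500000

int main(void)
{
        int cnt = 1;                              /* single target cpu */
        int usec_wait = cnt * MONDO_USEC_WAIT_MIN;

        if (usec_wait > MONDO_USEC_WAIT_MAX)
                usec_wait = MONDO_USEC_WAIT_MAX;

        /* Worst case: every retry delays and none makes progress. */
        printf("worst-case wait: ~%.1f s\n",
               (double)MONDO_RETRY_LIMIT * usec_wait / 1e6);
        return 0;
}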

arch/sparc/kernel/sun4v_ivec.S

Lines changed: 15 additions & 0 deletions
@@ -26,6 +26,21 @@ sun4v_cpu_mondo:
 	ldxa	[%g0] ASI_SCRATCHPAD, %g4
 	sub	%g4, TRAP_PER_CPU_FAULT_INFO, %g4
 
+	/* Get smp_processor_id() into %g3 */
+	sethi	%hi(trap_block), %g5
+	or	%g5, %lo(trap_block), %g5
+	sub	%g4, %g5, %g3
+	srlx	%g3, TRAP_BLOCK_SZ_SHIFT, %g3
+
+	/* Increment cpu_mondo_counter[smp_processor_id()] */
+	sethi	%hi(cpu_mondo_counter), %g5
+	or	%g5, %lo(cpu_mondo_counter), %g5
+	sllx	%g3, 3, %g3
+	add	%g5, %g3, %g5
+	ldx	[%g5], %g3
+	add	%g3, 1, %g3
+	stx	%g3, [%g5]
+
 	/* Get CPU mondo queue base phys address into %g7.  */
 	ldx	[%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
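For readers less fluent in SPARC assembly, here is a standalone C sketch of what the added instructions compute (the NR_CPUS and TRAP_BLOCK_SZ_SHIFT values below are illustrative assumptions, not the kernel's): recover the cpu id from the trap_block pointer already held in %g4, then increment that cpu's slot in cpu_mondo_counter[]:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS             8   /* illustrative */
#define TRAP_BLOCK_SZ_SHIFT 7   /* log2 of sizeof(struct trap_per_cpu); assumed here */

struct trap_per_cpu { unsigned char pad[1 << TRAP_BLOCK_SZ_SHIFT]; };

static struct trap_per_cpu trap_block[NR_CPUS];
static uint64_t cpu_mondo_counter[NR_CPUS];

/* The sethi/or/sub/srlx sequence: byte offset into trap_block[],
 * shifted down to a cpu id. The sllx/add/ldx/add/stx sequence:
 * an 8-byte indexed load, increment, and store.
 */
static void count_cpu_mondo(struct trap_per_cpu *tb)
{
        long cpu = ((char *)tb - (char *)trap_block) >> TRAP_BLOCK_SZ_SHIFT;

        cpu_mondo_counter[cpu]++;
}

int main(void)
{
        count_cpu_mondo(&trap_block[3]);
        printf("cpu 3 mondo count = %llu\n",
               (unsigned long long)cpu_mondo_counter[3]);
        return 0;
}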

arch/sparc/kernel/traps_64.c

Lines changed: 1 addition & 0 deletions
@@ -2733,6 +2733,7 @@ void do_getpsr(struct pt_regs *regs)
 	}
 }
 
+u64 cpu_mondo_counter[NR_CPUS] = {0};
 struct trap_per_cpu trap_block[NR_CPUS];
 EXPORT_SYMBOL(trap_block);
