Skip to content

Commit 2a485ad

Browse files
Gavin Shan authored and ozbenh committed
powerpc/powernv: Drop PHB operation next_error()
The patch drops the PHB EEH operation next_error() and merges its logic into eeh_ops::next_error().

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
1 parent 40ae5f6 commit 2a485ad

File tree

3 files changed

+327
-359
lines changed

3 files changed

+327
-359
lines changed

arch/powerpc/platforms/powernv/eeh-ioda.c

Lines changed: 0 additions & 351 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,6 @@
3434
#include "powernv.h"
3535
#include "pci.h"
3636

37-
static void ioda_eeh_phb_diag(struct eeh_pe *pe)
38-
{
39-
struct pnv_phb *phb = pe->phb->private_data;
40-
long rc;
41-
42-
rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data,
43-
PNV_PCI_DIAG_BUF_SIZE);
44-
if (rc != OPAL_SUCCESS)
45-
pr_warn("%s: Failed to get diag-data for PHB#%x (%ld)\n",
46-
__func__, pe->phb->global_number, rc);
47-
}
48-
4937
static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
5038
{
5139
s64 rc = OPAL_HARDWARE;
@@ -280,345 +268,6 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
280268
return ret;
281269
}
282270

283-
static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
284-
{
285-
/* GEM */
286-
if (data->gemXfir || data->gemRfir ||
287-
data->gemRirqfir || data->gemMask || data->gemRwof)
288-
pr_info(" GEM: %016llx %016llx %016llx %016llx %016llx\n",
289-
be64_to_cpu(data->gemXfir),
290-
be64_to_cpu(data->gemRfir),
291-
be64_to_cpu(data->gemRirqfir),
292-
be64_to_cpu(data->gemMask),
293-
be64_to_cpu(data->gemRwof));
294-
295-
/* LEM */
296-
if (data->lemFir || data->lemErrMask ||
297-
data->lemAction0 || data->lemAction1 || data->lemWof)
298-
pr_info(" LEM: %016llx %016llx %016llx %016llx %016llx\n",
299-
be64_to_cpu(data->lemFir),
300-
be64_to_cpu(data->lemErrMask),
301-
be64_to_cpu(data->lemAction0),
302-
be64_to_cpu(data->lemAction1),
303-
be64_to_cpu(data->lemWof));
304-
}
305-
306-
static void ioda_eeh_hub_diag(struct pci_controller *hose)
307-
{
308-
struct pnv_phb *phb = hose->private_data;
309-
struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag;
310-
long rc;
311-
312-
rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
313-
if (rc != OPAL_SUCCESS) {
314-
pr_warn("%s: Failed to get HUB#%llx diag-data (%ld)\n",
315-
__func__, phb->hub_id, rc);
316-
return;
317-
}
318-
319-
switch (data->type) {
320-
case OPAL_P7IOC_DIAG_TYPE_RGC:
321-
pr_info("P7IOC diag-data for RGC\n\n");
322-
ioda_eeh_hub_diag_common(data);
323-
if (data->rgc.rgcStatus || data->rgc.rgcLdcp)
324-
pr_info(" RGC: %016llx %016llx\n",
325-
be64_to_cpu(data->rgc.rgcStatus),
326-
be64_to_cpu(data->rgc.rgcLdcp));
327-
break;
328-
case OPAL_P7IOC_DIAG_TYPE_BI:
329-
pr_info("P7IOC diag-data for BI %s\n\n",
330-
data->bi.biDownbound ? "Downbound" : "Upbound");
331-
ioda_eeh_hub_diag_common(data);
332-
if (data->bi.biLdcp0 || data->bi.biLdcp1 ||
333-
data->bi.biLdcp2 || data->bi.biFenceStatus)
334-
pr_info(" BI: %016llx %016llx %016llx %016llx\n",
335-
be64_to_cpu(data->bi.biLdcp0),
336-
be64_to_cpu(data->bi.biLdcp1),
337-
be64_to_cpu(data->bi.biLdcp2),
338-
be64_to_cpu(data->bi.biFenceStatus));
339-
break;
340-
case OPAL_P7IOC_DIAG_TYPE_CI:
341-
pr_info("P7IOC diag-data for CI Port %d\n\n",
342-
data->ci.ciPort);
343-
ioda_eeh_hub_diag_common(data);
344-
if (data->ci.ciPortStatus || data->ci.ciPortLdcp)
345-
pr_info(" CI: %016llx %016llx\n",
346-
be64_to_cpu(data->ci.ciPortStatus),
347-
be64_to_cpu(data->ci.ciPortLdcp));
348-
break;
349-
case OPAL_P7IOC_DIAG_TYPE_MISC:
350-
pr_info("P7IOC diag-data for MISC\n\n");
351-
ioda_eeh_hub_diag_common(data);
352-
break;
353-
case OPAL_P7IOC_DIAG_TYPE_I2C:
354-
pr_info("P7IOC diag-data for I2C\n\n");
355-
ioda_eeh_hub_diag_common(data);
356-
break;
357-
default:
358-
pr_warn("%s: Invalid type of HUB#%llx diag-data (%d)\n",
359-
__func__, phb->hub_id, data->type);
360-
}
361-
}
362-
363-
static int ioda_eeh_get_pe(struct pci_controller *hose,
364-
u16 pe_no, struct eeh_pe **pe)
365-
{
366-
struct pnv_phb *phb = hose->private_data;
367-
struct pnv_ioda_pe *pnv_pe;
368-
struct eeh_pe *dev_pe;
369-
struct eeh_dev edev;
370-
371-
/*
372-
* If PHB supports compound PE, to fetch
373-
* the master PE because slave PE is invisible
374-
* to EEH core.
375-
*/
376-
pnv_pe = &phb->ioda.pe_array[pe_no];
377-
if (pnv_pe->flags & PNV_IODA_PE_SLAVE) {
378-
pnv_pe = pnv_pe->master;
379-
WARN_ON(!pnv_pe ||
380-
!(pnv_pe->flags & PNV_IODA_PE_MASTER));
381-
pe_no = pnv_pe->pe_number;
382-
}
383-
384-
/* Find the PE according to PE# */
385-
memset(&edev, 0, sizeof(struct eeh_dev));
386-
edev.phb = hose;
387-
edev.pe_config_addr = pe_no;
388-
dev_pe = eeh_pe_get(&edev);
389-
if (!dev_pe)
390-
return -EEXIST;
391-
392-
/* Freeze the (compound) PE */
393-
*pe = dev_pe;
394-
if (!(dev_pe->state & EEH_PE_ISOLATED))
395-
phb->freeze_pe(phb, pe_no);
396-
397-
/*
398-
* At this point, we're sure the (compound) PE should
399-
* have been frozen. However, we still need poke until
400-
* hitting the frozen PE on top level.
401-
*/
402-
dev_pe = dev_pe->parent;
403-
while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) {
404-
int ret;
405-
int active_flags = (EEH_STATE_MMIO_ACTIVE |
406-
EEH_STATE_DMA_ACTIVE);
407-
408-
ret = eeh_ops->get_state(dev_pe, NULL);
409-
if (ret <= 0 || (ret & active_flags) == active_flags) {
410-
dev_pe = dev_pe->parent;
411-
continue;
412-
}
413-
414-
/* Frozen parent PE */
415-
*pe = dev_pe;
416-
if (!(dev_pe->state & EEH_PE_ISOLATED))
417-
phb->freeze_pe(phb, dev_pe->addr);
418-
419-
/* Next one */
420-
dev_pe = dev_pe->parent;
421-
}
422-
423-
return 0;
424-
}
425-
426-
/**
427-
* ioda_eeh_next_error - Retrieve next error for EEH core to handle
428-
* @pe: The affected PE
429-
*
430-
* The function is expected to be called by EEH core while it gets
431-
* special EEH event (without binding PE). The function calls to
432-
* OPAL APIs for next error to handle. The informational error is
433-
* handled internally by platform. However, the dead IOC, dead PHB,
434-
* fenced PHB and frozen PE should be handled by EEH core eventually.
435-
*/
436-
static int ioda_eeh_next_error(struct eeh_pe **pe)
437-
{
438-
struct pci_controller *hose;
439-
struct pnv_phb *phb;
440-
struct eeh_pe *phb_pe, *parent_pe;
441-
__be64 frozen_pe_no;
442-
__be16 err_type, severity;
443-
int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
444-
long rc;
445-
int state, ret = EEH_NEXT_ERR_NONE;
446-
447-
/*
448-
* While running here, it's safe to purge the event queue.
449-
* And we should keep the cached OPAL notifier event sychronized
450-
* between the kernel and firmware.
451-
*/
452-
eeh_remove_event(NULL, false);
453-
opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
454-
455-
list_for_each_entry(hose, &hose_list, list_node) {
456-
/*
457-
* If the subordinate PCI buses of the PHB has been
458-
* removed or is exactly under error recovery, we
459-
* needn't take care of it any more.
460-
*/
461-
phb = hose->private_data;
462-
phb_pe = eeh_phb_pe_get(hose);
463-
if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
464-
continue;
465-
466-
rc = opal_pci_next_error(phb->opal_id,
467-
&frozen_pe_no, &err_type, &severity);
468-
469-
/* If OPAL API returns error, we needn't proceed */
470-
if (rc != OPAL_SUCCESS) {
471-
pr_devel("%s: Invalid return value on "
472-
"PHB#%x (0x%lx) from opal_pci_next_error",
473-
__func__, hose->global_number, rc);
474-
continue;
475-
}
476-
477-
/* If the PHB doesn't have error, stop processing */
478-
if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
479-
be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
480-
pr_devel("%s: No error found on PHB#%x\n",
481-
__func__, hose->global_number);
482-
continue;
483-
}
484-
485-
/*
486-
* Processing the error. We're expecting the error with
487-
* highest priority reported upon multiple errors on the
488-
* specific PHB.
489-
*/
490-
pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
491-
__func__, be16_to_cpu(err_type), be16_to_cpu(severity),
492-
be64_to_cpu(frozen_pe_no), hose->global_number);
493-
switch (be16_to_cpu(err_type)) {
494-
case OPAL_EEH_IOC_ERROR:
495-
if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
496-
pr_err("EEH: dead IOC detected\n");
497-
ret = EEH_NEXT_ERR_DEAD_IOC;
498-
} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
499-
pr_info("EEH: IOC informative error "
500-
"detected\n");
501-
ioda_eeh_hub_diag(hose);
502-
ret = EEH_NEXT_ERR_NONE;
503-
}
504-
505-
break;
506-
case OPAL_EEH_PHB_ERROR:
507-
if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
508-
*pe = phb_pe;
509-
pr_err("EEH: dead PHB#%x detected, "
510-
"location: %s\n",
511-
hose->global_number,
512-
eeh_pe_loc_get(phb_pe));
513-
ret = EEH_NEXT_ERR_DEAD_PHB;
514-
} else if (be16_to_cpu(severity) ==
515-
OPAL_EEH_SEV_PHB_FENCED) {
516-
*pe = phb_pe;
517-
pr_err("EEH: Fenced PHB#%x detected, "
518-
"location: %s\n",
519-
hose->global_number,
520-
eeh_pe_loc_get(phb_pe));
521-
ret = EEH_NEXT_ERR_FENCED_PHB;
522-
} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
523-
pr_info("EEH: PHB#%x informative error "
524-
"detected, location: %s\n",
525-
hose->global_number,
526-
eeh_pe_loc_get(phb_pe));
527-
ioda_eeh_phb_diag(phb_pe);
528-
pnv_pci_dump_phb_diag_data(hose, phb_pe->data);
529-
ret = EEH_NEXT_ERR_NONE;
530-
}
531-
532-
break;
533-
case OPAL_EEH_PE_ERROR:
534-
/*
535-
* If we can't find the corresponding PE, we
536-
* just try to unfreeze.
537-
*/
538-
if (ioda_eeh_get_pe(hose,
539-
be64_to_cpu(frozen_pe_no), pe)) {
540-
/* Try best to clear it */
541-
pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
542-
hose->global_number, frozen_pe_no);
543-
pr_info("EEH: PHB location: %s\n",
544-
eeh_pe_loc_get(phb_pe));
545-
opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
546-
OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
547-
ret = EEH_NEXT_ERR_NONE;
548-
} else if ((*pe)->state & EEH_PE_ISOLATED ||
549-
eeh_pe_passed(*pe)) {
550-
ret = EEH_NEXT_ERR_NONE;
551-
} else {
552-
pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
553-
(*pe)->addr, (*pe)->phb->global_number);
554-
pr_err("EEH: PE location: %s, PHB location: %s\n",
555-
eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe));
556-
ret = EEH_NEXT_ERR_FROZEN_PE;
557-
}
558-
559-
break;
560-
default:
561-
pr_warn("%s: Unexpected error type %d\n",
562-
__func__, be16_to_cpu(err_type));
563-
}
564-
565-
/*
566-
* EEH core will try recover from fenced PHB or
567-
* frozen PE. In the time for frozen PE, EEH core
568-
* enable IO path for that before collecting logs,
569-
* but it ruins the site. So we have to dump the
570-
* log in advance here.
571-
*/
572-
if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
573-
ret == EEH_NEXT_ERR_FENCED_PHB) &&
574-
!((*pe)->state & EEH_PE_ISOLATED)) {
575-
eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
576-
ioda_eeh_phb_diag(*pe);
577-
578-
if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
579-
pnv_pci_dump_phb_diag_data((*pe)->phb,
580-
(*pe)->data);
581-
}
582-
583-
/*
584-
* We probably have the frozen parent PE out there and
585-
* we need have to handle frozen parent PE firstly.
586-
*/
587-
if (ret == EEH_NEXT_ERR_FROZEN_PE) {
588-
parent_pe = (*pe)->parent;
589-
while (parent_pe) {
590-
/* Hit the ceiling ? */
591-
if (parent_pe->type & EEH_PE_PHB)
592-
break;
593-
594-
/* Frozen parent PE ? */
595-
state = eeh_ops->get_state(parent_pe, NULL);
596-
if (state > 0 &&
597-
(state & active_flags) != active_flags)
598-
*pe = parent_pe;
599-
600-
/* Next parent level */
601-
parent_pe = parent_pe->parent;
602-
}
603-
604-
/* We possibly migrate to another PE */
605-
eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
606-
}
607-
608-
/*
609-
* If we have no errors on the specific PHB or only
610-
* informative error there, we continue poking it.
611-
* Otherwise, we need actions to be taken by upper
612-
* layer.
613-
*/
614-
if (ret > EEH_NEXT_ERR_INF)
615-
break;
616-
}
617-
618-
return ret;
619-
}
620-
621271
struct pnv_eeh_ops ioda_eeh_ops = {
622272
.reset = ioda_eeh_reset,
623-
.next_error = ioda_eeh_next_error
624273
};

0 commit comments

Comments
 (0)