|
24 | 24 | #include <asm/io.h>
|
25 | 25 |
|
26 | 26 | #include "cxl.h"
|
| 27 | +#include <misc/cxl.h> |
27 | 28 |
|
28 | 29 |
|
29 | 30 | #define CXL_PCI_VSEC_ID 0x1280
|
@@ -1252,10 +1253,262 @@ static void cxl_remove(struct pci_dev *dev)
|
1252 | 1253 | cxl_remove_adapter(adapter);
|
1253 | 1254 | }
|
1254 | 1255 |
|
| 1256 | +static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, |
| 1257 | + pci_channel_state_t state) |
| 1258 | +{ |
| 1259 | + struct pci_dev *afu_dev; |
| 1260 | + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; |
| 1261 | + pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET; |
| 1262 | + |
| 1263 | + /* There should only be one entry, but go through the list |
| 1264 | + * anyway |
| 1265 | + */ |
| 1266 | + list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { |
| 1267 | + if (!afu_dev->driver) |
| 1268 | + continue; |
| 1269 | + |
| 1270 | + afu_dev->error_state = state; |
| 1271 | + |
| 1272 | + if (afu_dev->driver->err_handler) |
| 1273 | + afu_result = afu_dev->driver->err_handler->error_detected(afu_dev, |
| 1274 | + state); |
| 1275 | + /* Disconnect trumps all, NONE trumps NEED_RESET */ |
| 1276 | + if (afu_result == PCI_ERS_RESULT_DISCONNECT) |
| 1277 | + result = PCI_ERS_RESULT_DISCONNECT; |
| 1278 | + else if ((afu_result == PCI_ERS_RESULT_NONE) && |
| 1279 | + (result == PCI_ERS_RESULT_NEED_RESET)) |
| 1280 | + result = PCI_ERS_RESULT_NONE; |
| 1281 | + } |
| 1282 | + return result; |
| 1283 | +} |
| 1284 | + |
| 1285 | +static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev, |
| 1286 | + pci_channel_state_t state) |
| 1287 | +{ |
| 1288 | + struct cxl *adapter = pci_get_drvdata(pdev); |
| 1289 | + struct cxl_afu *afu; |
| 1290 | + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; |
| 1291 | + int i; |
| 1292 | + |
| 1293 | + /* At this point, we could still have an interrupt pending. |
| 1294 | + * Let's try to get them out of the way before they do |
| 1295 | + * anything we don't like. |
| 1296 | + */ |
| 1297 | + schedule(); |
| 1298 | + |
| 1299 | + /* If we're permanently dead, give up. */ |
| 1300 | + if (state == pci_channel_io_perm_failure) { |
| 1301 | + /* Tell the AFU drivers; but we don't care what they |
| 1302 | + * say, we're going away. |
| 1303 | + */ |
| 1304 | + for (i = 0; i < adapter->slices; i++) { |
| 1305 | + afu = adapter->afu[i]; |
| 1306 | + cxl_vphb_error_detected(afu, state); |
| 1307 | + } |
| 1308 | + return PCI_ERS_RESULT_DISCONNECT; |
| 1309 | + } |
| 1310 | + |
| 1311 | + /* Are we reflashing? |
| 1312 | + * |
| 1313 | + * If we reflash, we could come back as something entirely |
| 1314 | + * different, including a non-CAPI card. As such, by default |
| 1315 | + * we don't participate in the process. We'll be unbound and |
| 1316 | + * the slot re-probed. (TODO: check EEH doesn't blindly rebind |
| 1317 | + * us!) |
| 1318 | + * |
| 1319 | + * However, this isn't the entire story: for reliablity |
| 1320 | + * reasons, we usually want to reflash the FPGA on PERST in |
| 1321 | + * order to get back to a more reliable known-good state. |
| 1322 | + * |
| 1323 | + * This causes us a bit of a problem: if we reflash we can't |
| 1324 | + * trust that we'll come back the same - we could have a new |
| 1325 | + * image and been PERSTed in order to load that |
| 1326 | + * image. However, most of the time we actually *will* come |
| 1327 | + * back the same - for example a regular EEH event. |
| 1328 | + * |
| 1329 | + * Therefore, we allow the user to assert that the image is |
| 1330 | + * indeed the same and that we should continue on into EEH |
| 1331 | + * anyway. |
| 1332 | + */ |
| 1333 | + if (adapter->perst_loads_image && !adapter->perst_same_image) { |
| 1334 | + /* TODO take the PHB out of CXL mode */ |
| 1335 | + dev_info(&pdev->dev, "reflashing, so opting out of EEH!\n"); |
| 1336 | + return PCI_ERS_RESULT_NONE; |
| 1337 | + } |
| 1338 | + |
| 1339 | + /* |
| 1340 | + * At this point, we want to try to recover. We'll always |
| 1341 | + * need a complete slot reset: we don't trust any other reset. |
| 1342 | + * |
| 1343 | + * Now, we go through each AFU: |
| 1344 | + * - We send the driver, if bound, an error_detected callback. |
| 1345 | + * We expect it to clean up, but it can also tell us to give |
| 1346 | + * up and permanently detach the card. To simplify things, if |
| 1347 | + * any bound AFU driver doesn't support EEH, we give up on EEH. |
| 1348 | + * |
| 1349 | + * - We detach all contexts associated with the AFU. This |
| 1350 | + * does not free them, but puts them into a CLOSED state |
| 1351 | + * which causes any the associated files to return useful |
| 1352 | + * errors to userland. It also unmaps, but does not free, |
| 1353 | + * any IRQs. |
| 1354 | + * |
| 1355 | + * - We clean up our side: releasing and unmapping resources we hold |
| 1356 | + * so we can wire them up again when the hardware comes back up. |
| 1357 | + * |
| 1358 | + * Driver authors should note: |
| 1359 | + * |
| 1360 | + * - Any contexts you create in your kernel driver (except |
| 1361 | + * those associated with anonymous file descriptors) are |
| 1362 | + * your responsibility to free and recreate. Likewise with |
| 1363 | + * any attached resources. |
| 1364 | + * |
| 1365 | + * - We will take responsibility for re-initialising the |
| 1366 | + * device context (the one set up for you in |
| 1367 | + * cxl_pci_enable_device_hook and accessed through |
| 1368 | + * cxl_get_context). If you've attached IRQs or other |
| 1369 | + * resources to it, they remains yours to free. |
| 1370 | + * |
| 1371 | + * You can call the same functions to release resources as you |
| 1372 | + * normally would: we make sure that these functions continue |
| 1373 | + * to work when the hardware is down. |
| 1374 | + * |
| 1375 | + * Two examples: |
| 1376 | + * |
| 1377 | + * 1) If you normally free all your resources at the end of |
| 1378 | + * each request, or if you use anonymous FDs, your |
| 1379 | + * error_detected callback can simply set a flag to tell |
| 1380 | + * your driver not to start any new calls. You can then |
| 1381 | + * clear the flag in the resume callback. |
| 1382 | + * |
| 1383 | + * 2) If you normally allocate your resources on startup: |
| 1384 | + * * Set a flag in error_detected as above. |
| 1385 | + * * Let CXL detach your contexts. |
| 1386 | + * * In slot_reset, free the old resources and allocate new ones. |
| 1387 | + * * In resume, clear the flag to allow things to start. |
| 1388 | + */ |
| 1389 | + for (i = 0; i < adapter->slices; i++) { |
| 1390 | + afu = adapter->afu[i]; |
| 1391 | + |
| 1392 | + result = cxl_vphb_error_detected(afu, state); |
| 1393 | + |
| 1394 | + /* Only continue if everyone agrees on NEED_RESET */ |
| 1395 | + if (result != PCI_ERS_RESULT_NEED_RESET) |
| 1396 | + return result; |
| 1397 | + |
| 1398 | + cxl_context_detach_all(afu); |
| 1399 | + cxl_afu_deactivate_mode(afu); |
| 1400 | + cxl_deconfigure_afu(afu); |
| 1401 | + } |
| 1402 | + cxl_deconfigure_adapter(adapter); |
| 1403 | + |
| 1404 | + return result; |
| 1405 | +} |
| 1406 | + |
| 1407 | +static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) |
| 1408 | +{ |
| 1409 | + struct cxl *adapter = pci_get_drvdata(pdev); |
| 1410 | + struct cxl_afu *afu; |
| 1411 | + struct cxl_context *ctx; |
| 1412 | + struct pci_dev *afu_dev; |
| 1413 | + pci_ers_result_t afu_result = PCI_ERS_RESULT_RECOVERED; |
| 1414 | + pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED; |
| 1415 | + int i; |
| 1416 | + |
| 1417 | + if (cxl_configure_adapter(adapter, pdev)) |
| 1418 | + goto err; |
| 1419 | + |
| 1420 | + for (i = 0; i < adapter->slices; i++) { |
| 1421 | + afu = adapter->afu[i]; |
| 1422 | + |
| 1423 | + if (cxl_configure_afu(afu, adapter, pdev)) |
| 1424 | + goto err; |
| 1425 | + |
| 1426 | + if (cxl_afu_select_best_mode(afu)) |
| 1427 | + goto err; |
| 1428 | + |
| 1429 | + cxl_pci_vphb_reconfigure(afu); |
| 1430 | + |
| 1431 | + list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { |
| 1432 | + /* Reset the device context. |
| 1433 | + * TODO: make this less disruptive |
| 1434 | + */ |
| 1435 | + ctx = cxl_get_context(afu_dev); |
| 1436 | + |
| 1437 | + if (ctx && cxl_release_context(ctx)) |
| 1438 | + goto err; |
| 1439 | + |
| 1440 | + ctx = cxl_dev_context_init(afu_dev); |
| 1441 | + if (!ctx) |
| 1442 | + goto err; |
| 1443 | + |
| 1444 | + afu_dev->dev.archdata.cxl_ctx = ctx; |
| 1445 | + |
| 1446 | + if (cxl_afu_check_and_enable(afu)) |
| 1447 | + goto err; |
| 1448 | + |
| 1449 | + afu_dev->error_state = pci_channel_io_normal; |
| 1450 | + |
| 1451 | + /* If there's a driver attached, allow it to |
| 1452 | + * chime in on recovery. Drivers should check |
| 1453 | + * if everything has come back OK, but |
| 1454 | + * shouldn't start new work until we call |
| 1455 | + * their resume function. |
| 1456 | + */ |
| 1457 | + if (!afu_dev->driver) |
| 1458 | + continue; |
| 1459 | + |
| 1460 | + if (afu_dev->driver->err_handler && |
| 1461 | + afu_dev->driver->err_handler->slot_reset) |
| 1462 | + afu_result = afu_dev->driver->err_handler->slot_reset(afu_dev); |
| 1463 | + |
| 1464 | + if (afu_result == PCI_ERS_RESULT_DISCONNECT) |
| 1465 | + result = PCI_ERS_RESULT_DISCONNECT; |
| 1466 | + } |
| 1467 | + } |
| 1468 | + return result; |
| 1469 | + |
| 1470 | +err: |
| 1471 | + /* All the bits that happen in both error_detected and cxl_remove |
| 1472 | + * should be idempotent, so we don't need to worry about leaving a mix |
| 1473 | + * of unconfigured and reconfigured resources. |
| 1474 | + */ |
| 1475 | + dev_err(&pdev->dev, "EEH recovery failed. Asking to be disconnected.\n"); |
| 1476 | + return PCI_ERS_RESULT_DISCONNECT; |
| 1477 | +} |
| 1478 | + |
| 1479 | +static void cxl_pci_resume(struct pci_dev *pdev) |
| 1480 | +{ |
| 1481 | + struct cxl *adapter = pci_get_drvdata(pdev); |
| 1482 | + struct cxl_afu *afu; |
| 1483 | + struct pci_dev *afu_dev; |
| 1484 | + int i; |
| 1485 | + |
| 1486 | + /* Everything is back now. Drivers should restart work now. |
| 1487 | + * This is not the place to be checking if everything came back up |
| 1488 | + * properly, because there's no return value: do that in slot_reset. |
| 1489 | + */ |
| 1490 | + for (i = 0; i < adapter->slices; i++) { |
| 1491 | + afu = adapter->afu[i]; |
| 1492 | + |
| 1493 | + list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { |
| 1494 | + if (afu_dev->driver && afu_dev->driver->err_handler && |
| 1495 | + afu_dev->driver->err_handler->resume) |
| 1496 | + afu_dev->driver->err_handler->resume(afu_dev); |
| 1497 | + } |
| 1498 | + } |
| 1499 | +} |
| 1500 | + |
| 1501 | +static const struct pci_error_handlers cxl_err_handler = { |
| 1502 | + .error_detected = cxl_pci_error_detected, |
| 1503 | + .slot_reset = cxl_pci_slot_reset, |
| 1504 | + .resume = cxl_pci_resume, |
| 1505 | +}; |
| 1506 | + |
1255 | 1507 | struct pci_driver cxl_pci_driver = {
|
1256 | 1508 | .name = "cxl-pci",
|
1257 | 1509 | .id_table = cxl_pci_tbl,
|
1258 | 1510 | .probe = cxl_probe,
|
1259 | 1511 | .remove = cxl_remove,
|
1260 | 1512 | .shutdown = cxl_remove,
|
| 1513 | + .err_handler = &cxl_err_handler, |
1261 | 1514 | };
|
0 commit comments