Skip to content

Commit 7cb5101

Browse files
committed
habanalabs: prevent host crash during suspend/resume
This patch fixes the implementation of suspend/resume of the device so that upon resume of the device, the host won't crash due to a PCI completion timeout. Upon suspend, the device is reset due to PERST. Therefore, upon resume, the driver must initialize the PCI controller as if the driver was just loaded. If the controller is not initialized and the driver tries to access the device through the PCI BARs, the host will crash with a PCI completion timeout error. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
1 parent cbaa99e commit 7cb5101

File tree

2 files changed

+43
-66
lines changed

2 files changed

+43
-66
lines changed

drivers/misc/habanalabs/device.c

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,27 @@ int hl_device_suspend(struct hl_device *hdev)
416416

417417
pci_save_state(hdev->pdev);
418418

419+
/* Block future CS/VM/JOB completion operations */
420+
rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
421+
if (rc) {
422+
dev_err(hdev->dev, "Can't suspend while in reset\n");
423+
return -EIO;
424+
}
425+
426+
/* This blocks all other stuff that is not blocked by in_reset */
427+
hdev->disabled = true;
428+
429+
/*
430+
* Flush anyone that is inside the critical section of enqueue
431+
* jobs to the H/W
432+
*/
433+
hdev->asic_funcs->hw_queues_lock(hdev);
434+
hdev->asic_funcs->hw_queues_unlock(hdev);
435+
436+
/* Flush processes that are sending message to CPU */
437+
mutex_lock(&hdev->send_cpu_message_lock);
438+
mutex_unlock(&hdev->send_cpu_message_lock);
439+
419440
rc = hdev->asic_funcs->suspend(hdev);
420441
if (rc)
421442
dev_err(hdev->dev,
@@ -443,21 +464,38 @@ int hl_device_resume(struct hl_device *hdev)
443464

444465
pci_set_power_state(hdev->pdev, PCI_D0);
445466
pci_restore_state(hdev->pdev);
446-
rc = pci_enable_device(hdev->pdev);
467+
rc = pci_enable_device_mem(hdev->pdev);
447468
if (rc) {
448469
dev_err(hdev->dev,
449470
"Failed to enable PCI device in resume\n");
450471
return rc;
451472
}
452473

474+
pci_set_master(hdev->pdev);
475+
453476
rc = hdev->asic_funcs->resume(hdev);
454477
if (rc) {
455-
dev_err(hdev->dev,
456-
"Failed to enable PCI access from device CPU\n");
457-
return rc;
478+
dev_err(hdev->dev, "Failed to resume device after suspend\n");
479+
goto disable_device;
480+
}
481+
482+
483+
hdev->disabled = false;
484+
atomic_set(&hdev->in_reset, 0);
485+
486+
rc = hl_device_reset(hdev, true, false);
487+
if (rc) {
488+
dev_err(hdev->dev, "Failed to reset device during resume\n");
489+
goto disable_device;
458490
}
459491

460492
return 0;
493+
494+
disable_device:
495+
pci_clear_master(hdev->pdev);
496+
pci_disable_device(hdev->pdev);
497+
498+
return rc;
461499
}
462500

463501
static void hl_device_hard_reset_pending(struct work_struct *work)

drivers/misc/habanalabs/goya/goya.c

Lines changed: 1 addition & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,15 +1201,6 @@ static int goya_stop_external_queues(struct hl_device *hdev)
12011201
return retval;
12021202
}
12031203

1204-
static void goya_resume_external_queues(struct hl_device *hdev)
1205-
{
1206-
WREG32(mmDMA_QM_0_GLBL_CFG1, 0);
1207-
WREG32(mmDMA_QM_1_GLBL_CFG1, 0);
1208-
WREG32(mmDMA_QM_2_GLBL_CFG1, 0);
1209-
WREG32(mmDMA_QM_3_GLBL_CFG1, 0);
1210-
WREG32(mmDMA_QM_4_GLBL_CFG1, 0);
1211-
}
1212-
12131204
/*
12141205
* goya_init_cpu_queues - Initialize PQ/CQ/EQ of CPU
12151206
*
@@ -2178,36 +2169,6 @@ static int goya_stop_internal_queues(struct hl_device *hdev)
21782169
return retval;
21792170
}
21802171

2181-
static void goya_resume_internal_queues(struct hl_device *hdev)
2182-
{
2183-
WREG32(mmMME_QM_GLBL_CFG1, 0);
2184-
WREG32(mmMME_CMDQ_GLBL_CFG1, 0);
2185-
2186-
WREG32(mmTPC0_QM_GLBL_CFG1, 0);
2187-
WREG32(mmTPC0_CMDQ_GLBL_CFG1, 0);
2188-
2189-
WREG32(mmTPC1_QM_GLBL_CFG1, 0);
2190-
WREG32(mmTPC1_CMDQ_GLBL_CFG1, 0);
2191-
2192-
WREG32(mmTPC2_QM_GLBL_CFG1, 0);
2193-
WREG32(mmTPC2_CMDQ_GLBL_CFG1, 0);
2194-
2195-
WREG32(mmTPC3_QM_GLBL_CFG1, 0);
2196-
WREG32(mmTPC3_CMDQ_GLBL_CFG1, 0);
2197-
2198-
WREG32(mmTPC4_QM_GLBL_CFG1, 0);
2199-
WREG32(mmTPC4_CMDQ_GLBL_CFG1, 0);
2200-
2201-
WREG32(mmTPC5_QM_GLBL_CFG1, 0);
2202-
WREG32(mmTPC5_CMDQ_GLBL_CFG1, 0);
2203-
2204-
WREG32(mmTPC6_QM_GLBL_CFG1, 0);
2205-
WREG32(mmTPC6_CMDQ_GLBL_CFG1, 0);
2206-
2207-
WREG32(mmTPC7_QM_GLBL_CFG1, 0);
2208-
WREG32(mmTPC7_CMDQ_GLBL_CFG1, 0);
2209-
}
2210-
22112172
static void goya_dma_stall(struct hl_device *hdev)
22122173
{
22132174
WREG32(mmDMA_QM_0_GLBL_CFG1, 1 << DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT);
@@ -2905,20 +2866,6 @@ int goya_suspend(struct hl_device *hdev)
29052866
{
29062867
int rc;
29072868

2908-
rc = goya_stop_internal_queues(hdev);
2909-
2910-
if (rc) {
2911-
dev_err(hdev->dev, "failed to stop internal queues\n");
2912-
return rc;
2913-
}
2914-
2915-
rc = goya_stop_external_queues(hdev);
2916-
2917-
if (rc) {
2918-
dev_err(hdev->dev, "failed to stop external queues\n");
2919-
return rc;
2920-
}
2921-
29222869
rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
29232870
if (rc)
29242871
dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
@@ -2928,15 +2875,7 @@ int goya_suspend(struct hl_device *hdev)
29282875

29292876
int goya_resume(struct hl_device *hdev)
29302877
{
2931-
int rc;
2932-
2933-
goya_resume_external_queues(hdev);
2934-
goya_resume_internal_queues(hdev);
2935-
2936-
rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
2937-
if (rc)
2938-
dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
2939-
return rc;
2878+
return goya_init_iatu(hdev);
29402879
}
29412880

29422881
static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,

0 commit comments

Comments
 (0)