Skip to content

Commit 1679ae8

Browse files
fxkamd authored and ogabbay committed
drm/amdkfd: Use ordered workqueue to restore processes
Restoring multiple processes concurrently can lead to live-locks where each process prevents the other from validating all its BOs.

v2: fix duplicate check of same variable

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
1 parent 810955b commit 1679ae8

File tree

3 files changed

+32
-6
lines changed

3 files changed

+32
-6
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_module.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,9 @@ static int __init kfd_module_init(void)
133133
if (err < 0)
134134
goto err_topology;
135135

136-
kfd_process_create_wq();
136+
err = kfd_process_create_wq();
137+
if (err < 0)
138+
goto err_create_wq;
137139

138140
kfd_debugfs_init();
139141

@@ -143,6 +145,8 @@ static int __init kfd_module_init(void)
143145

144146
return 0;
145147

148+
err_create_wq:
149+
kfd_topology_shutdown();
146150
err_topology:
147151
kfd_chardev_exit();
148152
err_ioctl:

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,7 @@ struct amdkfd_ioctl_desc {
674674
const char *name;
675675
};
676676

677-
void kfd_process_create_wq(void);
677+
int kfd_process_create_wq(void);
678678
void kfd_process_destroy_wq(void);
679679
struct kfd_process *kfd_create_process(struct file *filep);
680680
struct kfd_process *kfd_get_process(const struct task_struct *);

drivers/gpu/drm/amd/amdkfd/kfd_process.c

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,17 @@ static DEFINE_MUTEX(kfd_processes_mutex);
4848

4949
DEFINE_SRCU(kfd_processes_srcu);
5050

51+
/* For process termination handling */
5152
static struct workqueue_struct *kfd_process_wq;
5253

54+
/* Ordered, single-threaded workqueue for restoring evicted
55+
* processes. Restoring multiple processes concurrently under memory
56+
* pressure can lead to processes blocking each other from validating
57+
* their BOs and result in a live-lock situation where processes
58+
* remain evicted indefinitely.
59+
*/
60+
static struct workqueue_struct *kfd_restore_wq;
61+
5362
static struct kfd_process *find_process(const struct task_struct *thread);
5463
static void kfd_process_ref_release(struct kref *ref);
5564
static struct kfd_process *create_process(const struct task_struct *thread,
@@ -59,10 +68,19 @@ static void evict_process_worker(struct work_struct *work);
5968
static void restore_process_worker(struct work_struct *work);
6069

6170

62-
void kfd_process_create_wq(void)
71+
int kfd_process_create_wq(void)
6372
{
6473
if (!kfd_process_wq)
6574
kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
75+
if (!kfd_restore_wq)
76+
kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
77+
78+
if (!kfd_process_wq || !kfd_restore_wq) {
79+
kfd_process_destroy_wq();
80+
return -ENOMEM;
81+
}
82+
83+
return 0;
6684
}
6785

6886
void kfd_process_destroy_wq(void)
@@ -71,6 +89,10 @@ void kfd_process_destroy_wq(void)
7189
destroy_workqueue(kfd_process_wq);
7290
kfd_process_wq = NULL;
7391
}
92+
if (kfd_restore_wq) {
93+
destroy_workqueue(kfd_restore_wq);
94+
kfd_restore_wq = NULL;
95+
}
7496
}
7597

7698
static void kfd_process_free_gpuvm(struct kgd_mem *mem,
@@ -869,7 +891,7 @@ static void evict_process_worker(struct work_struct *work)
869891
dma_fence_signal(p->ef);
870892
dma_fence_put(p->ef);
871893
p->ef = NULL;
872-
schedule_delayed_work(&p->restore_work,
894+
queue_delayed_work(kfd_restore_wq, &p->restore_work,
873895
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
874896

875897
pr_debug("Finished evicting pasid %d\n", p->pasid);
@@ -918,7 +940,7 @@ static void restore_process_worker(struct work_struct *work)
918940
if (ret) {
919941
pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n",
920942
p->pasid, PROCESS_BACK_OFF_TIME_MS);
921-
ret = schedule_delayed_work(&p->restore_work,
943+
ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
922944
msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
923945
WARN(!ret, "reschedule restore work failed\n");
924946
return;
@@ -957,7 +979,7 @@ int kfd_resume_all_processes(void)
957979
int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
958980

959981
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
960-
if (!schedule_delayed_work(&p->restore_work, 0)) {
982+
if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
961983
pr_err("Restore process %d failed during resume\n",
962984
p->pasid);
963985
ret = -EFAULT;

0 commit comments

Comments
 (0)