@@ -48,8 +48,17 @@ static DEFINE_MUTEX(kfd_processes_mutex);
48
48
49
49
DEFINE_SRCU (kfd_processes_srcu );
50
50
51
+ /* For process termination handling */
51
52
static struct workqueue_struct * kfd_process_wq ;
52
53
54
+ /* Ordered, single-threaded workqueue for restoring evicted
55
+ * processes. Restoring multiple processes concurrently under memory
56
+ * pressure can lead to processes blocking each other from validating
57
+ * their BOs and result in a live-lock situation where processes
58
+ * remain evicted indefinitely.
59
+ */
60
+ static struct workqueue_struct * kfd_restore_wq ;
61
+
53
62
static struct kfd_process * find_process (const struct task_struct * thread );
54
63
static void kfd_process_ref_release (struct kref * ref );
55
64
static struct kfd_process * create_process (const struct task_struct * thread ,
@@ -59,10 +68,19 @@ static void evict_process_worker(struct work_struct *work);
59
68
static void restore_process_worker (struct work_struct * work );
60
69
61
70
62
- void kfd_process_create_wq (void )
71
+ int kfd_process_create_wq (void )
63
72
{
64
73
if (!kfd_process_wq )
65
74
kfd_process_wq = alloc_workqueue ("kfd_process_wq" , 0 , 0 );
75
+ if (!kfd_restore_wq )
76
+ kfd_restore_wq = alloc_ordered_workqueue ("kfd_restore_wq" , 0 );
77
+
78
+ if (!kfd_process_wq || !kfd_restore_wq ) {
79
+ kfd_process_destroy_wq ();
80
+ return - ENOMEM ;
81
+ }
82
+
83
+ return 0 ;
66
84
}
67
85
68
86
void kfd_process_destroy_wq (void )
@@ -71,6 +89,10 @@ void kfd_process_destroy_wq(void)
71
89
destroy_workqueue (kfd_process_wq );
72
90
kfd_process_wq = NULL ;
73
91
}
92
+ if (kfd_restore_wq ) {
93
+ destroy_workqueue (kfd_restore_wq );
94
+ kfd_restore_wq = NULL ;
95
+ }
74
96
}
75
97
76
98
static void kfd_process_free_gpuvm (struct kgd_mem * mem ,
@@ -869,7 +891,7 @@ static void evict_process_worker(struct work_struct *work)
869
891
dma_fence_signal (p -> ef );
870
892
dma_fence_put (p -> ef );
871
893
p -> ef = NULL ;
872
- schedule_delayed_work ( & p -> restore_work ,
894
+ queue_delayed_work ( kfd_restore_wq , & p -> restore_work ,
873
895
msecs_to_jiffies (PROCESS_RESTORE_TIME_MS ));
874
896
875
897
pr_debug ("Finished evicting pasid %d\n" , p -> pasid );
@@ -918,7 +940,7 @@ static void restore_process_worker(struct work_struct *work)
918
940
if (ret ) {
919
941
pr_debug ("Failed to restore BOs of pasid %d, retry after %d ms\n" ,
920
942
p -> pasid , PROCESS_BACK_OFF_TIME_MS );
921
- ret = schedule_delayed_work ( & p -> restore_work ,
943
+ ret = queue_delayed_work ( kfd_restore_wq , & p -> restore_work ,
922
944
msecs_to_jiffies (PROCESS_BACK_OFF_TIME_MS ));
923
945
WARN (!ret , "reschedule restore work failed\n" );
924
946
return ;
@@ -957,7 +979,7 @@ int kfd_resume_all_processes(void)
957
979
int ret = 0 , idx = srcu_read_lock (& kfd_processes_srcu );
958
980
959
981
hash_for_each_rcu (kfd_processes_table , temp , p , kfd_processes ) {
960
- if (!schedule_delayed_work ( & p -> restore_work , 0 )) {
982
+ if (!queue_delayed_work ( kfd_restore_wq , & p -> restore_work , 0 )) {
961
983
pr_err ("Restore process %d failed during resume\n" ,
962
984
p -> pasid );
963
985
ret = - EFAULT ;
0 commit comments