Skip to content

Commit

Permalink
drm/amdkfd: Add sync to kfd_resume_all_processes
Browse files Browse the repository at this point in the history
Add a flag to kfd_resume_all_processes() to allow it to wait
for all the delayed work to complete before returning.

This is required for kgd_gfx_v9_set_barrier_auto_waitcnt().

In kgd_gfx_v9_set_barrier_auto_waitcnt(), when we call
amdgpu_amdkfd_resume(), we need to wait for the
delayed work to complete before we return.

This is to prevent any subsequent calls to amdgpu_amdkfd_suspend()
from cancelling any of the delayed work in amdgpu_amdkfd_resume().

This can happen in a multi-gpu system.  If we enable the debugger
on one node, it will do the amdgpu_amdkfd_suspend() and
amdgpu_amdkfd_resume().  The call to enable the debugger will return
to the caller before the delayed work is complete.  If the debugger
is then enabled on another node, the next call to
amdgpu_amdkfd_suspend() will cancel any outstanding delayed work
from amdgpu_amdkfd_resume().

To prevent this, we need to ensure that the delayed work in
amdgpu_amdkfd_resume() has completed before we return to the caller.

Signed-off-by: Philip Cox <[email protected]>
Reviewed-by: Jonathan Kim <[email protected]>
  • Loading branch information
mrphilcox authored and amd-aakash committed Feb 25, 2021
1 parent 32572fa commit 9b83380
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 10 deletions.
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,12 @@ void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
kgd2kfd_suspend(adev->kfd.dev, run_pm);
}

int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm, bool sync)
{
int r = 0;

if (adev->kfd.dev)
r = kgd2kfd_resume(adev->kfd.dev, run_pm);
r = kgd2kfd_resume(adev->kfd.dev, run_pm, sync);

return r;
}
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ int amdgpu_amdkfd_init(void);
void amdgpu_amdkfd_fini(void);

void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm, bool sync);
void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
const void *ih_ring_entry);
void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
Expand Down Expand Up @@ -289,7 +289,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
const struct kgd2kfd_shared_resources *gpu_resources);
void kgd2kfd_device_exit(struct kfd_dev *kfd);
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm, bool sync);
int kgd2kfd_pre_reset(struct kfd_dev *kfd);
int kgd2kfd_post_reset(struct kfd_dev *kfd);
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
Expand Down
21 changes: 20 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
Original file line number Diff line number Diff line change
Expand Up @@ -801,7 +801,26 @@ static void kgd_gfx_v9_set_barrier_auto_waitcnt(struct amdgpu_device *adev,

out:
kgd_gfx_v9_suspend_resume_compute_scheduler(adev, false);
amdgpu_amdkfd_resume(adev, false);

/* When we call amdgpu_amdkfd_resume(), we need to wait for the
* delayed work to complete before we return, so pass true
* for the 'sync' option.
*
* This is to prevent any subsequent calls to amdgpu_amdkfd_suspend()
* from cancelling any of the delayed work in amdgpu_amdkfd_resume().
*
* This can happen in a multi-gpu system. If we enable the debugger
* on one node, it will do the amdgpu_amdkfd_suspend() and
* amdgpu_amdkfd_resume(). The call to enable the debugger will return
* to the caller before the delayed work is complete. If the debugger
* is then enabled on another node, the next call to
* amdgpu_amdkfd_suspend() will cancel any outstanding delayed work
* from amdgpu_amdkfd_resume().
*
* To prevent this, we need to ensure that the delayed work in
* amdgpu_amdkfd_resume() has completed before we return to the caller.
*/
amdgpu_amdkfd_resume(adev, false, true);

up_read(&adev->reset_sem);
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -3836,7 +3836,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
}
}
}
r = amdgpu_amdkfd_resume(adev, !fbcon);
r = amdgpu_amdkfd_resume(adev, !fbcon, false);
if (r)
return r;

Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdkfd/kfd_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -926,7 +926,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
kfd_iommu_suspend(kfd);
}

int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm, bool sync)
{
int ret, count;

Expand All @@ -942,7 +942,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
if (count == 0)
ret = kfd_resume_all_processes();
ret = kfd_resume_all_processes(sync);
}

return ret;
Expand Down
13 changes: 12 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,18 @@ void kfd_unref_process(struct kfd_process *p);
int kfd_process_evict_queues(struct kfd_process *p);
int kfd_process_restore_queues(struct kfd_process *p);
void kfd_suspend_all_processes(void);
int kfd_resume_all_processes(void);
/*
* kfd_resume_all_processes:
* bool sync: Whether kfd_resume_all_processes() should wait for the
* delayed work to complete before returning.
* If there will be multiple calls to kfd_suspend_all_processes()
* and kfd_resume_all_processes(), we need to wait for the
* delayed sync work for kfd_resume_all_processes() to complete
* or else the subsequent call to kfd_suspend_all_processes()
* may cancel any outstanding delayed work. This can happen
* when the kfd debugger is started on a multi-gpu system.
*/
int kfd_resume_all_processes(bool sync);

int kfd_process_device_init_vm(struct kfd_process_device *pdd,
struct file *drm_file);
Expand Down
12 changes: 11 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_process.c
Original file line number Diff line number Diff line change
Expand Up @@ -1955,7 +1955,7 @@ void kfd_suspend_all_processes(void)
srcu_read_unlock(&kfd_processes_srcu, idx);
}

int kfd_resume_all_processes(void)
int kfd_resume_all_processes(bool sync)
{
struct kfd_process *p;
unsigned int temp;
Expand All @@ -1967,6 +1967,16 @@ int kfd_resume_all_processes(void)
p->pasid);
ret = -EFAULT;
}
/*
* When there are multiple calls to kfd_suspend_all_processes()
* and kfd_resume_all_processes(), we need to wait for the
* delayed sync work for kfd_resume_all_processes() to complete
* or else the subsequent call to kfd_suspend_all_processes()
* may cancel any outstanding delayed work. This can happen
* when the kfd debugger is started on a multi-gpu system.
*/
if (sync)
flush_delayed_work(&p->restore_work);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
return ret;
Expand Down

0 comments on commit 9b83380

Please sign in to comment.