aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorLijo Lazar <lijo.lazar@amd.com>2025-10-06 10:39:03 +0530
committerAlex Deucher <alexander.deucher@amd.com>2025-10-07 14:09:19 -0400
commit2e97663760e5fb7ee14f399c68e57b894f01e505 (patch)
treef01046bcb0d239e8e434c1d3b1fb43491175475e /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parentdrm/amdgpu: partially revert "revert to old status lock handling v3" (diff)
downloadlinux-2e97663760e5fb7ee14f399c68e57b894f01e505.tar.gz
linux-2e97663760e5fb7ee14f399c68e57b894f01e505.zip
drm/amdgpu: Report individual reset error
If reinitialization of one of the GPUs fails after reset, it logs failure on all subsequent GPUs eventhough they have resumed successfully. A sample log where only device at 0000:95:00.0 had a failure - amdgpu 0000:15:00.0: amdgpu: GPU reset(19) succeeded! amdgpu 0000:65:00.0: amdgpu: GPU reset(19) succeeded! amdgpu 0000:75:00.0: amdgpu: GPU reset(19) succeeded! amdgpu 0000:85:00.0: amdgpu: GPU reset(19) succeeded! amdgpu 0000:95:00.0: amdgpu: GPU reset(19) failed amdgpu 0000:e5:00.0: amdgpu: GPU reset(19) failed amdgpu 0000:f5:00.0: amdgpu: GPU reset(19) failed amdgpu 0000:05:00.0: amdgpu: GPU reset(19) failed amdgpu 0000:15:00.0: amdgpu: GPU reset end with ret = -5 To avoid confusion, report the error for each device separately and return the first error as the overall result. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c25
1 files changed, 15 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 929936c8d87c..7a899fb4de29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6389,23 +6389,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
- if (tmp_adev->asic_reset_res)
- r = tmp_adev->asic_reset_res;
-
- tmp_adev->asic_reset_res = 0;
-
- if (r) {
+ if (tmp_adev->asic_reset_res) {
/* bad news, how to tell it to userspace ?
* for ras error, we should report GPU bad status instead of
* reset failure
*/
if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
!amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
- atomic_read(&tmp_adev->gpu_reset_counter));
- amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+ dev_info(
+ tmp_adev->dev,
+ "GPU reset(%d) failed with error %d \n",
+ atomic_read(
+ &tmp_adev->gpu_reset_counter),
+ tmp_adev->asic_reset_res);
+ amdgpu_vf_error_put(tmp_adev,
+ AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
+ tmp_adev->asic_reset_res);
+ if (!r)
+ r = tmp_adev->asic_reset_res;
+ tmp_adev->asic_reset_res = 0;
} else {
- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
+ atomic_read(&tmp_adev->gpu_reset_counter));
if (amdgpu_acpi_smart_shift_update(tmp_adev,
AMDGPU_SS_DEV_D0))
dev_warn(tmp_adev->dev,