summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorGuchun Chen <guchun.chen@amd.com>2020-07-23 16:20:02 +0800
committerAlex Deucher <alexander.deucher@amd.com>2020-08-04 17:26:54 -0400
commite8fbaf03429d085b05e66b153a8363b746c94b43 (patch)
treeaf9a73464c815142d4fae4d336d1c277296c87ef /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent9c06f91ff2349da67651cddf88507aa967b58b08 (diff)
drm/amdgpu: break GPU recovery once it's in bad state(v4)
When GPU executes recovery and retriving bad GPU tag from external eerpom device, the recovery will be broken and error message is printed as well for user's awareness. v2: Refine warning message in threshold reaching case, and fix spelling typo. v3: Fix explicit calling of bad gpu. v4: Rename function names. Signed-off-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c20
1 files changed, 17 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 905c5ab486a1..d01d7969d47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4125,8 +4125,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
amdgpu_fbdev_set_suspend(tmp_adev, 0);
- /* must succeed. */
- amdgpu_ras_resume(tmp_adev);
+ /*
+ * The GPU enters bad state once faulty pages
+ * by ECC has reached the threshold, and ras
+ * recovery is scheduled next. So add one check
+ * here to break recovery if it indeed exceeds
+ * bad page threshold, and remind user to
+ * retire this GPU or setting one bigger
+ * bad_page_threshold value to fix this once
+ * probing driver again.
+ */
+ if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+ /* must succeed. */
+ amdgpu_ras_resume(tmp_adev);
+ } else {
+ r = -EINVAL;
+ goto out;
+ }
/* Update PSP FW topology after reset */
if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4134,7 +4149,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
}
-
out:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);