summaryrefslogtreecommitdiff
path: root/drivers/gpu
diff options
context:
space:
mode:
authorCandice Li <candice.li@amd.com>2022-06-01 17:10:44 +0800
committerAlex Deucher <alexander.deucher@amd.com>2022-06-01 15:58:45 -0400
commit2a460963350ec6b1534d28d7f943b5f84815aff2 (patch)
tree0570476ab4b973a0ff453755f8354bc4ad9d451a /drivers/gpu
parent28caf8c467e2cb98b90a760b65767fa9aaf52ffb (diff)
drm/amdgpu: Resolve RAS GFX error count issue after cold boot on Arcturus
Adjust the sequence for ras late init and separate ras reset error status from query status. v2: squash in fix from Candice Signed-off-by: Candice Li <candice.li@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c27
2 files changed, 28 insertions, 8 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ede2fa56f6c9..16699158e00d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -594,17 +594,20 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value)
int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
int r;
- r = amdgpu_ras_block_late_init(adev, ras_block);
- if (r)
- return r;
if (amdgpu_ras_is_supported(adev, ras_block->block)) {
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
+ r = amdgpu_ras_block_late_init(adev, ras_block);
+ if (r)
+ return r;
+
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
if (r)
goto late_fini;
+ } else {
+ amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1b1b502897ef..dac202ae864d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
+ /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
+ if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+ dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
+ }
+
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
"ue", info.ue_count,
"ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+ if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
- DRM_WARN("Failed to reset error counter and error status");
+ dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
}
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
}
}
- if (!amdgpu_persistent_edc_harvesting_supported(adev))
- amdgpu_ras_reset_error_status(adev, info->head.block);
-
return 0;
}
@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
if (res)
return res;
+ if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ if (amdgpu_ras_reset_error_status(adev, info.head.block))
+ dev_warn(adev->dev, "Failed to reset error counter and error status");
+ }
+
ce += info.ce_count;
ue += info.ue_count;
}
@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
continue;
amdgpu_ras_query_error_status(adev, &info);
+
+ if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ if (amdgpu_ras_reset_error_status(adev, info.head.block))
+ dev_warn(adev->dev, "Failed to reset error counter and error status");
+ }
}
}