Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70
1 file changed, 53 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 035891ec59d5..dac202ae864d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 	if (amdgpu_ras_query_error_status(obj->adev, &info))
 		return -EINVAL;
 
+	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
+	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
+	}
+
 	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
 			"ue", info.ue_count,
 			"ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 	if (amdgpu_ras_query_error_status(obj->adev, &info))
 		return -EINVAL;
 
-	if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
-			DRM_WARN("Failed to reset error counter and error status");
+			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
 	}
 
 	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -726,7 +734,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	/* Do not enable if it is not allowed. */
 	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
 
-	if (!amdgpu_ras_intr_triggered()) {
+	/* Only enable ras feature operation handle on host side */
+	if (!amdgpu_sriov_vf(adev) &&
+		!amdgpu_ras_intr_triggered()) {
 		ret = psp_ras_enable_features(&adev->psp, info, enable);
 		if (ret) {
 			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
@@ -1025,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		}
 	}
 
-	if (!amdgpu_persistent_edc_harvesting_supported(adev))
-		amdgpu_ras_reset_error_status(adev, info->head.block);
-
 	return 0;
 }
 
@@ -1147,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		if (res)
 			return res;
 
+		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+			if (amdgpu_ras_reset_error_status(adev, info.head.block))
+				dev_warn(adev->dev, "Failed to reset error counter and error status");
+		}
+
 		ce += info.ce_count;
 		ue += info.ue_count;
 	}
@@ -1523,7 +1536,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
  */
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 {
-	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+	/* Fatal error events are handled on host side */
+	if (amdgpu_sriov_vf(adev) ||
+	    !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
 		return;
 
 	if (adev->nbio.ras &&
@@ -1788,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 			continue;
 
 		amdgpu_ras_query_error_status(adev, &info);
+
+		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+			if (amdgpu_ras_reset_error_status(adev, info.head.block))
+				dev_warn(adev->dev, "Failed to reset error counter and error status");
+		}
 	}
 }
 
@@ -2270,10 +2291,15 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 {
 	adev->ras_hw_enabled = adev->ras_enabled = 0;
 
-	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
+	if (!adev->is_atom_fw ||
 	    !amdgpu_ras_asic_supported(adev))
 		return;
 
+	/* If driver run on sriov guest side, only enable ras for aldebaran */
+	if (amdgpu_sriov_vf(adev) &&
+	    adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
+		return;
+
 	if (!adev->gmc.xgmi.connected_to_cpu) {
 		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
 			dev_info(adev->dev, "MEM ECC is active.\n");
@@ -2285,15 +2311,21 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 
 		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
 			dev_info(adev->dev, "SRAM ECC is active.\n");
-			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
-						    1 << AMDGPU_RAS_BLOCK__DF);
-
-			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
-				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
-						1 << AMDGPU_RAS_BLOCK__JPEG);
-			else
-				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
-						1 << AMDGPU_RAS_BLOCK__JPEG);
+			if (!amdgpu_sriov_vf(adev)) {
+				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+							1 << AMDGPU_RAS_BLOCK__DF);
+
+				if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
+					adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+							1 << AMDGPU_RAS_BLOCK__JPEG);
+				else
+					adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+							1 << AMDGPU_RAS_BLOCK__JPEG);
+			} else {
+				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
+							1 << AMDGPU_RAS_BLOCK__SDMA |
+							1 << AMDGPU_RAS_BLOCK__GFX);
+			}
 		} else {
 			dev_info(adev->dev, "SRAM ECC is not presented.\n");
 		}
@@ -2637,6 +2669,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
 	struct amdgpu_ras_block_object *obj;
 	int r;
 
+	/* Guest side doesn't need init ras feature */
+	if (amdgpu_sriov_vf(adev))
+		return 0;
+
 	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
 		if (!node->ras_obj) {
 			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
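
For orientation, the two checks this patch repeats can be read in isolation: skip the explicit error-counter reset on Vega20 (MP0 11.0.2) and Arcturus (MP0 11.0.4), which reset counters automatically after a query, and on an SR-IOV guest keep RAS only for Aldebaran (MP1 13.0.2). The sketch below is a minimal, hypothetical user-space illustration of that gating logic, not kernel code: the fake_adev struct, the helper names, the simplified IP_VERSION() stand-in, and the sample version values in main() are assumptions made for the example.

/*
 * Hypothetical, self-contained sketch of the gating checks added by this
 * patch. It models only the decision logic with stand-in types so it can be
 * compiled and run on its own; it is not the amdgpu implementation.
 */
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the driver's IP_VERSION(maj, min, rev) encoding. */
#define IP_VERSION(maj, min, rev) (((maj) << 16) | ((min) << 8) | (rev))

struct fake_adev {
	unsigned int mp0_version; /* MP0_HWIP discovery version */
	unsigned int mp1_version; /* MP1_HWIP discovery version */
	bool is_sriov_vf;         /* running as an SR-IOV guest */
};

/*
 * Vega20 (MP0 11.0.2) and Arcturus (MP0 11.0.4) reset their hardware error
 * counters automatically after a query, so an explicit reset is only issued
 * for other ASICs.
 */
static bool needs_explicit_counter_reset(const struct fake_adev *adev)
{
	return adev->mp0_version != IP_VERSION(11, 0, 2) &&
	       adev->mp0_version != IP_VERSION(11, 0, 4);
}

/* On an SR-IOV guest, RAS stays enabled only for Aldebaran (MP1 13.0.2). */
static bool ras_enabled_on_this_side(const struct fake_adev *adev)
{
	return !adev->is_sriov_vf ||
	       adev->mp1_version == IP_VERSION(13, 0, 2);
}

int main(void)
{
	/* Sample version values chosen for illustration only. */
	struct fake_adev guest  = { IP_VERSION(13, 0, 2), IP_VERSION(13, 0, 2), true };
	struct fake_adev vega20 = { IP_VERSION(11, 0, 2), IP_VERSION(11, 0, 2), false };

	printf("guest:  ras=%d explicit_reset=%d\n",
	       ras_enabled_on_this_side(&guest),
	       needs_explicit_counter_reset(&guest));
	printf("vega20: ras=%d explicit_reset=%d\n",
	       ras_enabled_on_this_side(&vega20),
	       needs_explicit_counter_reset(&vega20));
	return 0;
}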