diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 |
1 files changed, 34 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 04394c45aa03..cef94e2169fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -315,7 +315,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * default: ret = -EINVAL; break; - }; + } if (ret) return -EINVAL; @@ -686,6 +686,7 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; + int i; if (!obj) return -EINVAL; @@ -700,6 +701,13 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, if (adev->umc.funcs->query_ras_error_address) adev->umc.funcs->query_ras_error_address(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__SDMA: + if (adev->sdma.funcs->query_ras_error_count) { + for (i = 0; i < adev->sdma.num_instances; i++) + adev->sdma.funcs->query_ras_error_count(adev, i, + &err_data); + } + break; case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_count) adev->gfx.funcs->query_ras_error_count(adev, &err_data); @@ -734,6 +742,20 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, return 0; } +uint64_t get_xgmi_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr) +{ + uint32_t df_inst_id; + + if ((!adev->df.funcs) || + (!adev->df.funcs->get_df_inst_id) || + (!adev->df.funcs->get_dram_base_addr)) + return addr; + + df_inst_id = adev->df.funcs->get_df_inst_id(adev); + + return addr + adev->df.funcs->get_dram_base_addr(adev, df_inst_id); +} + /* wrapper of psp_ras_trigger_error */ int amdgpu_ras_error_inject(struct amdgpu_device *adev, struct ras_inject_if *info) @@ -751,6 +773,12 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, if (!obj) return -EINVAL; + /* Calculate XGMI relative offset */ + if (adev->gmc.xgmi.num_physical_nodes > 1) { + block_info.address = get_xgmi_relative_phy_addr(adev, + block_info.address); + } + switch (info->head.block) { case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->ras_error_inject) @@ -1311,6 +1339,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, data = con->eh_data; if (!data || data->count == 0) { *bps = NULL; + ret = -EINVAL; goto out; } @@ -1344,7 +1373,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_ras *ras = container_of(work, struct amdgpu_ras, recovery_work); - amdgpu_device_gpu_recover(ras->adev, 0); + if (amdgpu_device_should_recover_gpu(ras->adev)) + amdgpu_device_gpu_recover(ras->adev, 0); atomic_set(&ras->in_recovery, 0); } @@ -1870,7 +1900,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) * See feature_enable_on_boot */ amdgpu_ras_disable_all_features(adev, 1); - amdgpu_ras_reset_gpu(adev, 0); + amdgpu_ras_reset_gpu(adev); } } @@ -1933,6 +1963,6 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); - amdgpu_ras_reset_gpu(adev, false); + amdgpu_ras_reset_gpu(adev); } } |