Merge tag 'drm-next-5.5-2019-10-25' of git://people.freedesktop.org/~agd5f/linux into drm-next

drm-next-5.5-2019-10-25: amdgpu: - BACO support for CI and VI asics - Quick memory training support for navi - MSI-X support - RAS fixes - Display AVI infoframe fixes - Display ref clock fixes for renoir - Fix number of audio endpoints in renoir - Fix for discovery tables - Powerplay fixes - Documentation fixes - Misc cleanups radeon: - revert a PPC fix which broke x86 Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexdeucher@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191025221020.203546-1-alexander.deucher@amd.com
author: Dave Airlie <airlied@redhat.com> 2019-10-30 05:46:00 +1000
committer: Dave Airlie <airlied@redhat.com> 2019-10-30 05:46:09 +1000
commit: 60845e34f0c5c19a9e86af477b429993952f585b (patch)
tree: 6b2fed16145e80c3904ffae2d996efedb80db3a0 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent: 3275a71e76fac5bc276f0d60e027b18c2e8d7a5b (diff)
parent: 0e04ad7d1857670944786a8465930a049aaf995f (diff)
1 files changed, 54 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0e2ee5869b5f..6220394521e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+				uint64_t addr);
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 					size_t size, loff_t *pos)
 {
@@ -215,11 +218,12 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * value to the address.
  *
  * Second member: struct ras_debug_if::op.
- * It has three kinds of operations.
+ * It has four kinds of operations.
  *
  * - 0: disable RAS on the block. Take ::head as its data.
  * - 1: enable RAS on the block. Take ::head as its data.
  * - 2: inject errors on the block. Take ::inject as its data.
+ * - 3: reboot on unrecoverable error
  *
  * How to use the interface?
  * programs:
@@ -228,13 +232,13 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  *
  * .. code-block:: bash
  *
- *	echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl
+ *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
  *
  * op: disable, enable, inject
  *	disable: only block is needed
  *	enable: block and error are needed
  *	inject: error, address, value are needed
- * block: umc, smda, gfx, .........
+ * block: umc, sdma, gfx, .........
  *	see ras_block_string[] for details
  * error: ue, ce
  *	ue: multi_uncorrectable
@@ -290,6 +294,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 			break;
 		}
 
+		/* umc ce/ue error injection for a bad page is not allowed */
+		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
+		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
+			DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
+					data.inject.address);
+			break;
+		}
+
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
@@ -1430,6 +1442,39 @@ out:
 	return ret;
 }
 
+/*
+ * check if an address belongs to bad page
+ *
+ * Note: this check is only for umc block
+ */
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+				uint64_t addr)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	int i;
+	bool ret = false;
+
+	if (!con || !con->eh_data)
+		return ret;
+
+	mutex_lock(&con->recovery_lock);
+	data = con->eh_data;
+	if (!data)
+		goto out;
+
+	addr >>= AMDGPU_GPU_PAGE_SHIFT;
+	for (i = 0; i < data->count; i++)
+		if (addr == data->bps[i].retired_page) {
+			ret = true;
+			goto out;
+		}
+
+out:
+	mutex_unlock(&con->recovery_lock);
+	return ret;
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1843,6 +1888,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
+	uint32_t hw_supported, supported;
+
+	amdgpu_ras_check_supported(adev, &hw_supported, &supported);
+	if (!hw_supported)
+		return;
+
 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
 		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
author	Dave Airlie <airlied@redhat.com>	2019-10-30 05:46:00 +1000
committer	Dave Airlie <airlied@redhat.com>	2019-10-30 05:46:09 +1000
commit	60845e34f0c5c19a9e86af477b429993952f585b (patch)
tree	6b2fed16145e80c3904ffae2d996efedb80db3a0 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent	3275a71e76fac5bc276f0d60e027b18c2e8d7a5b (diff)
parent	0e04ad7d1857670944786a8465930a049aaf995f (diff)