From 4d041216c83dd9933c7c72b40511bb3585fa1724 Mon Sep 17 00:00:00 2001 From: Yuri Nudelman Date: Sun, 13 Jun 2021 09:22:20 +0300 Subject: debugfs: add skip_reset_on_timeout option To be able to debug long-running CS better, without changing the userspace code, we are adding a new option through the debugfs interface to skip the reset of the device in case of CS timeout. Signed-off-by: Yuri Nudelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 1 + drivers/misc/habanalabs/common/debugfs.c | 5 +++++ drivers/misc/habanalabs/common/habanalabs.h | 3 +++ 3 files changed, 9 insertions(+) (limited to 'drivers/misc/habanalabs') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 6d51f54030c1..adedb288d452 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP); cs->timeout_jiffies = timeout; cs->skip_reset_on_timeout = + hdev->skip_reset_on_timeout || !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT); cs->submission_time_jiffies = jiffies; INIT_LIST_HEAD(&cs->job_list); diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 8381155578a0..703d79fb6f3f 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry->root, &dev_entry->blob_desc); + debugfs_create_x8("skip_reset_on_timeout", + 0644, + dev_entry->root, + &hdev->skip_reset_on_timeout); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, 0444, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 
b4413c398142..09b89fdeba0b 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2191,6 +2191,8 @@ struct hl_mmu_funcs { * @supports_staged_submission: true if staged submissions are supported * @curr_reset_cause: saves an enumerated reset cause when a hard reset is * triggered, and cleared after it is shared with preboot. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. */ struct hl_device { struct pci_dev *pdev; @@ -2305,6 +2307,7 @@ struct hl_device { u8 device_fini_pending; u8 supports_staged_submission; u8 curr_reset_cause; + u8 skip_reset_on_timeout; /* Parameters for bring-up */ u64 nic_ports_mask; -- cgit v1.2.3