summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDanielle Ratson <danieller@nvidia.com>2021-12-20 12:56:14 +0200
committerDavid S. Miller <davem@davemloft.net>2021-12-20 11:32:21 +0000
commit239cdd3f4cb0fe87da95e9a64a41644ee5a744ec (patch)
tree3bb6ee75c85762ab8b2557d834abf3421e5c6acb
parente25c060c5f24b19d2596445c6d0cf40da6f13cc1 (diff)
mlxsw: core: Extend devlink health reporter with new events and parameters
Extend the devlink health reporter registered by mlxsw to report new health events and their related parameters. These are meant to aid in debugging of hardware / firmware issues. Beside the test event ('MLXSW_REG_MFDE_EVENT_ID_TEST') that is triggered following the devlink health 'test' sub-command, the new events are used to report the triggering of asserts in firmware code ('MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT') and hardware issues ('MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE'). Each event is accompanied with a severity parameter and per-event parameters that are meant to help root cause the detected issue. Signed-off-by: Danielle Ratson <danieller@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/core.c131
1 files changed, 131 insertions, 0 deletions
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index d9f12d9cd0ff..866b9357939b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1709,11 +1709,92 @@ static const struct mlxsw_listener mlxsw_core_health_listener =
MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE);
static int
+mlxsw_core_health_fw_fatal_dump_fatal_cause(const char *mfde_pl,
+ struct devlink_fmsg *fmsg)
+{
+ u32 val, tile_v;
+ int err;
+
+ val = mlxsw_reg_mfde_fatal_cause_id_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "cause_id", val);
+ if (err)
+ return err;
+ tile_v = mlxsw_reg_mfde_fatal_cause_tile_v_get(mfde_pl);
+ if (tile_v) {
+ val = mlxsw_reg_mfde_fatal_cause_tile_index_get(mfde_pl);
+ err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+mlxsw_core_health_fw_fatal_dump_fw_assert(const char *mfde_pl,
+ struct devlink_fmsg *fmsg)
+{
+ u32 val, tile_v;
+ int err;
+
+ val = mlxsw_reg_mfde_fw_assert_var0_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "var0", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_var1_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "var1", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_var2_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "var2", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_var3_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "var3", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_var4_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "var4", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_existptr_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "existptr", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_callra_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "callra", val);
+ if (err)
+ return err;
+ val = mlxsw_reg_mfde_fw_assert_oe_get(mfde_pl);
+ err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
+ if (err)
+ return err;
+ tile_v = mlxsw_reg_mfde_fw_assert_tile_v_get(mfde_pl);
+ if (tile_v) {
+ val = mlxsw_reg_mfde_fw_assert_tile_index_get(mfde_pl);
+ err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val);
+ if (err)
+ return err;
+ }
+ val = mlxsw_reg_mfde_fw_assert_ext_synd_get(mfde_pl);
+ err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", val);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int
mlxsw_core_health_fw_fatal_dump_kvd_im_stop(const char *mfde_pl,
struct devlink_fmsg *fmsg)
{
u32 val;
+ int err;
+ val = mlxsw_reg_mfde_kvd_im_stop_oe_get(mfde_pl);
+ err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
+ if (err)
+ return err;
val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl);
return devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val);
}
@@ -1729,6 +1810,10 @@ mlxsw_core_health_fw_fatal_dump_crspace_to(const char *mfde_pl,
err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val);
if (err)
return err;
+ val = mlxsw_reg_mfde_crspace_to_oe_get(mfde_pl);
+ err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
+ if (err)
+ return err;
val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl);
err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val);
if (err)
@@ -1774,6 +1859,15 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
val_str = "KVD insertion machine stopped";
break;
+ case MLXSW_REG_MFDE_EVENT_ID_TEST:
+ val_str = "Test";
+ break;
+ case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
+ val_str = "FW assert";
+ break;
+ case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
+ val_str = "Fatal cause";
+ break;
default:
val_str = NULL;
}
@@ -1782,6 +1876,38 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
if (err)
return err;
}
+
+ err = devlink_fmsg_arr_pair_nest_end(fmsg);
+ if (err)
+ return err;
+
+ err = devlink_fmsg_arr_pair_nest_start(fmsg, "severity");
+ if (err)
+ return err;
+
+ val = mlxsw_reg_mfde_severity_get(mfde_pl);
+ err = devlink_fmsg_u8_pair_put(fmsg, "id", val);
+ if (err)
+ return err;
+ switch (val) {
+ case MLXSW_REG_MFDE_SEVERITY_FATL:
+ val_str = "Fatal";
+ break;
+ case MLXSW_REG_MFDE_SEVERITY_NRML:
+ val_str = "Normal";
+ break;
+ case MLXSW_REG_MFDE_SEVERITY_INTR:
+ val_str = "Debug";
+ break;
+ default:
+ val_str = NULL;
+ }
+ if (val_str) {
+ err = devlink_fmsg_string_pair_put(fmsg, "desc", val_str);
+ if (err)
+ return err;
+ }
+
err = devlink_fmsg_arr_pair_nest_end(fmsg);
if (err)
return err;
@@ -1840,6 +1966,11 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
return mlxsw_core_health_fw_fatal_dump_kvd_im_stop(mfde_pl,
fmsg);
+ case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
+ return mlxsw_core_health_fw_fatal_dump_fw_assert(mfde_pl, fmsg);
+ case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
+ return mlxsw_core_health_fw_fatal_dump_fatal_cause(mfde_pl,
+ fmsg);
}
return 0;