From ddfde25f16ba31fb480d2e83b29631aaa56526cb Mon Sep 17 00:00:00 2001 From: Petri Latvala Date: Mon, 9 Sep 2019 14:38:07 +0300 Subject: runner: Add support for aborting on network failure If the network goes down while testing, CI tends to interpret that as the device being down, cutting its power after a while. This causes an incomplete to an innocent test, increasing noise in the results. A new flag to --abort-on-monitored-error, "ping", uses liboping to ping a host configured in .igtrc with one ping after each test execution and aborts the run if there is no reply in a hardcoded amount of time. v2: - Use a higher timeout - Allow hostname configuration from environment v3: - Use runner_c_args for holding c args for runner - Handle runner's meson options in runner/meson.build - Instead of one ping with 20 second timeout, ping with 1 second timeout for a duration of 20 seconds v4: - Rebase - Use now-exported igt_load_igtrc instead of copypaste code - Use define for timeout, clearer var name for single attempt timeout Signed-off-by: Petri Latvala Cc: Arkadiusz Hiler Cc: Martin Peres Cc: Tomi Sarvela Cc: Daniel Vetter Reviewed-by: Arkadiusz Hiler --- runner/executor.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++ runner/meson.build | 14 +++++- runner/settings.c | 4 ++ runner/settings.h | 5 +- 4 files changed, 152 insertions(+), 3 deletions(-) (limited to 'runner') diff --git a/runner/executor.c b/runner/executor.c index 4846684e..c1cfcce8 100644 --- a/runner/executor.c +++ b/runner/executor.c @@ -1,6 +1,10 @@ #include #include +#include #include +#if HAVE_OPING +#include +#endif #include #include #include @@ -18,6 +22,7 @@ #include #include +#include "igt_aux.h" #include "igt_core.h" #include "executor.h" #include "output_strings.h" @@ -147,6 +152,129 @@ static void ping_watchdogs(void) } } +#if HAVE_OPING +static pingobj_t *pingobj = NULL; + +static bool load_ping_config_from_file(void) +{ + GError *error = NULL; + GKeyFile *key_file = NULL; + const char *ping_hostname; + + /* Load igt config file */ + key_file = igt_load_igtrc(); + if (!key_file) + return false; + + ping_hostname = + g_key_file_get_string(key_file, "DUT", + "PingHostName", &error); + + g_clear_error(&error); + g_key_file_free(key_file); + + if (!ping_hostname) + return false; + + if (ping_host_add(pingobj, ping_hostname)) { + fprintf(stderr, + "abort on ping: Cannot use hostname from config file\n"); + return false; + } + + return true; +} + +static bool load_ping_config_from_env(void) +{ + const char *ping_hostname; + + ping_hostname = getenv("IGT_PING_HOSTNAME"); + if (!ping_hostname) + return false; + + if (ping_host_add(pingobj, ping_hostname)) { + fprintf(stderr, + "abort on ping: Cannot use hostname from environment\n"); + return false; + } + + return true; +} + +/* + * On some hosts, getting network back up after suspend takes + * upwards of 10 seconds. 20 seconds should be enough to see + * if network comes back at all, and hopefully not too long to + * make external monitoring freak out. + */ +#define PING_ABORT_DEADLINE 20 + +static bool can_ping(void) +{ + igt_until_timeout(PING_ABORT_DEADLINE) { + pingobj_iter_t *iter; + + ping_send(pingobj); + + for (iter = ping_iterator_get(pingobj); + iter != NULL; + iter = ping_iterator_next(iter)) { + double latency; + size_t len = sizeof(latency); + + ping_iterator_get_info(iter, + PING_INFO_LATENCY, + &latency, + &len); + if (latency >= 0.0) + return true; + } + } + + return false; +} + +#endif + +static void ping_config(void) +{ +#if HAVE_OPING + double single_attempt_timeout = 1.0; + + if (pingobj) + return; + + pingobj = ping_construct(); + + /* Try env first, then config file */ + if (!load_ping_config_from_env() && !load_ping_config_from_file()) { + fprintf(stderr, + "abort on ping: No host to ping configured\n"); + ping_destroy(pingobj); + pingobj = NULL; + return; + } + + ping_setopt(pingobj, PING_OPT_TIMEOUT, &single_attempt_timeout); +#endif +} + +static char *handle_ping(void) +{ +#if HAVE_OPING + if (pingobj && !can_ping()) { + char *reason; + + asprintf(&reason, + "Ping host did not respond to ping, network down"); + return reason; + } +#endif + + return NULL; +} + static char *handle_lockdep(void) { const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n"; @@ -236,6 +364,7 @@ static const struct { } abort_handlers[] = { { ABORT_LOCKDEP, handle_lockdep }, { ABORT_TAINT, handle_taint }, + { ABORT_PING, handle_ping }, { 0, 0 }, }; @@ -1361,6 +1490,9 @@ bool execute(struct execute_state *state, init_watchdogs(settings); + if (settings->abort_mask & ABORT_PING) + ping_config(); + if (!uname(&unamebuf)) { dprintf(unamefd, "%s %s %s %s %s\n", unamebuf.sysname, diff --git a/runner/meson.build b/runner/meson.build index 86521f94..6d8d3ab2 100644 --- a/runner/meson.build +++ b/runner/meson.build @@ -13,6 +13,14 @@ runner_test_sources = [ 'runner_tests.c' ] runner_json_test_sources = [ 'runner_json_tests.c' ] jsonc = dependency('json-c', required: build_runner) +runner_deps = [jsonc, glib] +runner_c_args = [] + +liboping = dependency('liboping', required: get_option('oping')) +if liboping.found() + runner_deps += liboping + runner_c_args += '-DHAVE_OPING=1' +endif if not build_tests and jsonc.found() error('Building test runner requires building tests') @@ -23,7 +31,8 @@ if jsonc.found() runnerlib = static_library('igt_runner', runnerlib_sources, include_directories : inc, - dependencies : [jsonc, glib]) + c_args : runner_c_args, + dependencies : runner_deps) runner = executable('igt_runner', runner_sources, link_with : runnerlib, @@ -61,6 +70,9 @@ if jsonc.found() test('runner_json', runner_json_test) build_info += 'Build test runner: true' + if liboping.found() + build_info += 'Build test runner with oping: true' + endif else build_info += 'Build test runner: false' endif diff --git a/runner/settings.c b/runner/settings.c index 8b39c063..d601cd11 100644 --- a/runner/settings.c +++ b/runner/settings.c @@ -51,6 +51,7 @@ static struct { } abort_conditions[] = { { ABORT_TAINT, "taint" }, { ABORT_LOCKDEP, "lockdep" }, + { ABORT_PING, "ping" }, { ABORT_ALL, "all" }, { 0, 0 }, }; @@ -140,6 +141,9 @@ static const char *usage_str = " Possible conditions:\n" " lockdep - abort when kernel lockdep has been angered.\n" " taint - abort when kernel becomes fatally tainted.\n" + " ping - abort when a host configured in .igtrc or\n" + " environment variable IGT_PING_HOSTNAME does\n" + " not respond to ping.\n" " all - abort for all of the above.\n" " -s, --sync Sync results to disk after every test\n" " -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n" diff --git a/runner/settings.h b/runner/settings.h index 6dcfa8c5..13409f04 100644 --- a/runner/settings.h +++ b/runner/settings.h @@ -15,9 +15,10 @@ enum { #define ABORT_TAINT (1 << 0) #define ABORT_LOCKDEP (1 << 1) -#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP) +#define ABORT_PING (1 << 2) +#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING) -_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd"); +_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd"); struct regex_list { char **regex_strings; -- cgit v1.2.3