runner: Refactor timeouting

Instead of aiming for inactivity_timeout and splitting that into suitable intervals for watchdog pinging, replace the whole logic with one-second select() timeouts and checking if we're reaching a timeout condition based on current time and the time passed since a particular event, be it the last activity or the time of signaling the child processes. With the refactoring, we gain a couple of new features for free: - use-watchdog now makes sense even without inactivity-timeout. Previously use-watchdog was silently ignored if inactivity-timeout was not set. Now, watchdogs will be used always if configured so, effectively ensuring the device gets rebooted if userspace dies without other timeout tracking. - Killing tests early on kernel taint now happens even earlier. Previously on an inactive system we possibly waited for some tens of seconds before checking kernel taints. Signed-off-by: Petri Latvala <petri.latvala@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
author: Petri Latvala <petri.latvala@intel.com> 2020-02-17 16:50:41 +0200
committer: Petri Latvala <petri.latvala@intel.com> 2020-02-19 12:34:36 +0200
commit: 4dce5cf1f39192656e113260e062e7f6782a4b46 (patch)
tree: c76b4fd7cd476a301ce783e071e9a276c375bd87 /runner
parent: 5c5d5952a04117074db660ad25d8a6010cbf6116 (diff)
1 files changed, 113 insertions, 111 deletions
diff --git a/runner/executor.c b/runner/executor.c
index 3ea5d167..33610c9e 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -93,7 +93,7 @@ static void init_watchdogs(struct settings *settings)
 
 	memset(&watchdogs, 0, sizeof(watchdogs));
 
-	if (!settings->use_watchdog || settings->inactivity_timeout <= 0)
+	if (!settings->use_watchdog)
 		return;
 
 	if (settings->log_level >= LOG_LEVEL_VERBOSE) {
@@ -672,6 +672,69 @@ static void show_kernel_task_state(void)
 	sysrq('t');
 }
 
+static const char *need_to_timeout(struct settings *settings,
+				   int killed,
+				   unsigned long taints,
+				   double time_since_activity,
+				   double time_since_kill)
+{
+	if (killed) {
+		/*
+		 * Timeout after being killed is a hardcoded amount
+		 * depending on which signal we already used. The
+		 * exception is SIGKILL which just immediately bails
+		 * out if the kernel is tainted, because there's
+		 * little to no hope of the process dying gracefully
+		 * or at all.
+		 *
+		 * Note that if killed == SIGKILL, the caller needs
+		 * special handling anyway and should ignore the
+		 * actual string returned.
+		 */
+		const double kill_timeout = killed == SIGKILL ? 20.0 : 120.0;
+
+		if ((killed == SIGKILL && is_tainted(taints)) ||
+		    time_since_kill > kill_timeout)
+			return "Timeout. Killing the current test with SIGKILL.\n";
+
+		/*
+		 * We don't care for the other reasons to timeout if
+		 * we're already killing the test.
+		 */
+		return NULL;
+	}
+
+	/*
+	 * If we're configured to care about taints, kill the
+	 * test if there's a taint.
+	 */
+	if (settings->abort_mask & ABORT_TAINT &&
+	    is_tainted(taints))
+		return "Killing the test because the kernel is tainted.\n";
+
+	if (settings->inactivity_timeout != 0 &&
+	    time_since_activity > settings->inactivity_timeout) {
+		show_kernel_task_state();
+		return "Timeout. Killing the current test with SIGQUIT.\n";
+	}
+
+	return NULL;
+}
+
+static int next_kill_signal(int killed)
+{
+	switch (killed) {
+	case 0:
+		return SIGQUIT;
+	case SIGQUIT:
+		return SIGKILL;
+	case SIGKILL:
+	default:
+		assert(!"Unreachable");
+		return SIGKILL;
+	}
+}
+
 /*
  * Returns:
  *  =0 - Success
@@ -693,18 +756,15 @@ static int monitor_output(pid_t child,
 	ssize_t s;
 	int n, status;
 	int nfds = outfd;
-	int timeout = settings->inactivity_timeout;
-	int timeout_intervals = 1, intervals_left;
-	int wd_extra = 10;
+	const int interval_length = 1;
+	int wd_timeout;
 	int killed = 0; /* 0 if not killed, signal number otherwise */
-	int sigkill_timeout = 120;
-	int sigkill_interval = 20;
-	int sigkill_intervals_left = sigkill_timeout / sigkill_interval;
-	struct timespec time_beg, time_end;
+	struct timespec time_beg, time_now, time_last_activity, time_killed;
 	unsigned long taints = 0;
 	bool aborting = false;
 
 	igt_gettime(&time_beg);
+	time_last_activity = time_killed = time_beg;
 
 	if (errfd > nfds)
 		nfds = errfd;
@@ -714,32 +774,32 @@ static int monitor_output(pid_t child,
 		nfds = sigfd;
 	nfds++;
 
-	if (timeout > 0) {
+	/*
+	 * If we're still alive, we want to kill the test process
+	 * instead of cutting power. Use a healthy 2 minute watchdog
+	 * timeout that gets automatically reduced if the device
+	 * doesn't support it.
+	 *
+	 * watchdogs_set_timeout() is a no-op and returns the given
+	 * timeout if we don't have use_watchdog set in settings.
+	 */
+	wd_timeout = watchdogs_set_timeout(120);
+
+	if (wd_timeout < 120) {
 		/*
-		 * Use original timeout plus some leeway. If we're still
-		 * alive, we want to kill the test process instead of cutting
-		 * power.
+		 * Watchdog timeout smaller, warn the user. With the
+		 * short select() timeout we're using we're able to
+		 * ping the watchdog regardless.
 		 */
-		int wd_timeout = watchdogs_set_timeout(timeout + wd_extra);
-
-		if (wd_timeout < timeout + wd_extra) {
-			/* Watchdog timeout smaller, so ping it more often */
-			if (wd_timeout - wd_extra < 0)
-				wd_extra = wd_timeout / 2;
-			timeout_intervals = timeout / (wd_timeout - wd_extra);
-			timeout /= timeout_intervals;
-
-			if (settings->log_level >= LOG_LEVEL_VERBOSE) {
-				outf("Watchdog doesn't support the timeout we requested (shortened to %d seconds). Using %d intervals of %d seconds.\n",
-				     wd_timeout, timeout_intervals, timeout);
-			}
+		if (settings->log_level >= LOG_LEVEL_VERBOSE) {
+			outf("Watchdog doesn't support the timeout we requested (shortened to %d seconds).\n",
+			     wd_timeout);
 		}
 	}
 
-	intervals_left = timeout_intervals;
-
 	while (outfd >= 0 || errfd >= 0 || sigfd >= 0) {
-		struct timeval tv = { .tv_sec = timeout };
+		const char *timeout_reason;
+		struct timeval tv = { .tv_sec = interval_length };
 
 		FD_ZERO(&set);
 		if (outfd >= 0)
@@ -751,7 +811,7 @@ static int monitor_output(pid_t child,
 		if (sigfd >= 0)
 			FD_SET(sigfd, &set);
 
-		n = select(nfds, &set, NULL, NULL, timeout == 0 ? NULL : &tv);
+		n = select(nfds, &set, NULL, NULL, &tv);
 		ping_watchdogs();
 
 		if (n < 0) {
@@ -759,86 +819,18 @@ static int monitor_output(pid_t child,
 			return -1;
 		}
 
-		/*
-		 * If we're configured to care about taints, kill the
-		 * test if there's a taint. But only if we didn't
-		 * already kill it, and make sure we still process the
-		 * fds select() marked for us.
-		 */
-		if (settings->abort_mask & ABORT_TAINT &&
-		    tainted(&taints) &&
-		    killed == 0) {
-			if (settings->log_level >= LOG_LEVEL_NORMAL) {
-				outf("Killing the test because the kernel is tainted.\n");
-				fflush(stdout);
-			}
-
-			killed = SIGQUIT;
-			if (!kill_child(killed, child))
-				return -1;
-
-			/*
-			 * Now continue the loop and let the
-			 * dying child be handled normally.
-			 */
-			timeout = 20;
-			watchdogs_set_timeout(120);
-			intervals_left = timeout_intervals = 1;
-		} else if (n == 0) {
-			if (--intervals_left)
-				continue;
+		igt_gettime(&time_now);
 
-			switch (killed) {
-			case 0:
-				show_kernel_task_state();
-				if (settings->log_level >= LOG_LEVEL_NORMAL) {
-					outf("Timeout. Killing the current test with SIGQUIT.\n");
-					fflush(stdout);
-				}
-
-				killed = SIGQUIT;
-				if (!kill_child(killed, child))
-					return -1;
-
-				/*
-				 * Now continue the loop and let the
-				 * dying child be handled normally.
-				 */
-				timeout = 20;
-				watchdogs_set_timeout(120);
-				intervals_left = timeout_intervals = 1;
-				break;
-			case SIGQUIT:
-				if (settings->log_level >= LOG_LEVEL_NORMAL) {
-					outf("Timeout. Killing the current test with SIGKILL.\n");
-					fflush(stdout);
-				}
-
-				killed = SIGKILL;
-				if (!kill_child(killed, child))
-					return -1;
-
-				/*
-				 * Allow the test two minutes to die
-				 * on SIGKILL. If it takes more than
-				 * that, we're quite likely in a
-				 * scenario where we want to reboot
-				 * the machine anyway.
-				 */
-				watchdogs_set_timeout(sigkill_timeout);
-				timeout = sigkill_interval;
-				intervals_left = 1; /* Intervals handled separately for sigkill */
-				break;
-			case SIGKILL:
-				if (!is_tainted(taints) && --sigkill_intervals_left) {
-					intervals_left = 1;
-					break;
-				}
+		timeout_reason = need_to_timeout(settings, killed, tainted(&taints),
+						 igt_time_elapsed(&time_last_activity, &time_now),
+						 igt_time_elapsed(&time_killed, &time_now));
 
+		if (timeout_reason) {
+			if (killed == SIGKILL) {
 				/* Nothing that can be done, really. Let's tell the caller we want to abort. */
 
 				if (settings->log_level >= LOG_LEVEL_NORMAL) {
-					errf("Child refuses to die, tainted %lx. Aborting.\n",
+					errf("Child refuses to die, tainted 0x%lx. Aborting.\n",
 					     taints);
 					if (kill(child, 0) && errno == ESRCH)
 						errf("The test process no longer exists, "
@@ -853,15 +845,23 @@ static int monitor_output(pid_t child,
 				return -1;
 			}
 
-			continue;
-		}
+			if (settings->log_level >= LOG_LEVEL_NORMAL) {
+				outf("%s", timeout_reason);
+				fflush(stdout);
+			}
 
-		intervals_left = timeout_intervals;
+			killed = next_kill_signal(killed);
+			if (!kill_child(killed, child))
+				return -1;
+			time_killed = time_now;
+		}
 
 		/* TODO: Refactor these handlers to their own functions */
 		if (outfd >= 0 && FD_ISSET(outfd, &set)) {
 			char *newline;
 
+			time_last_activity = time_now;
+
 			s = read(outfd, buf, sizeof(buf));
 			if (s <= 0) {
 				if (s < 0) {
@@ -929,6 +929,8 @@ static int monitor_output(pid_t child,
 	out_end:
 
 		if (errfd >= 0 && FD_ISSET(errfd, &set)) {
+			time_last_activity = time_now;
+
 			s = read(errfd, buf, sizeof(buf));
 			if (s <= 0) {
 				if (s < 0) {
@@ -945,6 +947,8 @@ static int monitor_output(pid_t child,
 		}
 
 		if (kmsgfd >= 0 && FD_ISSET(kmsgfd, &set)) {
+			time_last_activity = time_now;
+
 			s = read(kmsgfd, buf, sizeof(buf));
 			if (s < 0) {
 				if (errno != EPIPE && errno != EINVAL) {
@@ -995,17 +999,15 @@ static int monitor_output(pid_t child,
 				}
 
 				aborting = true;
-				timeout = 2;
 				killed = SIGQUIT;
 				if (!kill_child(killed, child))
 					return -1;
+				time_killed = time_now;
 
 				continue;
 			}
 
-			igt_gettime(&time_end);
-
-			time = igt_time_elapsed(&time_beg, &time_end);
+			time = igt_time_elapsed(&time_beg, &time_now);
 			if (time < 0.0)
 				time = 0.0;
author	Petri Latvala <petri.latvala@intel.com>	2020-02-17 16:50:41 +0200
committer	Petri Latvala <petri.latvala@intel.com>	2020-02-19 12:34:36 +0200
commit	4dce5cf1f39192656e113260e062e7f6782a4b46 (patch)
tree	c76b4fd7cd476a301ce783e071e9a276c375bd87 /runner
parent	5c5d5952a04117074db660ad25d8a6010cbf6116 (diff)