summaryrefslogtreecommitdiff
path: root/runner
diff options
context:
space:
mode:
author: Petri Latvala <petri.latvala@intel.com> 2019-12-03 13:15:37 +0200
committer: Petri Latvala <petri.latvala@intel.com> 2019-12-05 12:19:08 +0200
commit: 59be90b3c76113d03a1bb095c4d4585e51058f4a (patch)
tree: 0744444adadaf7cd9b9a2500270df1a81a500119 /runner
parent: e26259c7570390aa0664b250cb842439598bcbff (diff)
runner: Don't wait forever for processes to die
While the originally written timeout for process killing (2 seconds) was way too short, waiting indefinitely is suboptimal as well. We're seeing cases where the test is stuck, possibly for hours, in uninterruptible sleep (IO). Wait for a considerably longer, fixed period of 2 minutes instead: a test that is still "making progress" towards exiting after that long means the machine is in a bad enough state to require a good kicking and booting.

v2:
 - Abort quicker if kernel is tainted (Chris)
 - Correctly convert process-exists check with kill() to process-does-not-exist

Signed-off-by: Petri Latvala <petri.latvala@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'runner')
-rw-r--r-- runner/executor.c | 29
1 files changed, 20 insertions, 9 deletions
diff --git a/runner/executor.c b/runner/executor.c
index e6086772..f36bfd3d 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -682,6 +682,9 @@ static int monitor_output(pid_t child,
int timeout_intervals = 1, intervals_left;
int wd_extra = 10;
int killed = 0; /* 0 if not killed, signal number otherwise */
+ int sigkill_timeout = 120;
+ int sigkill_interval = 20;
+ int sigkill_intervals_left = sigkill_timeout / sigkill_interval;
struct timespec time_beg, time_end;
unsigned long taints = 0;
bool aborting = false;
@@ -776,25 +779,33 @@ static int monitor_output(pid_t child,
if (!kill_child(killed, child))
return -1;
- intervals_left = timeout_intervals = 1;
- break;
- case SIGKILL:
/*
- * If the child still exists, and the kernel
- * hasn't oopsed, assume it is still making
- * forward progress towards exiting (i.e. still
- * freeing all of its resources).
+ * Allow the test two minutes to die
+ * on SIGKILL. If it takes more than
+ * that, we're quite likely in a
+ * scenario where we want to reboot
+ * the machine anyway.
*/
- if (kill(child, 0) == 0 && !tainted(&taints)) {
- intervals_left = 1;
+ watchdogs_set_timeout(sigkill_timeout);
+ timeout = sigkill_interval;
+ intervals_left = 1; /* Intervals handled separately for sigkill */
+ break;
+ case SIGKILL:
+ if (!tainted(&taints) && --sigkill_intervals_left) {
+ intervals_left = 1;
break;
}
/* Nothing that can be done, really. Let's tell the caller we want to abort. */
+
if (settings->log_level >= LOG_LEVEL_NORMAL) {
errf("Child refuses to die, tainted %lx. Aborting.\n",
taints);
+ if (kill(child, 0) && errno == ESRCH)
+ errf("The test process no longer exists, "
+ "but we didn't get informed of its demise...\n");
}
+
close_watchdogs(settings);
free(outbuf);
close(outfd);