From 756f3e0cb7e7d7351e3eb955ca782a438c6aa887 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 22 Mar 2016 11:33:41 +0000 Subject: lib: Add a GPU error detector If we listen to the uevents from the kernel, we can detect when the GPU hangs. This requires us to fork a helper process to do so and send a signal back to the parent. Signed-off-by: Chris Wilson --- benchmarks/Makefile.am | 2 +- debugger/Makefile.am | 2 +- demos/Makefile.am | 2 +- lib/Makefile.am | 12 ++++++-- lib/igt_aux.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/igt_aux.h | 3 ++ tests/Makefile.am | 3 +- tests/gem_exec_whisper.c | 4 +++ tools/Makefile.am | 2 +- 9 files changed, 102 insertions(+), 8 deletions(-) diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index c67f4722..2c2d1006 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -3,7 +3,7 @@ include Makefile.sources AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS) -LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm +LDADD = $(top_builddir)/lib/libintel_tools.la benchmarks_LTLIBRARIES = gem_exec_tracer.la gem_exec_tracer_la_LDFLAGS = -module -avoid-version -no-undefined diff --git a/debugger/Makefile.am b/debugger/Makefile.am index 5a523f5e..9d231d3f 100644 --- a/debugger/Makefile.am +++ b/debugger/Makefile.am @@ -15,4 +15,4 @@ AM_CFLAGS = \ $(LIBUNWIND_CFLAGS) \ $(CWARNFLAGS) -LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) +LDADD = $(top_builddir)/lib/libintel_tools.la diff --git a/demos/Makefile.am b/demos/Makefile.am index d18a705f..e6fbb3ba 100644 --- a/demos/Makefile.am +++ b/demos/Makefile.am @@ -4,4 +4,4 @@ bin_PROGRAMS = \ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS) 
-LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
+LDADD = $(top_builddir)/lib/libintel_tools.la
diff --git a/lib/Makefile.am b/lib/Makefile.am
index a8a1eb6d..d2f2e16b 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -15,12 +15,20 @@ if HAVE_VC4
 endif
 
 AM_CPPFLAGS = -I$(top_srcdir)
-AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(LIBUNWIND_CFLAGS) $(DEBUG_CFLAGS) \
+AM_CFLAGS = $(CWARNFLAGS) $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(LIBUNWIND_CFLAGS) $(DEBUG_CFLAGS) \
 	-DIGT_SRCDIR=\""$(abs_top_srcdir)/tests"\" \
 	-DIGT_DATADIR=\""$(pkgdatadir)"\" \
 	-DIGT_LOG_DOMAIN=\""$(subst _,-,$*)"\" \
 	-pthread
 
-LDADD = $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
 AM_CFLAGS += $(CAIRO_CFLAGS)
 
+libintel_tools_la_LIBADD = \
+	$(DRM_LIBS) \
+	$(PCIACCESS_LIBS) \
+	$(CAIRO_LIBS) \
+	$(LIBUDEV_LIBS) \
+	$(LIBUNWIND_LIBS) \
+	$(TIMER_LIBS) \
+	-lm
+
diff --git a/lib/igt_aux.c b/lib/igt_aux.c
index b4c301e5..bfeaa168 100644
--- a/lib/igt_aux.c
+++ b/lib/igt_aux.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include /* header name lost in extraction; presumably the poll header needed by the code added below - TODO confirm */
 #include
 #include
 #include
@@ -359,6 +360,85 @@ void igt_stop_signal_helper(void)
 	sig_stat = 0;
 }
 
+#if HAVE_UDEV
+#include /* header name lost in extraction; the udev_* calls below are declared in <libudev.h> */
+/* Helper process started by igt_fork_hang_detector() and stopped by igt_stop_hang_detector(). */
+static struct igt_helper_process hang_detector;
+static void __attribute__((noreturn)) /* helper body: every path out of the loop reaches exit(0) */
+hang_detector_process(pid_t pid, dev_t rdev) /* pid: process to signal; rdev: device number to match */
+{
+	struct udev_monitor *mon =
+		udev_monitor_new_from_netlink(udev_new(), "kernel"); /* NOTE(review): neither return value is checked */
+	struct pollfd pfd;
+	/* Filter the monitor on subsystem "drm" (devtype NULL), then enable receiving. */
+	udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
+	udev_monitor_enable_receiving(mon);
+
+	pfd.fd = udev_monitor_get_fd(mon);
+	pfd.events = POLLIN;
+	/* Timeout -1: wait indefinitely for a uevent; an error or zero return from poll() ends the loop. */
+	while (poll(&pfd, 1, -1) > 0) {
+		struct udev_device *dev = udev_monitor_receive_device(mon);
+		dev_t devnum;
+
+		if (dev == NULL)
+			break;
+		/* Only a uevent whose device number equals rdev and whose ERROR property is 1 signals pid. */
+		devnum = udev_device_get_devnum(dev);
+		if (memcmp(&rdev, &devnum, sizeof(dev_t)) == 0) {
+			const char *str;
+
+			str = udev_device_get_property_value(dev, "ERROR");
+			if (str && atoi(str) == 1)
+				kill(pid, SIGRTMAX);
+		}
+		udev_device_unref(dev);
+		if (kill(pid, 0)) /* Parent has died, so must we. */
+			break;
+	}
+	/* NOTE(review): the liveness check above runs only after a uevent, because poll() never times out. */
+	exit(0);
+}
+/* SIGRTMAX handler installed by igt_fork_hang_detector(); runs in signal-handler context. */
+static void sig_abort(int sig)
+{
+	igt_assert(!"GPU hung"); /* the operand is a non-NULL string literal, so the asserted expression is always 0 */
+}
+/* Install the SIGRTMAX handler and start the helper for the device behind fd; returns early when only listing subtests. */
+void igt_fork_hang_detector(int fd)
+{
+	struct stat st;
+
+	if (igt_only_list_subtests())
+		return;
+
+	igt_assert(fstat(fd, &st) == 0);
+	/* The helper is handed getppid() as the process to signal and st_rdev of fd as the device number to match. */
+	signal(SIGRTMAX, sig_abort);
+	igt_fork_helper(&hang_detector)
+		hang_detector_process(getppid(), st.st_rdev);
+}
+/* Stop the helper started by igt_fork_hang_detector(); returns early when only listing subtests. */
+void igt_stop_hang_detector(void)
+{
+	if (igt_only_list_subtests())
+		return;
+
+	igt_stop_helper(&hang_detector);
+}
+#else /* !HAVE_UDEV: neither function below forks or stops anything */
+void igt_fork_hang_detector(int fd)
+{
+	if (igt_only_list_subtests())
+		return;
+}
+
+void igt_stop_hang_detector(void)
+{
+}
+#endif /* HAVE_UDEV */
+
 /**
  * igt_check_boolean_env_var:
  * @env_var: environment variable name
diff --git a/lib/igt_aux.h b/lib/igt_aux.h
index 101fad12..cdaed297 100644
--- a/lib/igt_aux.h
+++ b/lib/igt_aux.h
@@ -40,6 +40,9 @@ extern int num_trash_bos;
 void igt_fork_signal_helper(void);
 void igt_stop_signal_helper(void);
 
+void igt_fork_hang_detector(int fd);
+void igt_stop_hang_detector(void);
+
 struct igt_sigiter {
 	unsigned pass;
 };
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0ed40f7d..771e9eed 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -56,9 +56,8 @@ AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(DEBUG_CFLAGS)\
 	$(LIBUNWIND_CFLAGS) \
 	$(NULL)
 
-LDADD = ../lib/libintel_tools.la $(PCIACCESS_LIBS) $(DRM_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
+LDADD = ../lib/libintel_tools.la $(GLIB_LIBS)
 
-LDADD += $(CAIRO_LIBS) $(LIBUDEV_LIBS) $(GLIB_LIBS) -lm
 AM_CFLAGS += $(CAIRO_CFLAGS) $(LIBUDEV_CFLAGS) $(GLIB_CFLAGS)
 AM_LDFLAGS = -Wl,--as-needed
 
diff --git a/tests/gem_exec_whisper.c b/tests/gem_exec_whisper.c
index b84f1a27..1991fed7 100644
--- a/tests/gem_exec_whisper.c
+++ b/tests/gem_exec_whisper.c
@@ -368,6 +368,8 @@ igt_main
 	igt_fixture
 		fd = drm_open_driver_master(DRIVER_INTEL);
 
+	igt_fork_hang_detector(fd);
+
 	for (const struct mode *m = modes; m->name; m++)
 		igt_subtest_f("%s", *m->name ? 
m->name : "basic") whisper(fd, -1, m->flags); @@ -382,6 +384,8 @@ igt_main whisper(fd, e->exec_id | e->flags, m->flags); } + igt_stop_hang_detector(); + igt_fixture close(fd); } diff --git a/tools/Makefile.am b/tools/Makefile.am index 74c55218..df48d94a 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -4,7 +4,7 @@ SUBDIRS = null_state_gen registers AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib AM_CFLAGS = $(DEBUG_CFLAGS) $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS) -DPKGDATADIR=\"$(pkgdatadir)\" -LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm +LDADD = $(top_builddir)/lib/libintel_tools.la AM_LDFLAGS = -Wl,--as-needed -- cgit v1.2.3