Просмотр исходного кода

port r11383 from 1.1: Add a watchdog which permits to easily trigger a crash when StarPU gets stuck.

Samuel Thibault лет назад: 11
Родитель
Сommit
1988972e97
6 измененных файлов с 89 добавлено и 1 удалено
  1. 2 0
      ChangeLog
  2. 17 0
      doc/doxygen/chapters/optimize_performance.doxy
  3. 4 1
      src/core/debug.h
  4. 59 0
      src/core/task.c
  5. 3 0
      src/core/task.h
  6. 4 0
      src/core/workers.c

+ 2 - 0
ChangeLog

@@ -54,6 +54,8 @@ New features:
     cudaMalloc overhead.
   * Prefetching is now done for all schedulers when it can be done whatever
     the scheduling decision.
+  * Add a watchdog which makes it easy to trigger a crash when StarPU gets
+    stuck.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 17 - 0
doc/doxygen/chapters/optimize_performance.doxy

@@ -382,6 +382,23 @@ Statistics on the execution can then be obtained by using <c>export
 STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
  More details on performance feedback are provided by the next chapter.
 
+\section DetectionStuckConditions Detecting Stuck Conditions
+
+It may happen that, for some reason, StarPU does not make progress for a long
+period of time.  This is sometimes due to contention inside StarPU, but
+sometimes it is due to external causes, such as a stuck MPI driver, a stuck
+CUDA driver, etc.
+
+<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
+
+makes StarPU print an error message whenever StarPU does not terminate
+any task for 10ms (the timeout is expressed in microseconds). In addition to that,
+
+<c>export STARPU_WATCHDOG_CRASH=1</c>
+
+triggers a crash in that condition, thus making it possible to catch the
+situation in gdb, etc.
+
 \section CUDA-specificOptimizations CUDA-specific Optimizations
 
 Due to CUDA limitations, StarPU will have a hard time overlapping its own

+ 4 - 1
src/core/debug.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,4 +44,7 @@ extern int _starpu_use_fxt;
 /* Get an Ayudame id for CL */
 int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl);
 
+/* Start and stop the watchdog thread implemented in src/core/task.c.
+ * NOTE(review): the same two prototypes are also declared in
+ * src/core/task.h -- consider keeping a single declaration. */
+void _starpu_watchdog_init(void);
+void _starpu_watchdog_shutdown(void);
+
 #endif // __DEBUG_H__

+ 59 - 0
src/core/task.c

@@ -39,6 +39,7 @@
 static starpu_pthread_cond_t submitted_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t submitted_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static long int nsubmitted = 0, nready = 0;
+/* Progress flag for the watchdog: cleared by watchdog_func() at the start of
+ * each period and set by _starpu_decrement_nsubmitted_tasks(); both accesses
+ * are made with submitted_mutex held. */
+static int watchdog_ok;
 
 static void _starpu_increment_nsubmitted_tasks(void);
 
@@ -795,6 +796,9 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
 	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
+	if (!watchdog_ok)
+		watchdog_ok = 1;
+
 	if (--nsubmitted == 0)
 	{
 		if (!config->submitting)
@@ -1005,3 +1009,58 @@ char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, un
 {
 	return cl->cpu_funcs_name[nimpl];
 }
+
+static starpu_pthread_t watchdog_thread;
+
+/*
+ * Watchdog thread body: check from time to time that StarPU does finish some
+ * tasks.
+ *
+ * The period is read from the STARPU_WATCHDOG_TIMEOUT environment variable,
+ * in microseconds.  When the variable is unset, or does not hold a positive
+ * number, the thread returns immediately and nothing is monitored.
+ *
+ * After each period, an error message is printed if tasks were submitted at
+ * the beginning of the period, watchdog_ok was not set in the meantime by
+ * _starpu_decrement_nsubmitted_tasks(), and the number of submitted tasks did
+ * not change.  If STARPU_WATCHDOG_CRASH is also set, the process is aborted.
+ *
+ * The argument is unused; the function always returns NULL.
+ */
+static void *watchdog_func(void *foo)
+{
+	struct timespec ts, req, rem;
+	char *timeout_env, *end;
+	unsigned long long timeout;
+
+	(void) foo;
+
+	if (! (timeout_env = getenv("STARPU_WATCHDOG_TIMEOUT")))
+		return NULL;
+
+	timeout = strtoull(timeout_env, &end, 10);
+	if (end == timeout_env || timeout == 0)
+	{
+		/* A null period would make the loop below spin without ever sleeping */
+		fprintf(stderr,"Invalid STARPU_WATCHDOG_TIMEOUT value '%s', the StarPU watchdog is disabled\n", timeout_env);
+		return NULL;
+	}
+	ts.tv_sec = timeout / 1000000;
+	ts.tv_nsec = (timeout % 1000000) * 1000;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	while (_starpu_machine_is_running())
+	{
+		int last_nsubmitted = starpu_task_nsubmitted();
+		watchdog_ok = 0;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+
+		/* Sleep on a copy, so that ts keeps the configured period even
+		 * when the sleep gets interrupted and has to be resumed.  rem is
+		 * zeroed first so that a failure which does not fill it in ends
+		 * the sleep instead of reading an uninitialized value. */
+		req = ts;
+		rem.tv_sec = 0;
+		rem.tv_nsec = 0;
+		while (nanosleep(&req, &rem))
+			req = rem;
+
+		STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+		if (!watchdog_ok && last_nsubmitted
+				&& last_nsubmitted == starpu_task_nsubmitted())
+		{
+			fprintf(stderr,"The StarPU watchdog detected that no task finished for %lu.%06lus (can be configure through STARPU_WATCHDOG_TIMEOUT)\n", (unsigned long) ts.tv_sec, (unsigned long) (ts.tv_nsec/1000));
+			if (getenv("STARPU_WATCHDOG_CRASH"))
+			{
+				fprintf(stderr,"Crashing the process\n");
+				/* abort() rather than assert(0): the latter is compiled out with NDEBUG */
+				abort();
+			}
+			else
+				fprintf(stderr,"Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
+		}
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+	return NULL;
+}
+
+/* Spawn the watchdog thread, which runs watchdog_func().  The thread is
+ * created unconditionally; watchdog_func() itself returns right away when
+ * STARPU_WATCHDOG_TIMEOUT is not set. */
+void _starpu_watchdog_init(void)
+{
+	STARPU_PTHREAD_CREATE(&watchdog_thread, NULL, watchdog_func, NULL);
+}
+
+/* Wait for the watchdog thread to terminate.  watchdog_func() only re-checks
+ * _starpu_machine_is_running() between two sleeps, so this join may block
+ * until the current sleep is over.
+ * NOTE(review): assumes the machine is already marked as not running when
+ * this is called -- confirm against starpu_shutdown(). */
+void _starpu_watchdog_shutdown(void)
+{
+	starpu_pthread_join(watchdog_thread, NULL);
+}

+ 3 - 0
src/core/task.h

@@ -80,4 +80,7 @@ char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, un
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)
 
+/* Start and stop the watchdog thread, which checks that tasks keep
+ * terminating.
+ * NOTE(review): the same two prototypes are also declared in
+ * src/core/debug.h -- consider keeping a single declaration. */
+void _starpu_watchdog_init(void);
+void _starpu_watchdog_shutdown(void);
+
 #endif // __CORE_TASK_H__

+ 4 - 0
src/core/workers.c

@@ -1041,6 +1041,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	if (!is_a_sink)
 		_starpu_launch_drivers(&config);
 
+	_starpu_watchdog_init();
+
 	STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	initialized = INITIALIZED;
 	/* Tell everybody that we initialized */
@@ -1215,6 +1217,8 @@ void starpu_shutdown(void)
 
 	_starpu_deinitialize_registered_performance_models();
 
+	_starpu_watchdog_shutdown();
+
 	/* wait for their termination */
 	_starpu_terminate_workers(&config);