소스 검색

Add STARPU_WATCHDOG_DELAY environment variable

Samuel Thibault 8 년 전
부모
커밋
a84304dfdc
3개의 변경된 파일36개의 추가작업 그리고 14개의 파일을 삭제
  1. 1 0
      ChangeLog
  2. 9 0
      doc/doxygen/chapters/501_environment_variables.doxy
  3. 26 14
      src/core/task.c

+ 1 - 0
ChangeLog

@@ -84,6 +84,7 @@ Small features:
     allow having one perfmodel per CPU core
   * Add starpu_vector_filter_list_long filter.
   * Add starpu_perfmodel_arch_comb_fetch function.
+  * Add STARPU_WATCHDOG_DELAY environment variable.
 
 Small changes:
   * Output generated through STARPU_MPI_COMM has been modified to

+ 9 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1033,6 +1033,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_WATCHDOG_DELAY</dt>
+<dd>
+\anchor STARPU_WATCHDOG_DELAY
+\addindex __env__STARPU_WATCHDOG_DELAY
+This delays the activation of the watchdog by the given time (in µs). This can
+be convenient for letting the application initialize data etc. before starting
+to look for idle time.
+</dd>
+
 <dt>STARPU_TASK_BREAK_ON_PUSH</dt>
 <dd>
 \anchor STARPU_TASK_BREAK_ON_PUSH

+ 26 - 14
src/core/task.c

@@ -55,6 +55,7 @@ static starpu_pthread_key_t current_task_key;
 static int limit_min_submitted_tasks;
 static int limit_max_submitted_tasks;
 static int watchdog_crash;
+static int watchdog_delay;
 
 #define _STARPU_TASK_MAGIC 42
 
@@ -65,6 +66,7 @@ void _starpu_task_init(void)
 	limit_min_submitted_tasks = starpu_get_env_number("STARPU_LIMIT_MIN_SUBMITTED_TASKS");
 	limit_max_submitted_tasks = starpu_get_env_number("STARPU_LIMIT_MAX_SUBMITTED_TASKS");
 	watchdog_crash = starpu_get_env_number("STARPU_WATCHDOG_CRASH");
+	watchdog_delay = starpu_get_env_number_default("STARPU_WATCHDOG_DELAY", 0);
 }
 
 void _starpu_task_deinit(void)
@@ -1255,20 +1257,41 @@ unsigned long starpu_task_get_job_id(struct starpu_task *task)
 
 static starpu_pthread_t watchdog_thread;
 
+static int sleep_some(float timeout) {
+	/* If we do a sleep(timeout), we might have to wait too long at the end of the computation. */
+	/* To avoid that, we do several sleep() of 1s (and check after each if starpu is still running) */
+	float t;
+	for (t = timeout ; t > 1.; t--)
+	{
+		starpu_sleep(1.);
+		if (!_starpu_machine_is_running())
+			/* Application finished, don't bother finishing the sleep */
+			return 0;
+	}
+	/* and one final sleep (of less than 1 s) with the rest (if needed) */
+	if (t > 0.)
+		starpu_sleep(t);
+	return 1;
+}
+
 /* Check from times to times that StarPU does finish some tasks */
 static void *watchdog_func(void *arg)
 {
 	char *timeout_env = arg;
-	float timeout;
+	float timeout, delay;
 
 #ifdef _MSC_VER
 	timeout = ((float) _atoi64(timeout_env)) / 1000000;
 #else
 	timeout = ((float) atoll(timeout_env)) / 1000000;
 #endif
+	delay = ((float) watchdog_delay) / 1000000;
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	starpu_pthread_setname("watchdog");
 
+	if (!sleep_some(delay))
+		return NULL;
+
 	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 	while (_starpu_machine_is_running())
 	{
@@ -1276,19 +1299,8 @@ static void *watchdog_func(void *arg)
 		config->watchdog_ok = 0;
 		STARPU_PTHREAD_MUTEX_UNLOCK(&config->submitted_mutex);
 
-		/* If we do a sleep(timeout), we might have to wait too long at the end of the computation. */
-		/* To avoid that, we do several sleep() of 1s (and check after each if starpu is still running) */
-		float t;
-		for (t = timeout ; t > 1.; t--)
-		{
-			starpu_sleep(1.);
-			if (!_starpu_machine_is_running())
-				/* Application finished, don't bother finishing the sleep */
-				return NULL;
-		}
-		/* and one final sleep (of less than 1 s) with the rest (if needed) */
-		if (t > 0.)
-			starpu_sleep(t);
+		if (!sleep_some(timeout))
+			return NULL;
 
 		STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 		if (!config->watchdog_ok && last_nsubmitted