Przeglądaj źródła

New function starpu_task_watchdog_set_hook to specify a function to be called when the watchdog is raised

Alexis Juven 5 lat temu
rodzic
commit
bcfa006770
3 zmienionych plików z 31 dodań i 7 usunięć
  1. 2 0
      ChangeLog
  2. 8 2
      include/starpu_task.h
  3. 21 5
      src/core/task.c

+ 2 - 0
ChangeLog

@@ -43,6 +43,8 @@ Small features:
   * Move optimized cuda 2d copy from interfaces to new
     starpu_cuda_copy2d_async_sync and starpu_cuda_copy3d_async_sync, and use
     them from starpu_interface_copy2d and 3d.
+  * New function starpu_task_watchdog_set_hook to specify a function
+    to be called when the watchdog triggers.
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 8 - 2
include/starpu_task.h

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2017,2019                           Inria
+ * Copyright (C) 2011-2017,2020                           Inria
  * Copyright (C) 2009-2019                                Université de Bordeaux
- * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2010-2015,2017,2018,2019,2020            CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
  *
@@ -1629,6 +1629,12 @@ void starpu_task_ft_failed(struct starpu_task *task);
  */
 void starpu_task_ft_success(struct starpu_task *meta_task);
 
+/**
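+   Set the function to call when the watchdog detects that StarPU has
+   not finished any task for the period configured through
+   STARPU_WATCHDOG_TIMEOUT. The function is called with hook_arg as
+   its only argument, in place of the default watchdog message.
+   Passing NULL as hook restores the default message.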
+*/
+void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg);
+
 /** @} */
 
 #ifdef __cplusplus

+ 21 - 5
src/core/task.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011-2019                                Inria
  * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2017                                     Erwan Leria
- * Copyright (C) 2010-2019                                CNRS
+ * Copyright (C) 2010-2020                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
@@ -245,6 +245,12 @@ static int limit_max_submitted_tasks;
 static int watchdog_crash;
 static int watchdog_delay;
 
+/*
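+ * Function to call, with watchdog_hook_arg as its only argument, when the watchdog detects that no task has finished for the period configured through STARPU_WATCHDOG_TIMEOUT. When NULL, the watchdog prints its default message instead.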
+ */
+static void (*watchdog_hook)(void *) = NULL;
+static void * watchdog_hook_arg = NULL;
+
 #define _STARPU_TASK_MAGIC 42
 
 /* Called once at starpu_init */
@@ -1547,14 +1553,18 @@ static void *watchdog_func(void *arg)
 		if (!config->watchdog_ok && last_nsubmitted
 				&& last_nsubmitted == starpu_task_nsubmitted())
 		{
-			_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n",
-				    timeout);
+			if (watchdog_hook == NULL)
+				_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n",
+									timeout);
+			else
+				watchdog_hook(watchdog_hook_arg);
+
 			if (watchdog_crash)
 			{
 				_STARPU_MSG("Crashing the process\n");
 				raise(SIGABRT);
 			}
-			else
+			else if (watchdog_hook == NULL)
 				_STARPU_MSG("Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
 		}
 		/* Only shout again after another period */
@@ -1564,7 +1574,13 @@ static void *watchdog_func(void *arg)
 	return NULL;
 }
 
-void _starpu_watchdog_init(void)
+void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg)
+{	/* Record the callback that watchdog_func invokes in place of its default message. */
+	watchdog_hook = hook;		/* a NULL hook restores the default message */
+	watchdog_hook_arg = hook_arg;	/* handed back to hook as its only argument */
+}
+
+void _starpu_watchdog_init()
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	char *timeout_env = starpu_getenv("STARPU_WATCHDOG_TIMEOUT");