Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Samuel Thibault 5 years ago
parent
commit
aeee4c1872
6 changed files with 75 additions and 80 deletions
  1. 2 0
      ChangeLog
  2. 6 7
      include/starpu_task.h
  3. 3 1
      mpi/src/mpi/starpu_mpi_mpi.c
  4. 3 1
      mpi/src/nmad/starpu_mpi_nmad.c
  5. 54 67
      src/core/task.c
  6. 7 4
      src/core/topology.c

+ 2 - 0
ChangeLog

@@ -43,6 +43,8 @@ Small features:
   * Move optimized cuda 2d copy from interfaces to new
     starpu_cuda_copy2d_async_sync and starpu_cuda_copy3d_async_sync, and use
     them from starpu_interface_copy2d and 3d.
+  * New function starpu_task_watchdog_set_hook to specify a function
+    to be called when the watchdog is raised
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 6 - 7
include/starpu_task.h

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2017,2019                           Inria
+ * Copyright (C) 2011-2017,2020                           Inria
  * Copyright (C) 2009-2019                                Université de Bordeaux
- * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2010-2015,2017,2018,2019,2020            CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
  *
@@ -1629,12 +1629,11 @@ void starpu_task_ft_failed(struct starpu_task *task);
  */
 void starpu_task_ft_success(struct starpu_task *meta_task);
 
-
 /**
-   Sets the function to call when the watchdog detects that StarPU has not
- * finished task for STARPU_WATCHDOG_TIMEOUT seconds */
-void starpu_task_set_watchdog_hook(void (*hook)(void *), void * hook_arg);
-
+   Set the function to call when the watchdog detects that StarPU has
+   not finished any task for STARPU_WATCHDOG_TIMEOUT seconds
+*/
+void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg);
 
 /** @} */
 

+ 3 - 1
mpi/src/mpi/starpu_mpi_mpi.c

@@ -1133,7 +1133,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
 	{
-		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
+		char hostname[65];
+		_starpu_gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)

+ 3 - 1
mpi/src/nmad/starpu_mpi_nmad.c

@@ -414,7 +414,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
-		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
+		char hostname[65];
+		_starpu_gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)

+ 54 - 67
src/core/task.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011-2019                                Inria
  * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2017                                     Erwan Leria
- * Copyright (C) 2010-2019                                CNRS
+ * Copyright (C) 2010-2020                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
@@ -246,12 +246,11 @@ static int watchdog_crash;
 static int watchdog_delay;
 
 /*
- * Fuction to call when watchdog detects that no task has finished for more than STARPU_WATCHDOG_TIMEOUT seconds
+ * Function to call when watchdog detects that no task has finished for more than STARPU_WATCHDOG_TIMEOUT seconds
  */
 static void (*watchdog_hook)(void *) = NULL;
 static void * watchdog_hook_arg = NULL;
 
-
 #define _STARPU_TASK_MAGIC 42
 
 /* Called once at starpu_init */
@@ -371,7 +370,7 @@ void _starpu_task_destroy(struct starpu_task *task)
 	/* If starpu_task_destroy is called in a callback, we just set the destroy
 	   flag. The task will be destroyed after the callback returns */
 	if (task == starpu_task_get_current()
-			&& _starpu_get_local_worker_status() == STATUS_CALLBACK)
+	    && _starpu_get_local_worker_status() == STATUS_CALLBACK)
 	{
 		task->destroy = 1;
 	}
@@ -417,7 +416,7 @@ int starpu_task_finished(struct starpu_task *task)
 
 int starpu_task_wait(struct starpu_task *task)
 {
-	_STARPU_LOG_IN();
+        _STARPU_LOG_IN();
 	STARPU_ASSERT(task);
 
 	STARPU_ASSERT_MSG(!task->detach, "starpu_task_wait can only be called on tasks with detach = 0");
@@ -445,7 +444,7 @@ int starpu_task_wait(struct starpu_task *task)
 
 	_starpu_perf_counter_update_global_sample();
 	_STARPU_TRACE_TASK_WAIT_END();
-	_STARPU_LOG_OUT();
+        _STARPU_LOG_OUT();
 	return 0;
 }
 
@@ -511,7 +510,7 @@ int _starpu_submit_job(struct _starpu_job *j, int nodeps)
 #ifdef STARPU_USE_SC_HYPERVISOR
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
 	if(sched_ctx != NULL && j->task->sched_ctx != _starpu_get_initial_sched_ctx()->id && j->task->sched_ctx != STARPU_NMAX_SCHED_CTXS
-			&& sched_ctx->perf_counters != NULL)
+	   && sched_ctx->perf_counters != NULL)
 	{
 		struct starpu_perfmodel_arch arch;
 		_STARPU_MALLOC(arch.devices, sizeof(struct starpu_perfmodel_device));
@@ -604,7 +603,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	unsigned i, some_impl;
 
 	/* Check deprecated and unset fields (where, <device>_func,
-	 * <device>_funcs) */
+ 	 * <device>_funcs) */
 
 	/* CPU */
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS && cl->cpu_funcs[0])
@@ -776,8 +775,8 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 		/* Check buffers */
 		if (task->dyn_handles == NULL)
 			STARPU_ASSERT_MSG(STARPU_TASK_GET_NBUFFERS(task) <= STARPU_NMAXBUFS,
-					"Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.",
-					task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
+					  "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.",
+					  task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
 
 		if (STARPU_UNLIKELY(task->dyn_handles))
 		{
@@ -800,9 +799,9 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 			if (handle->home_node != -1)
 				_STARPU_TASK_SET_INTERFACE(task, starpu_data_get_interface_on_node(handle, handle->home_node), i);
 			if (!(task->cl->flags & STARPU_CODELET_NOPLANS) &&
-					((handle->nplans && !handle->nchildren) || handle->siblings)
-					&& handle->partition_automatic_disabled == 0
-			)
+			    ((handle->nplans && !handle->nchildren) || handle->siblings)
+			    && handle->partition_automatic_disabled == 0
+			    )
 				/* This handle is involved with asynchronous
 				 * partitioning as a parent or a child, make
 				 * sure the right plan is active, submit
@@ -856,16 +855,16 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	starpu_task_bundle_t bundle = task->bundle;
 	STARPU_ASSERT_MSG(!(nodeps && bundle), "not supported\n");
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
-	 * task structure, it is possible that this job structure was already
-	 * allocated. */
+	* task structure, it is possible that this job structure was already
+	* allocated. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	const unsigned continuation =
 #ifdef STARPU_OPENMP
-			j->continuation
+		j->continuation
 #else
-			0
+		0
 #endif
-			;
+		;
 	if (!_starpu_perf_counter_paused() && !j->internal && !continuation)
 	{
 		(void) STARPU_ATOMIC_ADD64(&_starpu_task__g_total_submitted__value, 1);
@@ -889,7 +888,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	{
 		int nsubmitted_tasks = starpu_task_nsubmitted();
 		if (limit_max_submitted_tasks >= 0 && limit_max_submitted_tasks < nsubmitted_tasks
-				&& limit_min_submitted_tasks >= 0 && limit_min_submitted_tasks < nsubmitted_tasks)
+			&& limit_min_submitted_tasks >= 0 && limit_min_submitted_tasks < nsubmitted_tasks)
 		{
 			starpu_do_schedule();
 			_STARPU_TRACE_TASK_THROTTLE_START();
@@ -911,8 +910,8 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	{
 		STARPU_ASSERT_MSG(!j->submitted || j->terminated >= 1, "Tasks can not be submitted a second time before being terminated. Please use different task structures, or use the regenerate flag to let the task resubmit itself automatically.");
 		_STARPU_TRACE_TASK_SUBMIT(j,
-				_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
-				_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
 	}
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
@@ -972,11 +971,11 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 		_starpu_sched_do_schedule(task->sched_ctx);
 		_starpu_wait_job(j);
 		if (task->destroy)
-			_starpu_task_destroy(task);
+		     _starpu_task_destroy(task);
 	}
 
 	_STARPU_TRACE_TASK_SUBMIT_END();
-	_STARPU_LOG_OUT();
+        _STARPU_LOG_OUT();
 	return ret;
 }
 
@@ -1010,7 +1009,7 @@ int starpu_task_submit_nodeps(struct starpu_task *task)
  * worker->sched_mutex must be locked when calling this function.
  */
 int _starpu_task_submit_conversion_task(struct starpu_task *task,
-		unsigned int workerid)
+					unsigned int workerid)
 {
 	int ret;
 	STARPU_ASSERT(task->cl);
@@ -1400,13 +1399,13 @@ void _starpu_task_prepare_for_continuation_ext(unsigned continuation_resubmit,
 		void (*continuation_callback_on_sleep)(void *arg), void *continuation_callback_on_sleep_arg)
 {
 	_starpu_job_prepare_for_continuation_ext(_starpu_get_job_associated_to_task(starpu_task_get_current()),
-			continuation_resubmit, continuation_callback_on_sleep, continuation_callback_on_sleep_arg);
+		continuation_resubmit, continuation_callback_on_sleep, continuation_callback_on_sleep_arg);
 }
 
 void _starpu_task_set_omp_cleanup_callback(struct starpu_task *task, void (*omp_cleanup_callback)(void *arg), void *omp_cleanup_callback_arg)
 {
 	_starpu_job_set_omp_cleanup_callback(_starpu_get_job_associated_to_task(task),
-			omp_cleanup_callback, omp_cleanup_callback_arg);
+		omp_cleanup_callback, omp_cleanup_callback_arg);
 }
 #endif
 
@@ -1433,14 +1432,14 @@ _starpu_task_uses_multiformat_handles(struct starpu_task *task)
  */
 int
 _starpu_handle_needs_conversion_task(starpu_data_handle_t handle,
-		unsigned int node)
+				     unsigned int node)
 {
 	return _starpu_handle_needs_conversion_task_for_arch(handle, starpu_node_get_kind(node));
 }
 
 int
 _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
-		enum starpu_node_kind node_kind)
+				     enum starpu_node_kind node_kind)
 {
 	/*
 	 * Here, we assume that CUDA devices and OpenCL devices use the
@@ -1449,39 +1448,39 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 	 */
 	switch (node_kind)
 	{
-	case STARPU_CPU_RAM:
-		switch(starpu_node_get_kind(handle->mf_node))
-		{
 		case STARPU_CPU_RAM:
-			return 0;
-		case STARPU_CUDA_RAM:      /* Fall through */
-		case STARPU_OPENCL_RAM:
-		case STARPU_MIC_RAM:
-		case STARPU_MPI_MS_RAM:
-			return 1;
-		default:
-			STARPU_ABORT();
-		}
-		break;
+			switch(starpu_node_get_kind(handle->mf_node))
+			{
+				case STARPU_CPU_RAM:
+					return 0;
+				case STARPU_CUDA_RAM:      /* Fall through */
+				case STARPU_OPENCL_RAM:
+				case STARPU_MIC_RAM:
+                                case STARPU_MPI_MS_RAM:
+					return 1;
+				default:
+					STARPU_ABORT();
+			}
+			break;
 		case STARPU_CUDA_RAM:    /* Fall through */
 		case STARPU_OPENCL_RAM:
 		case STARPU_MIC_RAM:
 		case STARPU_MPI_MS_RAM:
 			switch(starpu_node_get_kind(handle->mf_node))
 			{
-			case STARPU_CPU_RAM:
-				return 1;
-			case STARPU_CUDA_RAM:
-			case STARPU_OPENCL_RAM:
-			case STARPU_MIC_RAM:
-			case STARPU_MPI_MS_RAM:
-				return 0;
-			default:
-				STARPU_ABORT();
+				case STARPU_CPU_RAM:
+					return 1;
+				case STARPU_CUDA_RAM:
+				case STARPU_OPENCL_RAM:
+				case STARPU_MIC_RAM:
+                                case STARPU_MPI_MS_RAM:
+					return 0;
+				default:
+					STARPU_ABORT();
 			}
 			break;
-			default:
-				STARPU_ABORT();
+		default:
+			STARPU_ABORT();
 	}
 	/* that instruction should never be reached */
 	return -EINVAL;
@@ -1522,10 +1521,6 @@ static int sleep_some(float timeout)
 	return 1;
 }
 
-
-
-
-
 /* Check from times to times that StarPU does finish some tasks */
 static void *watchdog_func(void *arg)
 {
@@ -1572,7 +1567,6 @@ static void *watchdog_func(void *arg)
 			else if (watchdog_hook == NULL)
 				_STARPU_MSG("Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
 		}
-
 		/* Only shout again after another period */
 		config->watchdog_ok = 1;
 	}
@@ -1580,16 +1574,12 @@ static void *watchdog_func(void *arg)
 	return NULL;
 }
 
-
-/* Sets the function to call when the watchdog detects that StarPU has not
- * finished task for STARPU_WATCHDOG_TIMEOUT seconds */
-void starpu_task_set_watchdog_hook(void (*hook)(void *), void * hook_arg)
+void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg)
 {
 	watchdog_hook = hook;
 	watchdog_hook_arg = hook_arg;
 }
 
-
 void _starpu_watchdog_init()
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -1603,9 +1593,6 @@ void _starpu_watchdog_init()
 	STARPU_PTHREAD_CREATE(&watchdog_thread, NULL, watchdog_func, timeout_env);
 }
 
-
-
-
 void _starpu_watchdog_shutdown(void)
 {
 	char *timeout_env = starpu_getenv("STARPU_WATCHDOG_TIMEOUT");
@@ -1677,7 +1664,7 @@ static void _starpu_default_check_ft(void *arg)
 	}
 
 	new_task = starpu_task_ft_create_retry
-			(meta_task, current_task, _starpu_default_check_ft);
+(meta_task, current_task, _starpu_default_check_ft);
 
 	ret = starpu_task_submit_nodeps(new_task);
 	STARPU_ASSERT(!ret);
@@ -1695,7 +1682,7 @@ void starpu_task_ft_prologue(void *arg)
 
 	/* Create a task which will do the actual computation */
 	new_task = starpu_task_ft_create_retry
-			(meta_task, meta_task, check_ft);
+(meta_task, meta_task, check_ft);
 
 	ret = starpu_task_submit_nodeps(new_task);
 	STARPU_ASSERT(!ret);

+ 7 - 4
src/core/topology.c

@@ -1924,12 +1924,15 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 			 (previous >= 0 && previous == workerid) ||
 			 (name && cpu_name[cpuid] && !strcmp(name, cpu_name[cpuid])) ) )
 		{
+			char hostname[65];
+			_starpu_gethostname(hostname, sizeof(hostname));
+
 			if (previous == STARPU_ACTIVETHREAD)
-				_STARPU_DISP("Warning: active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+				_STARPU_DISP("[%s] Warning: active thread %s was already bound to PU %d\n", hostname, cpu_name[cpuid], cpuid);
 			else if (previous == STARPU_NONACTIVETHREAD)
-				_STARPU_DISP("Warning: non-active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+				_STARPU_DISP("[%s] Warning: non-active thread %s was already bound to PU %d\n", hostname, cpu_name[cpuid], cpuid);
 			else
-				_STARPU_DISP("Warning: worker %d was already bound to PU %d\n", previous, cpuid);
+				_STARPU_DISP("[%s] Warning: worker %d was already bound to PU %d\n", hostname, previous, cpuid);
 
 			if (workerid == STARPU_ACTIVETHREAD)
 				_STARPU_DISP("and we were told to also bind active thread %s to it.\n", name);
@@ -1942,7 +1945,7 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 
 			if (workerid >= 0)
 				/* This shouldn't happen for workers */
-				_STARPU_DISP("Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", config->topology.nhwcpus, config->topology.nhwpus);
+				_STARPU_DISP("[%s] Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", hostname, config->topology.nhwcpus, config->topology.nhwpus);
 			ret = -1;
 		}
 		else