7 years ago · 9b411391e9
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2011-2013,2015,2017                      Inria
			
 
				  * Copyright (C) 2010-2018                                CNRS
			
 
				- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -26,6 +26,26 @@ performance, we give below a list of features which should be checked.
 
				 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
			
 
				 will show roughly where time is spent, and focus correspondingly.
			
 
				 
			
 
				+\section CheckTaskSize Check Task Size
			
 
				+
			
 
				+Make sure that your tasks are not too small, because the StarPU runtime overhead
			
 
				+is not completely zero. You can run the tasks_size_overhead.sh script to get an
			
 
				+idea of the scalability of tasks depending on their duration (in µs), on your
			
 
				+own system.
			
 
				+
			
 
				+Typically, 10µs-ish tasks are definitely too small: the CUDA overhead itself is
			
 
				+much bigger than this.
			
 
				+
			
 
				+1ms-ish tasks may be a good start, but will not necessarily scale to many dozens
			
 
				+of cores, so it's better to try to get 10ms-ish tasks.
			
 
				+
			
 
				+Task durations can easily be observed when performance models are defined (see
			
 
				+\ref PerformanceModelExample) by using the <c>starpu_perfmodel_plot</c> or
			
 
				+<c>starpu_perfmodel_display</c> tool (see \ref PerformanceOfCodelets).
			
 
				+
			
 
				+When using parallel tasks, the problem is even worse since StarPU has to
			
 
				+synchronize the execution of tasks.
			
 
				+
			
 
				 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
			
 
				 
			
 
				 The \ref enable-fast "--enable-fast" configuration option disables all
			
@@ -116,6 +136,16 @@ enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
 
				 number of kernels to execute concurrently.  This is useful when kernels are
			
 
				 small and do not feed the whole GPU with threads to run.
			
 
				 
			
 
				+Concerning memory allocation, you should really not use cudaMalloc/cudaFree
			
 
				+within the kernel, since cudaFree introduces an awful lot of synchronizations
			
 
				+within CUDA itself. You should instead add a parameter to the codelet with the
			
 
				+STARPU_SCRATCH mode access. You can then pass to the task a handle registered
			
 
				+with the desired size but with the NULL pointer; that handle can even be
			
 
				+shared between tasks: StarPU will allocate per-task data on the fly before task
			
 
				+execution, and reuse the allocated data between tasks.
			
 
				+
			
 
				+See <c>examples/pi/pi_redux.c</c> for an example of use.
			
 
				+
			
 
				 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
			
 
				 
			
 
				 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2013,2015-2017                      Inria
			
 
				- * Copyright (C) 2010-2017                                CNRS
			
 
				+ * Copyright (C) 2010-2018                                CNRS
			
 
				  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
			
 
				  * Copyright (C) 2016                                     Uppsala University
			
 
				  *
			
@@ -1173,6 +1173,14 @@ If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the
 
				 discovered by StarPU.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_IDLE_FILE</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_IDLE_FILE
			
 
				+\addindex __env__STARPU_IDLE_FILE
			
 
				+If the environment variable STARPU_IDLE_FILE is defined, a file named after its contents will be created at the end of the execution.
			
 
				+The file will contain the sum of the idle times of all the workers.
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 \section ConfiguringTheHypervisor Configuring The Hypervisor
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -620,7 +620,7 @@ do {									\
 
				 		}							\
			
 
				 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
			
 
				 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
			
 
				-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000, (job)->task->tag_id, workerid, ((job)->job_id)); \
			
 
				+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
			
 
				 	}								\
			
 
				 } while(0);
			
 
				 
			
--- a/src/common/prio_list.h
+++ b/src/common/prio_list.h
@@ -167,7 +167,12 @@
 
				 	{ \
			
 
				 		/* Sort by decreasing order */ \
			
 
				 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
			
 
				-		return (e2->prio - prio); \
			
 
				+		if (e2->prio < prio) \
			
 
				+			return -1; \
			
 
				+		if (e2->prio == prio) \
			
 
				+			return 0; \
			
 
				+		/* e2->prio > prio */ \
			
 
				+		return 1; \
			
 
				 	} \
			
 
				 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
			
 
				 	{ \
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -1765,7 +1765,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 
			
 
				 					unsigned n = entry->nsample;
			
 
				 					entry->mean = entry->sum / n;
			
 
				-					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum))/n)/n);
			
 
				+					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum)/n))/n);
			
 
				 				}
			
 
				 
			
 
				 				if (j->task->flops != 0.)
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -397,6 +397,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
				 			struct starpu_task *alias = starpu_task_dup(task);
			
 
				 			alias->destroy = 1;
			
 
				 
			
 
				+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
			
 
				 			worker = _starpu_get_worker_struct(combined_workerid[j]);
			
 
				 			ret |= _starpu_push_local_task(worker, alias, 0);
			
 
				 		}
			
@@ -581,6 +582,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 					if (job->task_size > 1)
			
 
				 					{
			
 
				 						alias = starpu_task_dup(task);
			
 
				+						_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
			
 
				 						alias->destroy = 1;
			
 
				 					}
			
 
				 					else
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -107,12 +107,22 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
				 			/* rebind to single CPU */
			
 
				 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
			
 
				 	}
			
 
				+	else
			
 
				+	{
			
 
				+		_STARPU_TRACE_START_EXECUTING();
			
 
				+	}
			
 
				+
			
 
				+	if (is_parallel_task)
			
 
				+	{
			
 
				+		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
			
 
				+		if (rank != 0)
			
 
				+			_STARPU_TRACE_END_EXECUTING();
			
 
				+	}
			
 
				 
			
 
				 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
			
 
				 
			
 
				 	if (is_parallel_task)
			
 
				 	{
			
 
				-		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 		if (rank == 0)
			
 
				 		{
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -62,6 +62,7 @@ static nvmlDevice_t nvmlDev[STARPU_MAXCUDADEVS];
 
				 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static cudaStream_t streams[STARPU_NMAXWORKERS];
			
 
				+static char used_stream[STARPU_NMAXWORKERS];
			
 
				 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				 /* Note: streams are not thread-safe, so we define them for each CUDA worker
			
@@ -224,6 +225,7 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 
				 {
			
 
				 	int worker = starpu_worker_get_id_check();
			
 
				 
			
 
				+	used_stream[worker] = 1;
			
 
				 	return streams[worker];
			
 
				 }
			
 
				 
			
@@ -613,6 +615,14 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				+	if (!used_stream[workerid])
			
 
				+	{
			
 
				+		used_stream[workerid] = 1;
			
 
				+		_STARPU_DISP("Warning: starpu_cuda_get_local_stream() was not used to submit kernel to CUDA on worker %d. CUDA will thus introduce a lot of useless synchronizations, which will prevent proper overlapping of data transfers and kernel execution. See the CUDA-specific part of the 'Check List When Performance Are Not There' of the StarPU handbook\n", workerid);
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
			
 
				 	{
			
 
				 		if (worker->pipeline_length == 0)
			
--- a/src/sched_policies/component_worker.c
+++ b/src/sched_policies/component_worker.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2011-2014,2017                           Inria
			
 
				  * Copyright (C) 2010-2012,2014-2017                      CNRS
			
 
				- * Copyright (C) 2010-2017                                Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2018                                Université de Bordeaux
			
 
				  * Copyright (C) 2011                                     Télécom-SudParis
			
 
				  * Copyright (C) 2013                                     Simon Archipoff
			
 
				  *
			
@@ -631,6 +631,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 
				 	task_alias[0]->task->destroy = 1;
			
 
				 	task_alias[0]->left = NULL;
			
 
				 	task_alias[0]->ntasks = combined_worker->worker_size;
			
 
				+	_STARPU_TRACE_JOB_PUSH(task_alias[0]->task, task_alias[0]->task->priority > 0);
			
 
				 	int i;
			
 
				 	for(i = 1; i < combined_worker->worker_size; i++)
			
 
				 	{
			
@@ -641,6 +642,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 
				 		task_alias[i]->left = task_alias[i-1];
			
 
				 		task_alias[i - 1]->right = task_alias[i];
			
 
				 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
			
 
				+		_STARPU_TRACE_JOB_PUSH(task_alias[i]->task, task_alias[i]->task->priority > 0);
			
 
				 	}
			
 
				 
			
 
				 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;
			
--- a/src/sched_policies/parallel_eager.c
+++ b/src/sched_policies/parallel_eager.c
@@ -342,6 +342,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
				 		struct starpu_task *alias = starpu_task_dup(task);
			
 
				 		int local_worker = combined_workerid[i];
			
 
				 		alias->destroy = 1;
			
 
				+		_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
			
 
				 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
			
 
				 	}
			
 
				 
			
@@ -352,6 +353,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
			
 
				 
			
 
				+	_STARPU_TRACE_JOB_PUSH(master_alias, master_alias->priority > 0);
			
 
				+
			
 
				 	for (i = 1; i < worker_size; i++)
			
 
				 	{
			
 
				 		int local_worker = combined_workerid[i];
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2013,2015,2017                      Inria
			
 
				- * Copyright (C) 2010-2017                                Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2018                                Université de Bordeaux
			
 
				  * Copyright (C) 2011-2017                                CNRS
			
 
				  * Copyright (C) 2011                                     Télécom-SudParis
			
 
				  *
			
@@ -175,6 +175,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 			ntasks[local_combined_workerid]++;
			
 
				 			_starpu_worker_unlock(local_combined_workerid);
			
 
				 
			
 
				+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
			
 
				 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
			
 
				 		}