Browse Source

merge trunk

Nathalie Furmento 10 years ago
parent
commit
685fda907c

+ 4 - 0
ChangeLog

@@ -49,6 +49,10 @@ New features:
     schedulers have very interesting scalability properties.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
+  * Add STARPU_CUDA_PIPELINE and STARPU_OPENCL_PIPELINE to specify how
+    many asynchronous tasks are submitted in advance on CUDA and
+    OpenCL devices. Setting the value to 0 forces a synchronous
+    execution of all tasks.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
   * Add CUDA and OpenCL kernel submission pipelining, to overlap costs and allow

+ 4 - 2
doc/doxygen/chapters/40environment_variables.doxy

@@ -58,7 +58,8 @@ which will be concurrently running on the devices. The default value is 1.
 Specify how many asynchronous tasks are submitted in advance on CUDA
 devices. This for instance permits to overlap task management with the execution
 of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2.
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
 </dd>
 
 <dt>STARPU_NOPENCL</dt>
@@ -75,7 +76,8 @@ OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
 Specify how many asynchronous tasks are submitted in advance on OpenCL
 devices. This for instance permits to overlap task management with the execution
 of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2.
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
 </dd>
 
 <dt>STARPU_NMICDEVS</dt>

+ 18 - 9
src/drivers/cuda/driver_cuda.c

@@ -472,17 +472,26 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 #ifndef STARPU_SIMGRID
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	{
-		/* Record event to synchronize with task termination later */
-		cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
+		if (worker->pipeline_length == 0)
+		{
+			/* Forced synchronous execution */
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			finish_job_on_cuda(j, worker);
+		}
+		else
+		{
+			/* Record event to synchronize with task termination later */
+			cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
 #ifdef STARPU_USE_FXT
-		int k;
-		for (k = 0; k < (int) worker->set->nworkers; k++)
-			if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
-				break;
-		if (k == (int) worker->set->nworkers)
-			/* Everybody busy */
-			_STARPU_TRACE_START_EXECUTING()
+			int k;
+			for (k = 0; k < (int) worker->set->nworkers; k++)
+				if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
+					break;
+			if (k == (int) worker->set->nworkers)
+				/* Everybody busy */
+				_STARPU_TRACE_START_EXECUTING();
 #endif
+		}
 	}
 	else
 #else

+ 22 - 12
src/drivers/opencl/driver_opencl.c

@@ -903,18 +903,28 @@ static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_
 		int err;
 		cl_command_queue queue;
 		starpu_opencl_get_queue(worker->devid, &queue);
-		/* the function clEnqueueMarker is deprecated from
-		 * OpenCL version 1.2. We would like to use the new
-		 * function clEnqueueMarkerWithWaitList. We could do
-		 * it by checking its availability through our own
-		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
-		 * and the OpenCL macro CL_VERSION_1_2. However these
-		 * 2 macros detect the function availability in the
-		 * ICD and not in the device implementation.
-		 */
-		err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
-		_STARPU_TRACE_START_EXECUTING();
+
+		if (worker->pipeline_length == 0)
+		{
+			starpu_opencl_get_queue(worker->devid, &queue);
+			clFinish(queue);
+			_starpu_opencl_stop_job(j, worker);
+		}
+		else
+		{
+			/* the function clEnqueueMarker is deprecated from
+			 * OpenCL version 1.2. We would like to use the new
+			 * function clEnqueueMarkerWithWaitList. We could do
+			 * it by checking its availability through our own
+			 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
+			 * and the OpenCL macro CL_VERSION_1_2. However these
+			 * 2 macros detect the function availability in the
+			 * ICD and not in the device implementation.
+			 */
+			err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
+			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+			_STARPU_TRACE_START_EXECUTING();
+		}
 	}
 	else
 #else