
Introduce the STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags. No optimization is done yet.

Samuel Thibault committed 11 years ago
parent
commit 8f26d26ad7

+ 2 - 0
ChangeLog

@@ -44,6 +44,8 @@ New features:
   * New functions starpu_pause() and starpu_resume()
   * New codelet specific_nodes field to specify explicit target nodes for data.
   * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+  * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
+    CUDA and OpenCL kernel execution.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 21 - 0
doc/doxygen/chapters/05check_list_performance.doxy

@@ -54,9 +54,30 @@ cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 StarPU already does appropriate calls for the CUBLAS library.
 
+If the kernel can be made to only use this local stream or other self-allocated
+streams, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding cuda_flags[] flag in the codelet and dropping the
+cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
+able to pipeline submitting tasks to GPUs, instead of synchronizing at each
+kernel submission. The kernel just has to make sure that StarPU can use the
+local stream to synchronize with the kernel startup and completion.
+
 Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 
+\section OpenCL-specificOptimizations OpenCL-specific Optimizations
+
+If the kernel can be made to only use the StarPU-provided command queue or other
+self-allocated command queues, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding opencl_flags[] flag in the codelet and dropping the
+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
+That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
+synchronizing at each kernel submission. The kernel just has to make sure
+that StarPU can use the command queue it has provided to synchronize with the
+kernel startup and completion.
+
 \section DetectionStuckConditions Detection Stuck Conditions
 
 It may happen that for some reason, StarPU does not make progress for a long

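Similarly, for the OpenCL part, here is a minimal sketch of an asynchronous OpenCL implementation (again not part of this commit; it is modeled on the incrementer example below, whose "incrementer" kernel and opencl_program are assumed here). It enqueues on the queue returned by starpu_opencl_load_kernel() and returns without calling clFinish():

#include <starpu.h>
#include <starpu_opencl.h>

extern struct starpu_opencl_program opencl_program;	/* assumed loaded with starpu_opencl_load_opencl_from_file() */

void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_int err;
	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	int devid = starpu_worker_get_devid(starpu_worker_get_id());
	size_t global = 1, local = 1;

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "incrementer", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	/* Enqueue on the StarPU-provided queue and return without clFinish():
	 * with STARPU_OPENCL_ASYNC set in opencl_flags[], StarPU waits on
	 * this queue itself. */
	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	starpu_opencl_release_kernel(kernel);
}

The codelet then sets .opencl_flags = {STARPU_OPENCL_ASYNC} next to .opencl_funcs, as the incrementer.c hunk below does.
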
+ 8 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -223,6 +223,10 @@ If the field starpu_codelet::where is set, then the field
 starpu_codelet::cuda_funcs is ignored if ::STARPU_CUDA does not appear
 in the field starpu_codelet::where, it must be non-null otherwise.
 
+\var starpu_codelet::cuda_flags
+Optional array of flags for CUDA execution. They specify some semantic details
+about CUDA kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::opencl_funcs
 Optional array of function pointers to the OpenCL implementations of
 the codelet. It must be terminated by a NULL value. The functions
@@ -235,6 +239,10 @@ starpu_codelet::opencl_funcs is ignored if ::STARPU_OPENCL does not
 appear in the field starpu_codelet::where, it must be non-null
 otherwise.
 
+\var starpu_codelet::opencl_flags
+Optional array of flags for OpenCL execution. They specify some semantic details
+about OpenCL kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::mic_funcs
 Optional array of function pointers to a function which returns the
 MIC implementation of the codelet. It must be terminated by a NULL

+ 2 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -83,6 +83,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	/* OpenCL implementation of the codelet */

+ 1 - 3
examples/basic_examples/vector_scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/incrementer/incrementer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,9 +69,11 @@ int main(int argc, char **argv)
 		.cpu_funcs_name = {"cpu_codelet", NULL},
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_codelet, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 		.opencl_funcs = {opencl_codelet, NULL},
+		.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 		.nbuffers = 1,
 		.modes = {STARPU_RW},

+ 1 - 2
examples/incrementer/incrementer_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,5 +31,4 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 7
examples/incrementer/incrementer_kernels_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,11 +47,7 @@ void opencl_codelet(void *descr[], void *_args)
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
 
-	starpu_opencl_release_kernel(kernel);
+		starpu_opencl_release_kernel(kernel);
+	}
 }

+ 5 - 0
include/starpu_task.h

@@ -41,6 +41,9 @@ extern "C"
 #define STARPU_MIC	((1ULL)<<7)
 #define STARPU_SCC	((1ULL)<<8)
 
+#define STARPU_CUDA_ASYNC	(1<<0)
+#define STARPU_OPENCL_ASYNC	(1<<0)
+
 enum starpu_codelet_type
 {
 	STARPU_SEQ,
@@ -90,7 +93,9 @@ struct starpu_codelet
 
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char cuda_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
 

+ 2 - 0
src/drivers/cuda/driver_cuda.c

@@ -367,6 +367,8 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 	}
 

+ 12 - 0
src/drivers/opencl/driver_opencl.c

@@ -780,6 +780,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+		{
+			cl_command_queue queue;
+			starpu_opencl_get_queue(args->devid, &queue);
+			clFinish(queue);
+		}
 	    #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
 	      #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
 		#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -794,6 +800,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+		{
+			cl_command_queue queue;
+			starpu_opencl_get_queue(args->devid, &queue);
+			clFinish(queue);
+		}
 #endif
 	}