11 年之前 · 8f26d26ad7
--- a/ChangeLog
+++ b/ChangeLog
@@ -44,6 +44,8 @@ New features:
 
				   * New functions starpu_pause() and starpu_resume()
			
 
				   * New codelet specific_nodes field to specify explicit target nodes for data.
			
 
				   * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
			
 
				+  * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
			
 
				+    CUDA and OpenCL kernel execution.
			
 
				 
			
 
				 Small features:
			
 
				   * New functions starpu_data_acquire_cb_sequential_consistency() and
			
--- a/doc/doxygen/chapters/05check_list_performance.doxy
+++ b/doc/doxygen/chapters/05check_list_performance.doxy
@@ -54,9 +54,30 @@ cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
				 
			
 
				 StarPU already does appropriate calls for the CUBLAS library.
			
 
				 
			
 
				+If the kernel can be made to only use this local stream or other self-allocated
			
 
				+streams, i.e. the whole kernel submission can be made asynchronous, then
			
 
				+one should enable asynchronous execution of the kernel. This means setting
			
 
				+the corresponding cuda_flags[] flag in the codelet and dropping the
			
 
				+cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
			
 
				+able to pipeline submitting tasks to GPUs, instead of synchronizing at each
			
 
				+kernel submission. The kernel just has to make sure that StarPU can use the
			
 
				+local stream to synchronize with the kernel startup and completion.
			
 
				+
			
 
				 Unfortunately, some CUDA libraries do not have stream variants of
			
 
				 kernels. That will lower the potential for overlapping.
			
 
				 
			
 
				+\section OpenCL-specificOptimizations OpenCL-specific Optimizations
			
 
				+
			
 
				+If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
			
 
				+command queues, i.e. the whole kernel submission can be made asynchronous, then
			
 
				+one should enable asynchronous execution of the kernel. This means setting
			
 
				+the corresponding opencl_flags[] flag in the codelet and dropping the
			
 
				+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
			
 
				+That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
			
 
				+synchronizing at each kernel submission. The kernel just has to make sure
			
 
				+that StarPU can use the command queue it has provided to synchronize with the
			
 
				+kernel startup and completion.
			
 
				+
			
 
				 \section DetectionStuckConditions Detection Stuck Conditions
			
 
				 
			
 
				 It may happen that for some reason, StarPU does not make progress for a long
			
--- a/doc/doxygen/chapters/api/codelet_and_tasks.doxy
+++ b/doc/doxygen/chapters/api/codelet_and_tasks.doxy
@@ -223,6 +223,10 @@ If the field starpu_codelet::where is set, then the field
 
				 starpu_codelet::cuda_funcs is ignored if ::STARPU_CUDA does not appear
			
 
				 in the field starpu_codelet::where, it must be non-null otherwise.
			
 
				 
			
 
				+\var starpu_codelet::cuda_flags
			
 
				+Optional array of flags for CUDA execution. They specify some semantic details
			
 
				+about CUDA kernel execution, such as asynchronous execution (STARPU_CUDA_ASYNC).
			
 
				+
			
 
				 \var starpu_codelet::opencl_funcs
			
 
				 Optional array of function pointers to the OpenCL implementations of
			
 
				 the codelet. It must be terminated by a NULL value. The functions
			
@@ -235,6 +239,10 @@ starpu_codelet::opencl_funcs is ignored if ::STARPU_OPENCL does not
 
				 appear in the field starpu_codelet::where, it must be non-null
			
 
				 otherwise.
			
 
				 
			
 
				+\var starpu_codelet::opencl_flags
			
 
				+Optional array of flags for OpenCL execution. They specify some semantic details
			
 
				+about OpenCL kernel execution, such as asynchronous execution (STARPU_OPENCL_ASYNC).
			
 
				+
			
 
				 \var starpu_codelet::mic_funcs
			
 
				 Optional array of function pointers to a function which returns the
			
 
				 MIC implementation of the codelet. It must be terminated by a NULL
			
--- a/examples/basic_examples/vector_scal.c
+++ b/examples/basic_examples/vector_scal.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2014  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -83,6 +83,7 @@ static struct starpu_codelet cl =
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	/* CUDA implementation of the codelet */
			
 
				 	.cuda_funcs = {scal_cuda_func, NULL},
			
 
				+	.cuda_flags = {STARPU_CUDA_ASYNC},
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	/* OpenCL implementation of the codelet */
			
--- a/examples/basic_examples/vector_scal_cuda.cu
+++ b/examples/basic_examples/vector_scal_cuda.cu
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -42,6 +42,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
			
 
				-
			
 
				-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2011, 2013-2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -69,9 +69,11 @@ int main(int argc, char **argv)
 
				 		.cpu_funcs_name = {"cpu_codelet", NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		.cuda_funcs = {cuda_codelet, NULL},
			
 
				+		.cuda_flags = {STARPU_CUDA_ASYNC},
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 		.opencl_funcs = {opencl_codelet, NULL},
			
 
				+		.opencl_flags = {STARPU_OPENCL_ASYNC},
			
 
				 #endif
			
 
				 		.nbuffers = 1,
			
 
				 		.modes = {STARPU_RW},
			
--- a/examples/incrementer/incrementer_kernels.cu
+++ b/examples/incrementer/incrementer_kernels.cu
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -31,5 +31,4 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
				 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/incrementer/incrementer_kernels_opencl.c
+++ b/examples/incrementer/incrementer_kernels_opencl.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -47,11 +47,7 @@ void opencl_codelet(void *descr[], void *_args)
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-	}
			
 
				-
			
 
				-	clFinish(queue);
			
 
				-	starpu_opencl_collect_stats(event);
			
 
				-	clReleaseEvent(event);
			
 
				 
			
 
				-	starpu_opencl_release_kernel(kernel);
			
 
				+		starpu_opencl_release_kernel(kernel);
			
 
				+	}
			
 
				 }
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -41,6 +41,9 @@ extern "C"
 
				 #define STARPU_MIC	((1ULL)<<7)
			
 
				 #define STARPU_SCC	((1ULL)<<8)
			
 
				 
			
 
				+#define STARPU_CUDA_ASYNC	(1<<0)
			
 
				+#define STARPU_OPENCL_ASYNC	(1<<0)
			
 
				+
			
 
				 enum starpu_codelet_type
			
 
				 {
			
 
				 	STARPU_SEQ,
			
@@ -90,7 +93,9 @@ struct starpu_codelet
 
				 
			
 
				 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	char cuda_flags[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -367,6 +367,8 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
				 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
			
 
				 #else
			
 
				 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+		if (cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
			
 
				+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 #endif
			
 
				 	}
			
 
				 
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -780,6 +780,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 		double length = NAN;
			
 
				 	  #ifdef STARPU_OPENCL_SIMULATOR
			
 
				 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
			
 
				+		{
			
 
				+			cl_command_queue queue;
			
 
				+			starpu_opencl_get_queue(args->devid, &queue);
			
 
				+			clFinish(queue);
			
 
				+		}
			
 
				 	    #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
			
 
				 	      #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
			
 
				 		#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
			
@@ -794,6 +800,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
			
 
				 #else
			
 
				 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
			
 
				+		{
			
 
				+			cl_command_queue queue;
			
 
				+			starpu_opencl_get_queue(args->devid, &queue);
			
 
				+			clFinish(queue);
			
 
				+		}
			
 
				 #endif
			
 
				 	}