Преглед на файлове

Introduce the STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags. No optimization is done yet for now.

Samuel Thibault преди 11 години
родител
ревизия
8f26d26ad7

+ 2 - 0
ChangeLog

@@ -44,6 +44,8 @@ New features:
   * New functions starpu_pause() and starpu_resume()
   * New codelet specific_nodes field to specify explicit target nodes for data.
   * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+  * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
+    CUDA and OpenCL kernel execution.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 21 - 0
doc/doxygen/chapters/05check_list_performance.doxy

@@ -54,9 +54,30 @@ cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 StarPU already does appropriate calls for the CUBLAS library.
 
+If the kernel can be made to only use this local stream or other self-allocated
+streams, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding cuda_flags[] flag in the codelet and dropping the
+cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
+able to pipeline submitting tasks to GPUs, instead of synchronizing at each
+kernel submission. The kernel just has to make sure that StarPU can use the
+local stream to synchronize with the kernel startup and completion.
+
 Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 
+\section OpenCL-specificOptimizations OpenCL-specific Optimizations
+
+If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
+streams, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding opencl_flags[] flag in the codelet and dropping the
+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
+That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
+synchronizing at each kernel submission. The kernel just has to make sure
+that StarPU can use the command queue it has provided to synchronize with the
+kernel startup and completion.
+
 \section DetectionStuckConditions Detection Stuck Conditions
 
 It may happen that for some reason, StarPU does not make progress for a long

+ 8 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -223,6 +223,10 @@ If the field starpu_codelet::where is set, then the field
 starpu_codelet::cuda_funcs is ignored if ::STARPU_CUDA does not appear
 in the field starpu_codelet::where, it must be non-null otherwise.
 
+\var starpu_codelet::cuda_flags
+Optional array of flags for CUDA execution. They specify some semantic details
+about CUDA kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::opencl_funcs
 Optional array of function pointers to the OpenCL implementations of
 the codelet. It must be terminated by a NULL value. The functions
@@ -235,6 +239,10 @@ starpu_codelet::opencl_funcs is ignored if ::STARPU_OPENCL does not
 appear in the field starpu_codelet::where, it must be non-null
 otherwise.
 
+\var starpu_codelet::opencl_flags
+Optional array of flags for OpenCL execution. They specify some semantic details
+about OpenCL kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::mic_funcs
 Optional array of function pointers to a function which returns the
 MIC implementation of the codelet. It must be terminated by a NULL

+ 2 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -83,6 +83,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	/* OpenCL implementation of the codelet */

+ 1 - 3
examples/basic_examples/vector_scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/incrementer/incrementer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,9 +69,11 @@ int main(int argc, char **argv)
 		.cpu_funcs_name = {"cpu_codelet", NULL},
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_codelet, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 		.opencl_funcs = {opencl_codelet, NULL},
+		.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 		.nbuffers = 1,
 		.modes = {STARPU_RW},

+ 1 - 2
examples/incrementer/incrementer_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,5 +31,4 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 7
examples/incrementer/incrementer_kernels_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,11 +47,7 @@ void opencl_codelet(void *descr[], void *_args)
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
 
-	starpu_opencl_release_kernel(kernel);
+		starpu_opencl_release_kernel(kernel);
+	}
 }

+ 5 - 0
include/starpu_task.h

@@ -41,6 +41,9 @@ extern "C"
 #define STARPU_MIC	((1ULL)<<7)
 #define STARPU_SCC	((1ULL)<<8)
 
+#define STARPU_CUDA_ASYNC	(1<<0)
+#define STARPU_OPENCL_ASYNC	(1<<0)
+
 enum starpu_codelet_type
 {
 	STARPU_SEQ,
@@ -90,7 +93,9 @@ struct starpu_codelet
 
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char cuda_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
 

+ 2 - 0
src/drivers/cuda/driver_cuda.c

@@ -367,6 +367,8 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 	}
 

+ 12 - 0
src/drivers/opencl/driver_opencl.c

@@ -780,6 +780,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+		{
+			cl_command_queue queue;
+			starpu_opencl_get_queue(args->devid, &queue);
+			clFinish(queue);
+		}
 	    #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
 	      #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
 		#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -794,6 +800,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+		{
+			cl_command_queue queue;
+			starpu_opencl_get_queue(args->devid, &queue);
+			clFinish(queue);
+		}
 #endif
 	}