
Introduce the STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags. No optimization is done yet.

Samuel Thibault committed 11 years ago
parent
commit 8f26d26ad7

+ 2 - 0
ChangeLog

@@ -44,6 +44,8 @@ New features:
   * New functions starpu_pause() and starpu_resume()
   * New codelet specific_nodes field to specify explicit target nodes for data.
   * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+  * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
+    CUDA and OpenCL kernel execution.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 21 - 0
doc/doxygen/chapters/05check_list_performance.doxy

@@ -54,9 +54,30 @@ cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 StarPU already does appropriate calls for the CUBLAS library.
 
+If the kernel can be made to only use this local stream or other self-allocated
+streams, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding cuda_flags[] flag in the codelet and dropping the
+cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
+able to pipeline submitting tasks to GPUs, instead of synchronizing at each
+kernel submission. The kernel just has to make sure that StarPU can use the
+local stream to synchronize with the kernel startup and completion.
+
 Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 
+\section OpenCL-specificOptimizations OpenCL-specific Optimizations
+
+If the kernel can be made to only use the StarPU-provided command queue or other
+self-allocated command queues, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding opencl_flags[] flag in the codelet and dropping the
+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
+That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
+synchronizing at each kernel submission. The kernel just has to make sure
+that StarPU can use the command queue it has provided to synchronize with the
+kernel startup and completion.
+
 \section DetectionStuckConditions Detection Stuck Conditions
 
 It may happen that for some reason, StarPU does not make progress for a long

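Similarly, for the OpenCL part, here is a minimal sketch of an asynchronous OpenCL implementation (again not part of this commit; it is modeled on the incrementer example below, whose "incrementer" kernel and opencl_program are assumed here). It enqueues on the queue returned by starpu_opencl_load_kernel() and returns without calling clFinish():

#include <starpu.h>
#include <starpu_opencl.h>

extern struct starpu_opencl_program opencl_program;	/* assumed loaded with starpu_opencl_load_opencl_from_file() */

void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_int err;
	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	int devid = starpu_worker_get_devid(starpu_worker_get_id());
	size_t global = 1, local = 1;

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "incrementer", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	/* Enqueue on the StarPU-provided queue and return without clFinish():
	 * with STARPU_OPENCL_ASYNC set in opencl_flags[], StarPU waits on
	 * this queue itself. */
	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	starpu_opencl_release_kernel(kernel);
}

The codelet then sets .opencl_flags = {STARPU_OPENCL_ASYNC} next to .opencl_funcs, as the incrementer.c hunk below does.
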
+ 8 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -223,6 +223,10 @@ If the field starpu_codelet::where is set, then the field
 starpu_codelet::cuda_funcs is ignored if ::STARPU_CUDA does not appear
 in the field starpu_codelet::where, it must be non-null otherwise.
 
+\var starpu_codelet::cuda_flags
+Optional array of flags for CUDA execution. They specify some semantic details
+about CUDA kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::opencl_funcs
 Optional array of function pointers to the OpenCL implementations of
 the codelet. It must be terminated by a NULL value. The functions
@@ -235,6 +239,10 @@ starpu_codelet::opencl_funcs is ignored if ::STARPU_OPENCL does not
 appear in the field starpu_codelet::where, it must be non-null
 otherwise.
 
+\var starpu_codelet::opencl_flags
+Optional array of flags for OpenCL execution. They specify some semantic details
+about OpenCL kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::mic_funcs
 Optional array of function pointers to a function which returns the
 MIC implementation of the codelet. It must be terminated by a NULL

+ 2 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -83,6 +83,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	/* OpenCL implementation of the codelet */

+ 1 - 3
examples/basic_examples/vector_scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/incrementer/incrementer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,9 +69,11 @@ int main(int argc, char **argv)
 		.cpu_funcs_name = {"cpu_codelet", NULL},
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_codelet, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 		.opencl_funcs = {opencl_codelet, NULL},
+		.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 		.nbuffers = 1,
 		.modes = {STARPU_RW},

+ 1 - 2
examples/incrementer/incrementer_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,5 +31,4 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 7
examples/incrementer/incrementer_kernels_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,11 +47,7 @@ void opencl_codelet(void *descr[], void *_args)
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
 
-	starpu_opencl_release_kernel(kernel);
+		starpu_opencl_release_kernel(kernel);
+	}
 }

+ 5 - 0
include/starpu_task.h

@@ -41,6 +41,9 @@ extern "C"
 #define STARPU_MIC	((1ULL)<<7)
 #define STARPU_SCC	((1ULL)<<8)
 
+#define STARPU_CUDA_ASYNC	(1<<0)
+#define STARPU_OPENCL_ASYNC	(1<<0)
+
 enum starpu_codelet_type
 {
 	STARPU_SEQ,
@@ -90,7 +93,9 @@ struct starpu_codelet
 
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char cuda_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
 

+ 2 - 0
src/drivers/cuda/driver_cuda.c

@@ -367,6 +367,8 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 	}
 

+ 12 - 0
src/drivers/opencl/driver_opencl.c

@@ -780,6 +780,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+		{
+			cl_command_queue queue;
+			starpu_opencl_get_queue(args->devid, &queue);
+			clFinish(queue);
+		}
 	    #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
 	      #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
 		#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -794,6 +800,12 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+		{
+			cl_command_queue queue;
+			starpu_opencl_get_queue(args->devid, &queue);
+			clFinish(queue);
+		}
 #endif
 	}