|
@@ -3,7 +3,7 @@
|
|
@c This file is part of the StarPU Handbook.
|
|
@c This file is part of the StarPU Handbook.
|
|
@c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
|
|
@c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
|
|
@c Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
|
|
@c Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
|
|
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
|
|
|
|
|
|
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
|
|
@c See the file starpu.texi for copying conditions.
|
|
@c See the file starpu.texi for copying conditions.
|
|
|
|
|
|
@menu
|
|
@menu
|
|
@@ -232,10 +232,269 @@ we show how StarPU tasks can manipulate data. The full source code for
|
|
this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
|
|
this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
|
|
|
|
|
|
@menu
|
|
@menu
|
|
-* Source code of Vector Scaling::
|
|
|
|
-* Execution of Vector Scaling::
|
|
|
|
|
|
+* Extended C Source Code of Vector Scaling:: StarPU program using C extensions
|
|
|
|
+* Source code of Vector Scaling:: StarPU program written in standard C
|
|
|
|
+* Execution of Vector Scaling:: Running the program
|
|
@end menu
|
|
@end menu
|
|
|
|
|
|
|
|
+@node Extended C Source Code of Vector Scaling
|
|
|
|
+@subsection Extended C Source Code of Vector Scaling
|
|
|
|
+
|
|
|
|
+The simplest way to get started writing StarPU programs is using the C
|
|
|
|
+language extensions provided by the GCC plug-in (@pxref{C Extensions}).
|
|
|
|
+These extensions map directly to StarPU's main concepts: tasks, task
|
|
|
|
+implementations for CPU, OpenCL, or CUDA, and registered data buffers.
|
|
|
|
+
|
|
|
|
+The example below is a vector-scaling program, that multiplies elements
|
|
|
|
+of a vector by a given factor@footnote{The complete example, and
|
|
|
|
+additional examples, is available in the @file{gcc-plugin/examples}
|
|
|
|
+directory of the StarPU distribution.}. For comparison, the standard C
|
|
|
|
+version that uses StarPU's standard C programming interface is given in
|
|
|
|
+the next section (@pxref{Source code of Vector Scaling, standard C
|
|
|
|
+version of the example}).
|
|
|
|
+
|
|
|
|
+First of all, the vector-scaling task and its simple CPU implementation
|
|
|
|
+has to be defined:
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+/* Declare the `vector_scal' task. */
|
|
|
|
+
|
|
|
|
+static void vector_scal (size_t size, float vector[size],
|
|
|
|
+ float factor)
|
|
|
|
+ __attribute__ ((task));
|
|
|
|
+
|
|
|
|
+/* Declare and define the standard CPU implementation. */
|
|
|
|
+
|
|
|
|
+static void vector_scal_cpu (size_t size, float vector[size],
|
|
|
|
+ float factor)
|
|
|
|
+ __attribute__ ((task_implementation ("cpu", vector_scal)));
|
|
|
|
+
|
|
|
|
+static void
|
|
|
|
+vector_scal_cpu (size_t size, float vector[size], float factor)
|
|
|
|
+@{
|
|
|
|
+ size_t i;
|
|
|
|
+ for (i = 0; i < size; i++)
|
|
|
|
+ vector[i] *= factor;
|
|
|
|
+@}
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+Next, the body of the program, which uses the task defined above, can be
|
|
|
|
+implemented:
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+int
|
|
|
|
+main (void)
|
|
|
|
+@{
|
|
|
|
+#pragma starpu initialize
|
|
|
|
+
|
|
|
|
+#define NX 0x100000
|
|
|
|
+#define FACTOR 3.14
|
|
|
|
+
|
|
|
|
+ @{
|
|
|
|
+ float vector[NX] __attribute__ ((heap_allocated));
|
|
|
|
+
|
|
|
|
+#pragma starpu register vector
|
|
|
|
+
|
|
|
|
+ size_t i;
|
|
|
|
+ for (i = 0; i < NX; i++)
|
|
|
|
+ vector[i] = (float) i;
|
|
|
|
+
|
|
|
|
+ vector_scal (NX, vector, FACTOR);
|
|
|
|
+
|
|
|
|
+#pragma starpu wait
|
|
|
|
+ @} /* VECTOR is automatically freed here. */
|
|
|
|
+
|
|
|
|
+#pragma starpu shutdown
|
|
|
|
+
|
|
|
|
+ return valid ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
|
|
+@}
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+@noindent
|
|
|
|
+The @code{main} function above does several things:
|
|
|
|
+
|
|
|
|
+@itemize
|
|
|
|
+@item
|
|
|
|
+It initializes StarPU. This has to be done explicitly, as it is
|
|
|
|
+undesirable to add implicit initialization code in user code.
|
|
|
|
+
|
|
|
|
+@item
|
|
|
|
+It allocates @var{vector} in the heap; it will automatically be freed
|
|
|
|
+when its scope is left. Alternatively, good old @code{malloc} and
|
|
|
|
+@code{free} could have been used, but they are more error-prone and
|
|
|
|
+require more typing.
|
|
|
|
+
|
|
|
|
+@item
|
|
|
|
+It @dfn{registers} the memory pointed to by @var{vector}. Eventually,
|
|
|
|
+when OpenCL or CUDA task implementations are added, this will allow
|
|
|
|
+StarPU to transfer that memory region between GPUs and the main memory.
|
|
|
|
+Removing this @code{pragma} is an error.
|
|
|
|
+
|
|
|
|
+@item
|
|
|
|
+It invokes the @code{vector_scal} task. The invocation looks the same
|
|
|
|
+as a standard C function call. However, it is an @dfn{asynchronous
|
|
|
|
+invocation}, meaning that the actual call is performed in parallel with
|
|
|
|
+the caller's continuation.
|
|
|
|
+
|
|
|
|
+@item
|
|
|
|
+It @dfn{waits} for the termination of the @code{vector_scal}
|
|
|
|
+asynchronous call.
|
|
|
|
+
|
|
|
|
+@item
|
|
|
|
+Finally, StarPU is shut down, giving it an opportunity to write
|
|
|
|
+profiling info to a file on disk, for instance (@pxref{Off-line,
|
|
|
|
+off-line performance feedback}).
|
|
|
|
+
|
|
|
|
+@end itemize
|
|
|
|
+
|
|
|
|
+The program can be compiled and linked with GCC and the @code{-fplugin}
|
|
|
|
+flag:
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+$ gcc hello-starpu.c \
|
|
|
|
+ -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
|
|
|
|
+ `pkg-config starpu-1.0 --libs`
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+And voil@`a!
|
|
|
|
+
|
|
|
|
+@unnumberedsubsubsec Adding an OpenCL Task Implementation
|
|
|
|
+
|
|
|
|
+Now, this is all fine and great, but you certainly want to take
|
|
|
|
+advantage of these newfangled GPUs that your lab just bought, don't you?
|
|
|
|
+
|
|
|
|
+So, let's add an OpenCL implementation of the @code{vector_scal} task.
|
|
|
|
+We assume that the OpenCL kernel is available in a file,
|
|
|
|
+@file{vector_scal_opencl_kernel.cl}, not shown here. The OpenCL task
|
|
|
|
+implementation is similar to that used with the standard C API
|
|
|
|
+(@pxref{Definition of the OpenCL Kernel}). It is declared and defined
|
|
|
|
+in our C file like this:
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+/* Include StarPU's OpenCL integration. */
|
|
|
|
+#include <starpu_opencl.h>
|
|
|
|
+
|
|
|
|
+/* The OpenCL programs, loaded from `main' (see below). */
|
|
|
|
+static struct starpu_opencl_program cl_programs;
|
|
|
|
+
|
|
|
|
+static void vector_scal_opencl (size_t size, float vector[size],
|
|
|
|
+ float factor)
|
|
|
|
+ __attribute__ ((task_implementation ("opencl", vector_scal)));
|
|
|
|
+
|
|
|
|
+static void
|
|
|
|
+vector_scal_opencl (size_t size, float vector[size], float factor)
|
|
|
|
+@{
|
|
|
|
+ int id, devid, err;
|
|
|
|
+ cl_kernel kernel;
|
|
|
|
+ cl_command_queue queue;
|
|
|
|
+ cl_event event;
|
|
|
|
+
|
|
|
|
+ /* VECTOR is GPU memory pointer, not a main memory pointer. */
|
|
|
|
+ cl_mem val = (cl_mem) vector;
|
|
|
|
+
|
|
|
|
+ id = starpu_worker_get_id ();
|
|
|
|
+ devid = starpu_worker_get_devid (id);
|
|
|
|
+
|
|
|
|
+ /* Prepare to invoke the kernel. In the future, this will be largely
|
|
|
|
+ automated. */
|
|
|
|
+ err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
|
|
|
|
+ "vector_mult_opencl", devid);
|
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
|
+ STARPU_OPENCL_REPORT_ERROR (err);
|
|
|
|
+
|
|
|
|
+ err = clSetKernelArg (kernel, 0, sizeof (val), &val);
|
|
|
|
+ err |= clSetKernelArg (kernel, 1, sizeof (size), &size);
|
|
|
|
+ err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
|
|
|
|
+ if (err)
|
|
|
|
+ STARPU_OPENCL_REPORT_ERROR (err);
|
|
|
|
+
|
|
|
|
+ size_t global = 1, local = 1;
|
|
|
|
+ err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,
|
|
|
|
+ NULL, &event);
|
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
|
+ STARPU_OPENCL_REPORT_ERROR (err);
|
|
|
|
+
|
|
|
|
+ clFinish (queue);
|
|
|
|
+ starpu_opencl_collect_stats (event);
|
|
|
|
+ clReleaseEvent (event);
|
|
|
|
+
|
|
|
|
+ /* Done with KERNEL. */
|
|
|
|
+ starpu_opencl_release_kernel (kernel);
|
|
|
|
+@}
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+@noindent
|
|
|
|
+The OpenCL kernel itself must be loaded from @code{main}, sometime after
|
|
|
|
+the @code{initialize} pragma:
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+ starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
|
|
|
|
+ &cl_programs, "");
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+@noindent
|
|
|
|
+And that's it. The @code{vector_scal} task now has an additional
|
|
|
|
+implementation, for OpenCL, which StarPU's scheduler may choose to use
|
|
|
|
+at run-time. Unfortunately, the @code{vector_scal_opencl} above still
|
|
|
|
+has to go through the common OpenCL boilerplate; in the future,
|
|
|
|
+additional extensions will automate most of it.
|
|
|
|
+
|
|
|
|
+@unnumberedsubsubsec Adding a CUDA Task Implementation
|
|
|
|
+
|
|
|
|
+Adding a CUDA implementation of the task is very similar, except that
|
|
|
|
+the implementation itself is typically written in CUDA, and compiled
|
|
|
|
+with @code{nvcc}. Thus, the C file only needs to contain an external
|
|
|
|
+declaration for the task implementation:
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+extern void vector_scal_cuda (size_t size, float vector[size],
|
|
|
|
+ float factor)
|
|
|
|
+ __attribute__ ((task_implementation ("cuda", vector_scal)));
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+The actual implementation of the CUDA task goes into a separate
|
|
|
|
+compilation unit, in a @file{.cu} file. It is very close to the
|
|
|
|
+implementation when using StarPU's standard C API (@pxref{Definition of
|
|
|
|
+the CUDA Kernel}).
|
|
|
|
+
|
|
|
|
+@example
|
|
|
|
+/* CUDA implementation of the `vector_scal' task, to be compiled
|
|
|
|
+ with `nvcc'. */
|
|
|
|
+
|
|
|
|
+#include <starpu.h>
|
|
|
|
+#include <starpu_cuda.h>
|
|
|
|
+#include <stdlib.h>
|
|
|
|
+
|
|
|
|
+static __global__ void
|
|
|
|
+vector_mult_cuda (float *val, unsigned n, float factor)
|
|
|
|
+@{
|
|
|
|
+ unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
+
|
|
|
|
+ if (i < n)
|
|
|
|
+ val[i] *= factor;
|
|
|
|
+@}
|
|
|
|
+
|
|
|
|
+/* Definition of the task implementation declared in the C file. */
|
|
|
|
+extern "C" void
|
|
|
|
+vector_scal_cuda (size_t size, float vector[], float factor)
|
|
|
|
+@{
|
|
|
|
+ unsigned threads_per_block = 64;
|
|
|
|
+ unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
|
|
|
|
+
|
|
|
|
+ vector_mult_cuda <<< nblocks, threads_per_block, 0,
|
|
|
|
+ starpu_cuda_get_local_stream () >>> (vector, size, factor);
|
|
|
|
+
|
|
|
|
+ cudaStreamSynchronize (starpu_cuda_get_local_stream ());
|
|
|
|
+@}
|
|
|
|
+@end example
|
|
|
|
+
|
|
|
|
+The complete source code, in the @file{gcc-plugin/examples/vector_scal}
|
|
|
|
+directory of the StarPU distribution, also shows how an SSE-specialized
|
|
|
|
+CPU task implementation can be added.
|
|
|
|
+
|
|
|
|
+For more details on the C extensions provided by StarPU's GCC plug-in,
|
|
|
|
+@xref{C Extensions}.
|
|
|
|
+
|
|
@node Source code of Vector Scaling
|
|
@node Source code of Vector Scaling
|
|
@subsection Source code of Vector Scaling
|
|
@subsection Source code of Vector Scaling
|
|
|
|
|