13 years ago · 21ac454f23
--- a/doc/chapters/basic-examples.texi
+++ b/doc/chapters/basic-examples.texi
@@ -3,7 +3,7 @@
 
																 @c This file is part of the StarPU Handbook.
															
 
																 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
															
 
																 @c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
															
+
																+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
															
 
																 @c See the file starpu.texi for copying conditions.
															
 
																 @menu
															
@@ -232,10 +232,269 @@ we show how StarPU tasks can manipulate data. The full source code for
 
																 this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
															
 
																 @menu
															
 
																-* Source code of Vector Scaling::  
															
+
																+* Extended C Source Code of Vector Scaling::    StarPU program using C extensions
															
 
																-* Execution of Vector Scaling::  
															
+
																+* Source code of Vector Scaling::  StarPU program written in standard C
															
 
																+* Execution of Vector Scaling::    Running the program
															
 
																 @end menu
															
 
																+@node Extended C Source Code of Vector Scaling
															
 
																+@subsection Extended C Source Code of Vector Scaling
															
 
																+
															
 
																+The simplest way to get started writing StarPU programs is using the C
															
 
																+language extensions provided by the GCC plug-in (@pxref{C Extensions}).
															
 
																+These extensions map directly to StarPU's main concepts: tasks, task
															
 
																+implementations for CPU, OpenCL, or CUDA, and registered data buffers.
															
 
																+
															
 
																+The example below is a vector-scaling program, that multiplies elements
															
 
																+of a vector by a given factor@footnote{The complete example, and
															
 
																+additional examples, is available in the @file{gcc-plugin/examples}
															
 
																+directory of the StarPU distribution.}.  For comparison, the standard C
															
 
																+version that uses StarPU's standard C programming interface is given in
															
 
																+the next section (@pxref{Source code of Vector Scaling, standard C
															
 
																+version of the example}).
															
 
																+
															
 
																+First of all, the vector-scaling task and its simple CPU implementation
															
 
																+has to be defined:
															
 
																+
															
 
																+@example
															
 
																+/* Declare the `vector_scal' task.  */
															
 
																+
															
 
																+static void vector_scal (size_t size, float vector[size],
															
 
																+                         float factor)
															
 
																+  __attribute__ ((task));
															
 
																+
															
 
																+/* Declare and define the standard CPU implementation.  */
															
 
																+
															
 
																+static void vector_scal_cpu (size_t size, float vector[size],
															
 
																+                             float factor)
															
 
																+  __attribute__ ((task_implementation ("cpu", vector_scal)));
															
 
																+
															
 
																+static void
															
 
																+vector_scal_cpu (size_t size, float vector[size], float factor)
															
 
																+@{
															
 
																+  size_t i;
															
 
																+  for (i = 0; i < size; i++)
															
 
																+    vector[i] *= factor;
															
 
																+@}
															
 
																+@end example
															
 
																+
															
 
																+Next, the body of the program, which uses the task defined above, can be
															
 
																+implemented:
															
 
																+
															
 
																+@example
															
 
																+int
															
 
																+main (void)
															
 
																+@{
															
 
																+#pragma starpu initialize
															
 
																+
															
 
																+#define NX     0x100000
															
 
																+#define FACTOR 3.14
															
 
																+
															
 
																+  @{
															
 
																+    float vector[NX] __attribute__ ((heap_allocated));
															
 
																+
															
 
																+#pragma starpu register vector
															
 
																+
															
 
																+    size_t i;
															
 
																+    for (i = 0; i < NX; i++)
															
 
																+      vector[i] = (float) i;
															
 
																+
															
 
																+    vector_scal (NX, vector, FACTOR);
															
 
																+
															
 
																+#pragma starpu wait
															
 
																+  @} /* VECTOR is automatically freed here.  */
															
 
																+
															
 
																+#pragma starpu shutdown
															
 
																+
															
 
																+  return valid ? EXIT_SUCCESS : EXIT_FAILURE;
															
 
																+@}
															
 
																+@end example
															
 
																+
															
 
																+@noindent
															
 
																+The @code{main} function above does several things:
															
 
																+
															
 
																+@itemize
															
 
																+@item
															
 
																+It initializes StarPU.  This has to be done explicitly, as it is
															
 
																+undesirable to add implicit initialization code in user code.
															
 
																+
															
 
																+@item
															
 
																+It allocates @var{vector} in the heap; it will automatically be freed
															
 
																+when its scope is left.  Alternatively, good old @code{malloc} and
															
 
																+@code{free} could have been used, but they are more error-prone and
															
 
																+require more typing.
															
 
																+
															
 
																+@item
															
 
																+It @dfn{registers} the memory pointed to by @var{vector}.  Eventually,
															
 
																+when OpenCL or CUDA task implementations are added, this will allow
															
 
																+StarPU to transfer that memory region between GPUs and the main memory.
															
 
																+Removing this @code{pragma} is an error.
															
 
																+
															
 
																+@item
															
 
																+It invokes the @code{vector_scal} task.  The invocation looks the same
															
 
																+as a standard C function call.  However, it is an @dfn{asynchronous
															
 
																+invocation}, meaning that the actual call is performed in parallel with
															
 
																+the caller's continuation.
															
 
																+
															
 
																+@item
															
 
																+It @dfn{waits} for the termination of the @code{vector_scal}
															
 
																+asynchronous call.
															
 
																+
															
 
																+@item
															
 
																+Finally, StarPU is shut down, giving it an opportunity to write
															
 
																+profiling info to a file on disk, for instance (@pxref{Off-line,
															
 
																+off-line performance feedback}).
															
 
																+
															
 
																+@end itemize
															
 
																+
															
 
																+The program can be compiled and linked with GCC and the @code{-fplugin}
															
 
																+flag:
															
 
																+
															
 
																+@example
															
 
																+$ gcc hello-starpu.c \
															
 
																+    -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
															
 
																+    `pkg-config starpu-1.0 --libs`
															
 
																+@end example
															
 
																+
															
 
																+And voil@`a!
															
 
																+
															
 
																+@unnumberedsubsubsec Adding an OpenCL Task Implementation
															
 
																+
															
 
																+Now, this is all fine and great, but you certainly want to take
															
 
																+advantage of these newfangled GPUs that your lab just bought, don't you?
															
 
																+
															
 
																+So, let's add an OpenCL implementation of the @code{vector_scal} task.
															
 
																+We assume that the OpenCL kernel is available in a file,
															
 
																+@file{vector_scal_opencl_kernel.cl}, not shown here.  The OpenCL task
															
 
																+implementation is similar to that used with the standard C API
															
 
																+(@pxref{Definition of the OpenCL Kernel}).  It is declared and defined
															
 
																+in our C file like this:
															
 
																+
															
 
																+@example
															
 
																+/* Include StarPU's OpenCL integration.  */
															
 
																+#include <starpu_opencl.h>
															
 
																+
															
 
																+/* The OpenCL programs, loaded from `main' (see below).  */
															
 
																+static struct starpu_opencl_program cl_programs;
															
 
																+
															
 
																+static void vector_scal_opencl (size_t size, float vector[size],
															
 
																+                                float factor)
															
 
																+  __attribute__ ((task_implementation ("opencl", vector_scal)));
															
 
																+
															
 
																+static void
															
 
																+vector_scal_opencl (size_t size, float vector[size], float factor)
															
 
																+@{
															
 
																+  int id, devid, err;
															
 
																+  cl_kernel kernel;
															
 
																+  cl_command_queue queue;
															
 
																+  cl_event event;
															
 
																+
															
 
																+  /* VECTOR is GPU memory pointer, not a main memory pointer.  */
															
 
																+  cl_mem val = (cl_mem) vector;
															
 
																+
															
 
																+  id = starpu_worker_get_id ();
															
 
																+  devid = starpu_worker_get_devid (id);
															
 
																+
															
 
																+  /* Prepare to invoke the kernel.  In the future, this will be largely
															
 
																+     automated.  */
															
 
																+  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
															
 
																+				   "vector_mult_opencl", devid);
															
 
																+  if (err != CL_SUCCESS)
															
 
																+    STARPU_OPENCL_REPORT_ERROR (err);
															
 
																+
															
 
																+  err = clSetKernelArg (kernel, 0, sizeof (val), &val);
															
 
																+  err |= clSetKernelArg (kernel, 1, sizeof (size), &size);
															
 
																+  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
															
 
																+  if (err)
															
 
																+    STARPU_OPENCL_REPORT_ERROR (err);
															
 
																+
															
 
																+  size_t global = 1, local = 1;
															
 
																+  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,
															
 
																+				NULL, &event);
															
 
																+  if (err != CL_SUCCESS)
															
 
																+    STARPU_OPENCL_REPORT_ERROR (err);
															
 
																+
															
 
																+  clFinish (queue);
															
 
																+  starpu_opencl_collect_stats (event);
															
 
																+  clReleaseEvent (event);
															
 
																+
															
 
																+  /* Done with KERNEL.  */
															
 
																+  starpu_opencl_release_kernel (kernel);
															
 
																+@}
															
 
																+@end example
															
 
																+
															
 
																+@noindent
															
 
																+The OpenCL kernel itself must be loaded from @code{main}, sometime after
															
 
																+the @code{initialize} pragma:
															
 
																+
															
 
																+@example
															
 
																+  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
															
 
																+                                       &cl_programs, "");
															
 
																+@end example
															
 
																+
															
 
																+@noindent
															
 
																+And that's it.  The @code{vector_scal} task now has an additional
															
 
																+implementation, for OpenCL, which StarPU's scheduler may choose to use
															
 
																+at run-time.  Unfortunately, the @code{vector_scal_opencl} above still
															
 
																+has to go through the common OpenCL boilerplate; in the future,
															
 
																+additional extensions will automate most of it.
															
 
																+
															
 
																+@unnumberedsubsubsec Adding a CUDA Task Implementation
															
 
																+
															
 
																+Adding a CUDA implementation of the task is very similar, except that
															
 
																+the implementation itself is typically written in CUDA, and compiled
															
 
																+with @code{nvcc}.  Thus, the C file only needs to contain an external
															
 
																+declaration for the task implementation:
															
 
																+
															
 
																+@example
															
 
																+extern void vector_scal_cuda (size_t size, float vector[size],
															
 
																+                              float factor)
															
 
																+  __attribute__ ((task_implementation ("cuda", vector_scal)));
															
 
																+@end example
															
 
																+
															
 
																+The actual implementation of the CUDA task goes into a separate
															
 
																+compilation unit, in a @file{.cu} file.  It is very close to the
															
 
																+implementation when using StarPU's standard C API (@pxref{Definition of
															
 
																+the CUDA Kernel}).
															
 
																+
															
 
																+@example
															
 
																+/* CUDA implementation of the `vector_scal' task, to be compiled
															
 
																+   with `nvcc'.  */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																+#include <stdlib.h>
															
 
																+
															
 
																+static __global__ void
															
 
																+vector_mult_cuda (float *val, unsigned n, float factor)
															
 
																+@{
															
 
																+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
															
 
																+
															
 
																+  if (i < n)
															
 
																+    val[i] *= factor;
															
 
																+@}
															
 
																+
															
 
																+/* Definition of the task implementation declared in the C file.   */
															
 
																+extern "C" void
															
 
																+vector_scal_cuda (size_t size, float vector[], float factor)
															
 
																+@{
															
 
																+  unsigned threads_per_block = 64;
															
 
																+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
															
 
																+
															
 
																+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
															
 
																+    starpu_cuda_get_local_stream () >>> (vector, size, factor);
															
 
																+
															
 
																+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
															
 
																+@}
															
 
																+@end example
															
 
																+
															
 
																+The complete source code, in the @file{gcc-plugin/examples/vector_scal}
															
 
																+directory of the StarPU distribution, also shows how an SSE-specialized
															
 
																+CPU task implementation can be added.
															
 
																+
															
 
																+For more details on the C extensions provided by StarPU's GCC plug-in,
															
 
																+@xref{C Extensions}.
															
 
																+
															
 
																 @node Source code of Vector Scaling
															
 
																 @subsection Source code of Vector Scaling
															
--- a/doc/chapters/c-extensions.texi
+++ b/doc/chapters/c-extensions.texi
@@ -196,7 +196,10 @@ This leads to an @dfn{asynchronous invocation}, whereby @code{matmult}'s
 
																 implementation may run in parallel with the continuation of the caller.
															
 
																 The next section describes how memory buffers must be handled in
															
 
																-StarPU-GCC code.
															
+
																+StarPU-GCC code.  For a complete example, see the
															
 
																+@code{gcc-plugin/examples} directory of the source distribution, and
															
 
																+@ref{Extended C Source Code of Vector Scaling, the vector-scaling
															
 
																+example}.
															
 
																 @node Registered Data Buffers