12 years ago · 810f5e6def
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -43,6 +43,13 @@ chapters =	\
 
				 	chapters/scaling-vector-example.doxy \
			
 
				 	chapters/code/hello_pragma2.c \
			
 
				 	chapters/code/hello_pragma.c \
			
 
				+	chapters/code/matmul_pragma.c \
			
 
				+	chapters/code/matmul_pragma2.c \
			
 
				+	chapters/code/cholesky_pragma.c \
			
 
				+	chapters/code/forkmode.c \
			
 
				+	chapters/code/multiformat.c \
			
 
				+	chapters/code/complex.c \
			
 
				+	chapters/code/simgrid.c \
			
 
				 	chapters/code/vector_scal_c.c \
			
 
				 	chapters/code/vector_scal_cpu.c \
			
 
				 	chapters/code/vector_scal_cuda.cu \
			
--- a/doc/doxygen/chapters/advanced_examples.doxy
+++ b/doc/doxygen/chapters/advanced_examples.doxy
@@ -731,7 +731,7 @@ otherwise StarPU will not know how to better group cores.
 
				 
			
 
				 Two modes of execution exist to accommodate existing usages.
			
 
				 
			
 
				-\subsection Fork-mode parallel tasks Fork-mode_parallel_tasks
			
 
				+\subsection Fork-mode_parallel_tasks Fork-mode Parallel Tasks
			
 
				 
			
 
				 In the Fork mode, StarPU will call the codelet function on one
			
 
				 of the CPUs of the combined worker. The codelet function can use
			
@@ -746,30 +746,7 @@ the CPU binding mask that StarPU chose.
 
				 For instance, using OpenMP (full source is available in
			
 
				 <c>examples/openmp/vector_scal.c</c>):
			
 
				 
			
 
				-\code{.c}
			
 
				-void scal_cpu_func(void *buffers[], void *_args)
			
 
				-{
			
 
				-    unsigned i;
			
 
				-    float *factor = _args;
			
 
				-    struct starpu_vector_interface *vector = buffers[0];
			
 
				-    unsigned n = STARPU_VECTOR_GET_NX(vector);
			
 
				-    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
			
 
				-
			
 
				-#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
			
 
				-    for (i = 0; i < n; i++)
			
 
				-        val[i] *= *factor;
			
 
				-}
			
 
				-
			
 
				-static struct starpu_codelet cl =
			
 
				-{
			
 
				-    .modes = { STARPU_RW },
			
 
				-    .where = STARPU_CPU,
			
 
				-    .type = STARPU_FORKJOIN,
			
 
				-    .max_parallelism = INT_MAX,
			
 
				-    .cpu_funcs = {scal_cpu_func, NULL},
			
 
				-    .nbuffers = 1,
			
 
				-};
			
 
				-\endcode
			
 
				+\include forkmode.c
			
 
				 
			
 
				 Other examples include for instance calling a BLAS parallel CPU implementation
			
 
				 (see <c>examples/mult/xgemm.c</c>).
			
@@ -886,48 +863,7 @@ will be able to convert data from one data structure to the other when needed.
 
				 Note that the dmda scheduler is the only one optimized for this interface. The
			
 
				 user must provide StarPU with conversion codelets:
			
 
				 
			
 
				-\code{.c}
			
 
				-#define NX 1024
			
 
				-struct point array_of_structs[NX];
			
 
				-starpu_data_handle_t handle;
			
 
				-
			
 
				-/*
			
 
				- * The conversion of a piece of data is itself a task, though it is created,
			
 
				- * submitted and destroyed by StarPU internals and not by the user. Therefore,
			
 
				- * we have to define two codelets.
			
 
				- * Note that for now the conversion from the CPU format to the GPU format has to
			
 
				- * be executed on the GPU, and the conversion from the GPU to the CPU has to be
			
 
				- * executed on the CPU.
			
 
				- */
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-void cpu_to_opencl_opencl_func(void *buffers[], void *args);
			
 
				-struct starpu_codelet cpu_to_opencl_cl = {
			
 
				-    .where = STARPU_OPENCL,
			
 
				-    .opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = { STARPU_RW }
			
 
				-};
			
 
				-
			
 
				-void opencl_to_cpu_func(void *buffers[], void *args);
			
 
				-struct starpu_codelet opencl_to_cpu_cl = {
			
 
				-    .where = STARPU_CPU,
			
 
				-    .cpu_funcs = { opencl_to_cpu_func, NULL },
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = { STARPU_RW }
			
 
				-};
			
 
				-#endif
			
 
				-
			
 
				-struct starpu_multiformat_data_interface_ops format_ops = {
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-    .opencl_elemsize = 2 * sizeof(float),
			
 
				-    .cpu_to_opencl_cl = &cpu_to_opencl_cl,
			
 
				-    .opencl_to_cpu_cl = &opencl_to_cpu_cl,
			
 
				-#endif
			
 
				-    .cpu_elemsize = 2 * sizeof(float),
			
 
				-    ...
			
 
				-};
			
 
				-starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
			
 
				-\endcode
			
 
				+\include multiformat.c
			
 
				 
			
 
				 Kernels can be written almost as for any other interface. Note that
			
 
				 ::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
			
@@ -1150,14 +1086,7 @@ Similar functions need to be defined to access the different fields of the
 
				 complex interface from a <c>void *</c> pointer to be used within codelet
			
 
				 implementations.
			
 
				 
			
 
				-\code{.c}
			
 
				-#define STARPU_COMPLEX_GET_REAL(interface)	\
			
 
				-        (((struct starpu_complex_interface *)(interface))->real)
			
 
				-#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
			
 
				-        (((struct starpu_complex_interface *)(interface))->imaginary)
			
 
				-#define STARPU_COMPLEX_GET_NX(interface)	\
			
 
				-        (((struct starpu_complex_interface *)(interface))->nx)
			
 
				-\endcode
			
 
				+\include complex.c
			
 
				 
			
 
				 Complex data interfaces can then be registered to StarPU.
			
 
				 
			
--- a/doc/doxygen/chapters/basic_examples.doxy
+++ b/doc/doxygen/chapters/basic_examples.doxy
@@ -675,88 +675,7 @@ Here is the source of the main application. You can notice that the fields
 
				 define the pointers to the CUDA and OpenCL implementations of the
			
 
				 task.
			
 
				 
			
 
				-\code{.c}
			
 
				-#include <starpu.h>
			
 
				-
			
 
				-#define NX 2048
			
 
				-
			
 
				-extern void scal_cuda_func(void *buffers[], void *_args);
			
 
				-extern void scal_cpu_func(void *buffers[], void *_args);
			
 
				-extern void scal_opencl_func(void *buffers[], void *_args);
			
 
				-
			
 
				-/* Definition of the codelet */
			
 
				-static struct starpu_codelet cl =
			
 
				-{
			
 
				-    .cuda_funcs = { scal_cuda_func, NULL },
			
 
				-    .cpu_funcs = { scal_cpu_func, NULL },
			
 
				-    .opencl_funcs = { scal_opencl_func, NULL },
			
 
				-    .nbuffers = 1,
			
 
				-    .modes = { STARPU_RW }
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-/* The compiled version of the OpenCL program */
			
 
				-struct starpu_opencl_program programs;
			
 
				-#endif
			
 
				-
			
 
				-int main(int argc, char **argv)
			
 
				-{
			
 
				-    float *vector;
			
 
				-    int i, ret;
			
 
				-    float factor=3.0;
			
 
				-    struct starpu_task *task;
			
 
				-    starpu_data_handle_t vector_handle;
			
 
				-
			
 
				-    starpu_init(NULL);                            /* Initialising StarPU */
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-    starpu_opencl_load_opencl_from_file(
			
 
				-            "examples/basic_examples/vector_scal_opencl_codelet.cl",
			
 
				-            &programs, NULL);
			
 
				-#endif
			
 
				-
			
 
				-    vector = malloc(NX*sizeof(vector[0]));
			
 
				-    assert(vector);
			
 
				-    for(i=0 ; i<NX ; i++) vector[i] = i;
			
 
				-
			
 
				-    /* Registering data within StarPU */
			
 
				-    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
			
 
				-                                NX, sizeof(vector[0]));
			
 
				-
			
 
				-    /* Definition of the task */
			
 
				-    task = starpu_task_create();
			
 
				-    task->cl = &cl;
			
 
				-    task->handles[0] = vector_handle;
			
 
				-    task->cl_arg = &factor;
			
 
				-    task->cl_arg_size = sizeof(factor);
			
 
				-
			
 
				-    /* Submitting the task */
			
 
				-    ret = starpu_task_submit(task);
			
 
				-    if (ret == -ENODEV) {
			
 
				-            fprintf(stderr, "No worker may execute this task\n");
			
 
				-            return 1;
			
 
				-    }
			
 
				-
			
 
				-    /* Waiting for its termination */
			
 
				-    starpu_task_wait_for_all();
			
 
				-
			
 
				-    /* Update the vector in RAM */
			
 
				-    starpu_data_acquire(vector_handle, STARPU_R);
			
 
				-
			
 
				-    /* Access the data */
			
 
				-    for(i=0 ; i<NX; i++) {
			
 
				-      fprintf(stderr, "%f ", vector[i]);
			
 
				-    }
			
 
				-    fprintf(stderr, "\n");
			
 
				-
			
 
				-    /* Release the RAM view of the data before unregistering it and shutting down StarPU */
			
 
				-    starpu_data_release(vector_handle);
			
 
				-    starpu_data_unregister(vector_handle);
			
 
				-    starpu_shutdown();
			
 
				-
			
 
				-    return 0;
			
 
				-}
			
 
				-\endcode
			
 
				+\include vector_scal_c.c
			
 
				 
			
 
				 \subsection Execution_of_Hybrid_Vector_Scaling Execution of Hybrid Vector Scaling
			
 
				 
			
--- a/doc/doxygen/chapters/c_extensions.doxy
+++ b/doc/doxygen/chapters/c_extensions.doxy
@@ -322,39 +322,7 @@ automatic variables.
 
				 The following example illustrates use of the <c>heap_allocated</c>
			
 
				 attribute:
			
 
				 
			
 
				-\code{.c}
			
 
				-extern void cholesky(unsigned nblocks, unsigned size,
			
 
				-                    float mat[nblocks][nblocks][size])
			
 
				-  __attribute__ ((task));
			
 
				-
			
 
				-int
			
 
				-main (int argc, char *argv[])
			
 
				-{
			
 
				-#pragma starpu initialize
			
 
				-
			
 
				-  /* ... */
			
 
				-
			
 
				-  int nblocks, size;
			
 
				-  parse_args (&nblocks, &size);
			
 
				-
			
 
				-  /* Allocate an array of the required size on the heap,
			
 
				-     and register it.  */
			
 
				-
			
 
				-  {
			
 
				-    float matrix[nblocks][nblocks][size]
			
 
				-      __attribute__ ((heap_allocated, registered));
			
 
				-
			
 
				-    cholesky (nblocks, size, matrix);
			
 
				-
			
 
				-#pragma starpu wait
			
 
				-
			
 
				-  }   /* MATRIX is automatically unregistered & freed here.  */
			
 
				-
			
 
				-#pragma starpu shutdown
			
 
				-
			
 
				-  return EXIT_SUCCESS;
			
 
				-}
			
 
				-\endcode
			
 
				+\include cholesky_pragma.c
			
 
				 
			
 
				 \section Conditional_Extensions Using C Extensions Conditionally
			
 
				 
			
@@ -376,62 +344,7 @@ supported C extensions.
 
				 The code below illustrates how to define a task and its implementations
			
 
				 in a way that allows it to be compiled without the GCC plug-in:
			
 
				 
			
 
				-\code{.c}
			
 
				-/* This program is valid, whether or not StarPU's GCC plug-in
			
 
				-   is being used.  */
			
 
				-
			
 
				-#include <stdlib.h>
			
 
				-
			
 
				-/* The attribute below is ignored when GCC is not used.  */
			
 
				-static void matmul (const float *A, const float *B, float * C,
			
 
				-                    unsigned nx, unsigned ny, unsigned nz)
			
 
				-  __attribute__ ((task));
			
 
				-
			
 
				-static void
			
 
				-matmul (const float *A, const float *B, float * C,
			
 
				-        unsigned nx, unsigned ny, unsigned nz)
			
 
				-{
			
 
				-  /* Code of the CPU kernel here...  */
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_GCC_PLUGIN
			
 
				-/* Optional OpenCL task implementation.  */
			
 
				-
			
 
				-static void matmul_opencl (const float *A, const float *B, float * C,
			
 
				-                           unsigned nx, unsigned ny, unsigned nz)
			
 
				-  __attribute__ ((task_implementation ("opencl", matmul)));
			
 
				-
			
 
				-static void
			
 
				-matmul_opencl (const float *A, const float *B, float * C,
			
 
				-               unsigned nx, unsigned ny, unsigned nz)
			
 
				-{
			
 
				-  /* Code that invokes the OpenCL kernel here...  */
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-int
			
 
				-main (int argc, char *argv[])
			
 
				-{
			
 
				-  /* The pragmas below are simply ignored when StarPU-GCC
			
 
				-     is not used.  */
			
 
				-#pragma starpu initialize
			
 
				-
			
 
				-  float A[123][42][7], B[123][42][7], C[123][42][7];
			
 
				-
			
 
				-#pragma starpu register A
			
 
				-#pragma starpu register B
			
 
				-#pragma starpu register C
			
 
				-
			
 
				-  /* When StarPU-GCC is used, the call below is asynchronous;
			
 
				-     otherwise, it is synchronous.  */
			
 
				-  matmul ((float *) A, (float *) B, (float *) C, 123, 42, 7);
			
 
				-
			
 
				-#pragma starpu wait
			
 
				-#pragma starpu shutdown
			
 
				-
			
 
				-  return EXIT_SUCCESS;
			
 
				-}
			
 
				-\endcode
			
 
				+\include matmul_pragma.c
			
 
				 
			
 
				 The above program is a valid StarPU program when StarPU's GCC plug-in is
			
 
				 used; it is also a valid sequential program when the plug-in is not
			
@@ -445,19 +358,7 @@ unable to parse the attribute syntax (In practice, Clang and
 
				 several proprietary compilers implement attributes.), so you may want to
			
 
				 wrap attributes in macros like this:
			
 
				 
			
 
				-\code{.c}
			
 
				-/* Use the `task' attribute only when StarPU's GCC plug-in
			
 
				-   is available.   */
			
 
				-#ifdef STARPU_GCC_PLUGIN
			
 
				-# define __task  __attribute__ ((task))
			
 
				-#else
			
 
				-# define __task
			
 
				-#endif
			
 
				-
			
 
				-static void matmul (const float *A, const float *B, float *C,
			
 
				-                    unsigned nx, unsigned ny, unsigned nz) __task;
			
 
				-\endcode
			
 
				-
			
 
				+\include matmul_pragma2.c
			
 
				 
			
 
				 */
			
 
				 
			
--- a/doc/doxygen/chapters/code/cholesky_pragma.c
+++ b/doc/doxygen/chapters/code/cholesky_pragma.c
@@ -0,0 +1,31 @@
 
				+extern void cholesky(unsigned nblocks, unsigned size,
			
 
				+                    float mat[nblocks][nblocks][size])
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				+int
			
 
				+main (int argc, char *argv[])
			
 
				+{
			
 
				+#pragma starpu initialize
			
 
				+
			
 
				+  /* ... */
			
 
				+
			
 
				+  int nblocks, size;
			
 
				+  parse_args (&nblocks, &size);
			
 
				+
			
 
				+  /* Allocate an array of the required size on the heap,
			
 
				+     and register it.  */
			
 
				+
			
 
				+  {
			
 
				+    float matrix[nblocks][nblocks][size]
			
 
				+      __attribute__ ((heap_allocated, registered));
			
 
				+
			
 
				+    cholesky (nblocks, size, matrix);
			
 
				+
			
 
				+#pragma starpu wait
			
 
				+
			
 
				+  }   /* MATRIX is automatically unregistered & freed here.  */
			
 
				+
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+  return EXIT_SUCCESS;
			
 
				+}
			
--- a/doc/doxygen/chapters/code/complex.c
+++ b/doc/doxygen/chapters/code/complex.c
@@ -0,0 +1,6 @@
 
				+#define STARPU_COMPLEX_GET_REAL(interface)	\
			
 
				+        (((struct starpu_complex_interface *)(interface))->real)
			
 
				+#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
			
 
				+        (((struct starpu_complex_interface *)(interface))->imaginary)
			
 
				+#define STARPU_COMPLEX_GET_NX(interface)	\
			
 
				+        (((struct starpu_complex_interface *)(interface))->nx)
			
--- a/doc/doxygen/chapters/code/forkmode.c
+++ b/doc/doxygen/chapters/code/forkmode.c
@@ -0,0 +1,22 @@
 
				+void scal_cpu_func(void *buffers[], void *_args)
			
 
				+{
			
 
				+    unsigned i;
			
 
				+    float *factor = _args;
			
 
				+    struct starpu_vector_interface *vector = buffers[0];
			
 
				+    unsigned n = STARPU_VECTOR_GET_NX(vector);
			
 
				+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
			
 
				+
			
 
				+#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
			
 
				+    for (i = 0; i < n; i++)
			
 
				+        val[i] *= *factor;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				+    .modes = { STARPU_RW },
			
 
				+    .where = STARPU_CPU,
			
 
				+    .type = STARPU_FORKJOIN,
			
 
				+    .max_parallelism = INT_MAX,
			
 
				+    .cpu_funcs = {scal_cpu_func, NULL},
			
 
				+    .nbuffers = 1,
			
 
				+};
			
--- a/doc/doxygen/chapters/code/matmul_pragma.c
+++ b/doc/doxygen/chapters/code/matmul_pragma.c
@@ -0,0 +1,54 @@
 
				+/* This program is valid, whether or not StarPU's GCC plug-in
			
 
				+   is being used.  */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+/* The attribute below is ignored when GCC is not used.  */
			
 
				+static void matmul (const float *A, const float *B, float * C,
			
 
				+                    unsigned nx, unsigned ny, unsigned nz)
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				+static void
			
 
				+matmul (const float *A, const float *B, float * C,
			
 
				+        unsigned nx, unsigned ny, unsigned nz)
			
 
				+{
			
 
				+  /* Code of the CPU kernel here...  */
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_GCC_PLUGIN
			
 
				+/* Optional OpenCL task implementation.  */
			
 
				+
			
 
				+static void matmul_opencl (const float *A, const float *B, float * C,
			
 
				+                           unsigned nx, unsigned ny, unsigned nz)
			
 
				+  __attribute__ ((task_implementation ("opencl", matmul)));
			
 
				+
			
 
				+static void
			
 
				+matmul_opencl (const float *A, const float *B, float * C,
			
 
				+               unsigned nx, unsigned ny, unsigned nz)
			
 
				+{
			
 
				+  /* Code that invokes the OpenCL kernel here...  */
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+int
			
 
				+main (int argc, char *argv[])
			
 
				+{
			
 
				+  /* The pragmas below are simply ignored when StarPU-GCC
			
 
				+     is not used.  */
			
 
				+#pragma starpu initialize
			
 
				+
			
 
				+  float A[123][42][7], B[123][42][7], C[123][42][7];
			
 
				+
			
 
				+#pragma starpu register A
			
 
				+#pragma starpu register B
			
 
				+#pragma starpu register C
			
 
				+
			
 
				+  /* When StarPU-GCC is used, the call below is asynchronous;
			
 
				+     otherwise, it is synchronous.  */
			
 
				+  matmul ((float *) A, (float *) B, (float *) C, 123, 42, 7);
			
 
				+
			
 
				+#pragma starpu wait
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+  return EXIT_SUCCESS;
			
 
				+}
			
--- a/doc/doxygen/chapters/code/matmul_pragma2.c
+++ b/doc/doxygen/chapters/code/matmul_pragma2.c
@@ -0,0 +1,10 @@
 
				+/* Use the `task' attribute only when StarPU's GCC plug-in
			
 
				+   is available.   */
			
 
				+#ifdef STARPU_GCC_PLUGIN
			
 
				+# define __task  __attribute__ ((task))
			
 
				+#else
			
 
				+# define __task
			
 
				+#endif
			
 
				+
			
 
				+static void matmul (const float *A, const float *B, float *C,
			
 
				+                    unsigned nx, unsigned ny, unsigned nz) __task;
			
--- a/doc/doxygen/chapters/code/multiformat.c
+++ b/doc/doxygen/chapters/code/multiformat.c
@@ -0,0 +1,41 @@
 
				+#define NX 1024
			
 
				+struct point array_of_structs[NX];
			
 
				+starpu_data_handle_t handle;
			
 
				+
			
 
				+/*
			
 
				+ * The conversion of a piece of data is itself a task, though it is created,
			
 
				+ * submitted and destroyed by StarPU internals and not by the user. Therefore,
			
 
				+ * we have to define two codelets.
			
 
				+ * Note that for now the conversion from the CPU format to the GPU format has to
			
 
				+ * be executed on the GPU, and the conversion from the GPU to the CPU has to be
			
 
				+ * executed on the CPU.
			
 
				+ */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+void cpu_to_opencl_opencl_func(void *buffers[], void *args);
			
 
				+struct starpu_codelet cpu_to_opencl_cl = {
			
 
				+    .where = STARPU_OPENCL,
			
 
				+    .opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
			
 
				+    .nbuffers = 1,
			
 
				+    .modes = { STARPU_RW }
			
 
				+};
			
 
				+
			
 
				+void opencl_to_cpu_func(void *buffers[], void *args);
			
 
				+struct starpu_codelet opencl_to_cpu_cl = {
			
 
				+    .where = STARPU_CPU,
			
 
				+    .cpu_funcs = { opencl_to_cpu_func, NULL },
			
 
				+    .nbuffers = 1,
			
 
				+    .modes = { STARPU_RW }
			
 
				+};
			
 
				+#endif
			
 
				+
			
 
				+struct starpu_multiformat_data_interface_ops format_ops = {
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+    .opencl_elemsize = 2 * sizeof(float),
			
 
				+    .cpu_to_opencl_cl = &cpu_to_opencl_cl,
			
 
				+    .opencl_to_cpu_cl = &opencl_to_cpu_cl,
			
 
				+#endif
			
 
				+    .cpu_elemsize = 2 * sizeof(float),
			
 
				+    ...
			
 
				+};
			
 
				+
			
 
				+starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
			
--- a/doc/doxygen/chapters/code/simgrid.c
+++ b/doc/doxygen/chapters/code/simgrid.c
@@ -0,0 +1,12 @@
 
				+static struct starpu_codelet cl11 =
			
 
				+{
			
 
				+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+	.cuda_funcs = {(void*)1, NULL},
			
 
				+#endif
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_RW},
			
 
				+	.model = &chol_model_11
			
 
				+};
			
--- a/doc/doxygen/chapters/optimize_performance.doxy
+++ b/doc/doxygen/chapters/optimize_performance.doxy
@@ -516,19 +516,6 @@ case. Since during simgrid execution, the functions of the codelet are actually
 
				 not called, one can use dummy functions such as the following to still permit
			
 
				 CUDA or OpenCL execution:
			
 
				 
			
 
				-\code{.c}
			
 
				-static struct starpu_codelet cl11 =
			
 
				-{
			
 
				-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				-#elif defined(STARPU_SIMGRID)
			
 
				-	.cuda_funcs = {(void*)1, NULL},
			
 
				-#endif
			
 
				-	.nbuffers = 1,
			
 
				-	.modes = {STARPU_RW},
			
 
				-	.model = &chol_model_11
			
 
				-};
			
 
				-\endcode
			
 
				+\include simgrid.c
			
 
				 
			
 
				 */