@c -*-texinfo-*-
@c This file is part of the StarPU Handbook.
@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
@c See the file starpu.texi for copying conditions.

@cindex C extensions
@cindex GCC plug-in

When configured with @code{--enable-gcc-extensions}, StarPU builds a
plug-in for the GNU Compiler Collection (GCC), which defines extensions
to languages of the C family (C, C++, Objective-C) that make it easier
to write StarPU code@footnote{This feature is only available for GCC 4.5
and later.}.  Those extensions include syntactic sugar for defining
tasks and their implementations, invoking a task, and manipulating data
buffers.  Use of these extensions can be made conditional on the
availability of the plug-in, leading to valid C sequential code when the
plug-in is not used (@pxref{Conditional Extensions}).

This chapter does not require detailed knowledge of the StarPU library.

Note: as of StarPU @value{VERSION}, this is still an area under
development and subject to change.

@menu
* Defining Tasks::              Defining StarPU tasks
* Registered Data Buffers::     Manipulating data buffers
* Conditional Extensions::      Using C extensions only when available
@end menu

@node Defining Tasks
@section Defining Tasks

@cindex task
@cindex task implementation

The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:

@enumerate
@item
tasks may have several implementations---e.g., one for CPUs, one written
in OpenCL, one written in CUDA;

@item
tasks may have several implementations of the same target---e.g.,
several CPU implementations;

@item
when a task is invoked, it may run in parallel, and StarPU is free to
choose any of its implementations.
@end enumerate

Tasks and their implementations must be @emph{declared}.  These
declarations are annotated with @dfn{attributes} (@pxref{Attribute
Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
(GCC)}): the declaration of a task is a regular C function declaration
with an additional @code{task} attribute, and task implementations are
declared with a @code{task_implementation} attribute.

The following function attributes are provided:

@table @code

@item task
@cindex @code{task} attribute
Declare the given function as a StarPU task.  Its return type must be
@code{void}, and it must not be defined---instead, a definition will
automatically be provided by the compiler.

Under the hood, declaring a task leads to the declaration of the
corresponding @code{codelet} (@pxref{Codelets and Tasks}).  If one or
more task implementations are declared in the same compilation unit,
then the codelet and the function itself are also defined; they inherit
the scope of the task.

Scalar arguments to the task are passed by value and copied to the
target device if need be---technically, they are passed as the
@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).

@cindex @code{output} type attribute
Pointer arguments are assumed to be registered data buffers---the
@code{buffers} argument of a task (@pxref{Codelets and Tasks,
@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
buffers are assumed to be used read-write (@code{STARPU_RW}).  In
addition, the @code{output} type attribute can be used as a type
qualifier for output pointer or array parameters (@code{STARPU_W}).

@item task_implementation (@var{target}, @var{task})
@cindex @code{task_implementation} attribute
Declare the given function as an implementation of @var{task} to run on
@var{target}.  @var{target} must be a string, currently one of
@code{"cpu"} or @code{"cuda"}.
@c FIXME: Update when OpenCL support is ready.

@end table

Here is an example:

@cartouche
@smallexample
#define __output  __attribute__ ((output))

static void matmul (const float *A, const float *B,
                    __output float *C,
                    size_t nx, size_t ny, size_t nz)
  __attribute__ ((task));

static void matmul_cpu (const float *A, const float *B,
                        __output float *C,
                        size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("cpu", matmul)));

static void
matmul_cpu (const float *A, const float *B, __output float *C,
            size_t nx, size_t ny, size_t nz)
@{
  size_t i, j, k;

  for (j = 0; j < ny; j++)
    for (i = 0; i < nx; i++)
      @{
        for (k = 0; k < nz; k++)
          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
      @}
@}
@end smallexample
@end cartouche

@noindent
A @code{matmul} task is defined; it has only one implementation,
@code{matmul_cpu}, which runs on the CPU.  Variables @var{A} and
@var{B} are input buffers, whereas @var{C} is considered an output
buffer.

CUDA and OpenCL implementations can be declared in a similar way:

@cartouche
@smallexample
static void matmul_cuda (const float *A, const float *B, float *C,
                         size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("cuda", matmul)));

static void matmul_opencl (const float *A, const float *B, float *C,
                           size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("opencl", matmul)));
@end smallexample
@end cartouche

@noindent
The CUDA and OpenCL implementations typically either invoke a kernel
written in CUDA or OpenCL (for similar code, @pxref{CUDA Kernel}, and
@pxref{OpenCL Kernel}), or call a library function that uses CUDA or
OpenCL under the hood, such as CUBLAS functions:

@cartouche
@smallexample
static void
matmul_cuda (const float *A, const float *B, float *C,
             size_t nx, size_t ny, size_t nz)
@{
  cublasSgemm ('n', 'n', nx, ny, nz,
               1.0f, A, 0, B, 0,
               0.0f, C, 0);
  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
@}
@end smallexample
@end cartouche

A task can be invoked like a regular C function:

@cartouche
@smallexample
matmul (&A[i * zdim * bydim + k * bzdim * bydim],
        &B[k * xdim * bzdim + j * bxdim * bzdim],
        &C[i * xdim * bydim + j * bxdim * bydim],
        bxdim, bydim, bzdim);
@end smallexample
@end cartouche

@noindent
This leads to an @dfn{asynchronous invocation}, whereby @code{matmul}'s
implementation may run in parallel with the continuation of the caller.

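When the caller needs the results, it can block until previously-submitted
tasks have completed by using the @code{wait} pragma, as in the sketch
below.  This is only an illustration: here @var{A}, @var{B}, @var{C} are
assumed to be registered buffers, and @var{nx}, @var{ny}, @var{nz} their
dimensions, as in the examples above.

@cartouche
@smallexample
/* Submit an asynchronous invocation of `matmul'.  */
matmul (A, B, C, nx, ny, nz);

/* Other work can proceed here while the task executes.  */

/* Block until previously-submitted tasks have completed.  */
#pragma starpu wait
@end smallexample
@end cartouche
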
The next section describes how memory buffers must be handled in
StarPU-GCC code.

@node Registered Data Buffers
@section Registered Data Buffers

Data buffers such as matrices and vectors that are to be passed to tasks
must be @dfn{registered}.  Registration allows StarPU to handle data
transfers among devices---e.g., transferring an input buffer from the
CPU's main memory to a task scheduled to run on a GPU (@pxref{StarPU Data
Management Library}).

The following pragmas are provided:

@table @code

@item #pragma starpu register @var{ptr} [@var{size}]
Register @var{ptr} as a @var{size}-element buffer.

@item #pragma starpu unregister @var{ptr}
Unregister the previously-registered buffer pointed to by @var{ptr}.

@item #pragma starpu acquire @var{ptr}
Wait until the buffer pointed to by @var{ptr} is available in main memory
for access by the application.

@end table

FIXME: finish

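As an illustration only (the array name and its size below are
hypothetical), a buffer would typically be registered before tasks access
it, acquired back into main memory when the application needs to read it,
and finally unregistered:

@cartouche
@smallexample
#define NX 1024

float vector[NX];

/* Make the NX elements of `vector' known to StarPU.  */
#pragma starpu register vector NX

/* ... invoke tasks that read or write `vector' ...  */

/* Fetch `vector' back into main memory so that the
   application can safely access it.  */
#pragma starpu acquire vector

/* Tell StarPU that `vector' is no longer needed.  */
#pragma starpu unregister vector
@end smallexample
@end cartouche
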
@node Conditional Extensions
@section Using C Extensions Conditionally

The C extensions described in this chapter are only available when GCC
and its StarPU plug-in are in use.  Yet, it is possible to make use of
these extensions when they are available---leading to hybrid CPU/GPU
code---and discard them when they are not available---leading to valid
sequential code.

To that end, the GCC plug-in defines a C preprocessor macro when it is
being used:

@defmac STARPU_GCC_PLUGIN
Defined for code being compiled with the StarPU GCC plug-in.  When
defined, this macro expands to an integer denoting the version of the
supported C extensions.
@end defmac

The code below illustrates how to define a task and its implementations
in a way that allows it to be compiled without the GCC plug-in:

@cartouche
@smallexample
/* The macros below abstract over the attributes specific to
   StarPU-GCC and the name of the CPU implementation.  */
#ifdef STARPU_GCC_PLUGIN
# define __task  __attribute__ ((task))
# define CPU_TASK_IMPL(task)  task ## _cpu
#else
# define __task
# define CPU_TASK_IMPL(task)  task
#endif

#include <stdlib.h>

static void matmul (const float *A, const float *B, float *C,
                    size_t nx, size_t ny, size_t nz) __task;

#ifdef STARPU_GCC_PLUGIN
static void matmul_cpu (const float *A, const float *B, float *C,
                        size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("cpu", matmul)));
#endif

static void
CPU_TASK_IMPL (matmul) (const float *A, const float *B, float *C,
                        size_t nx, size_t ny, size_t nz)
@{
  /* Code of the CPU kernel here...  */
@}

int
main (int argc, char *argv[])
@{
  /* The pragmas below are simply ignored when StarPU-GCC
     is not used.  */
#pragma starpu initialize

  float A[123][42][7], B[123][42][7], C[123][42][7];

#pragma starpu register A
#pragma starpu register B
#pragma starpu register C

  /* When StarPU-GCC is used, the call below is asynchronous;
     otherwise, it is synchronous.  */
  matmul (A, B, C, 123, 42, 7);

#pragma starpu wait

#pragma starpu shutdown

  return EXIT_SUCCESS;
@}
@end smallexample
@end cartouche

Note that attributes such as @code{task} are simply ignored by GCC when
the StarPU plug-in is not loaded, so the @code{__task} macro could be
omitted altogether.  However, @command{gcc -Wall} emits a warning for
unknown attributes, which can be inconvenient, and other compilers may
be unable to parse the attribute syntax.  Thus, using macros such as
@code{__task} above is recommended.

@c Local Variables:
@c TeX-master: "../starpu.texi"
@c ispell-local-dictionary: "american"
@c End: