@c -*-texinfo-*-
@c This file is part of the StarPU Handbook.
@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
@c See the file starpu.texi for copying conditions.

@cindex C extensions
@cindex GCC plug-in
If the GCC used to compile StarPU provides plug-in
support@footnote{This feature is only available for GCC 4.5 and
later.}, StarPU builds a plug-in for the GNU Compiler
Collection (GCC), which defines extensions to languages of the C
family (C, C++, Objective-C) that make it easier to write StarPU code.
The plug-in can be disabled by configuring with
@code{--disable-gcc-extensions}.

Those extensions include syntactic sugar for defining
tasks and their implementations, invoking a task, and manipulating data
buffers.  Use of these extensions can be made conditional on the
availability of the plug-in, leading to valid C sequential code when the
plug-in is not used (@pxref{Conditional Extensions}).

This chapter does not require detailed knowledge of the StarPU library.

Note: as of StarPU @value{VERSION}, this is still an area under
development and subject to change.
@menu
* Defining Tasks::              Defining StarPU tasks
* Registered Data Buffers::     Manipulating data buffers
* Conditional Extensions::      Using C extensions only when available
@end menu
@node Defining Tasks
@section Defining Tasks

@cindex task
@cindex task implementation

The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:

@enumerate
@item
tasks may have several implementations---e.g., one for CPUs, one written
in OpenCL, one written in CUDA;

@item
tasks may have several implementations for the same target---e.g.,
several CPU implementations;

@item
when a task is invoked, it may run in parallel, and StarPU is free to
choose any of its implementations.
@end enumerate
Tasks and their implementations must be @emph{declared}.  These
declarations are annotated with @dfn{attributes} (@pxref{Attribute
Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
(GCC)}): the declaration of a task is a regular C function declaration
with an additional @code{task} attribute, and task implementations are
declared with a @code{task_implementation} attribute.

The following function attributes are provided:

@table @code
@item task
@cindex @code{task} attribute
Declare the given function as a StarPU task.  Its return type must be
@code{void}, and it must not be defined---instead, a definition will
automatically be provided by the compiler.

Under the hood, declaring a task leads to the declaration of the
corresponding @code{codelet} (@pxref{Codelets and Tasks}).  If one or
more task implementations are declared in the same compilation unit,
then the codelet and the function itself are also defined; they inherit
the scope of the task.

Scalar arguments to the task are passed by value and copied to the
target device if need be---technically, they are passed as the
@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).

@cindex @code{output} type attribute
Pointer arguments are assumed to be registered data buffers---the
@code{buffers} argument of a task (@pxref{Codelets and Tasks,
@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
buffers are assumed to be used read-write (@code{STARPU_RW}).  In
addition, the @code{output} type attribute can be used as a type
qualifier for output pointer or array parameters (@code{STARPU_W}).

@item task_implementation (@var{target}, @var{task})
@cindex @code{task_implementation} attribute
Declare the given function as an implementation of @var{task} to run on
@var{target}.  @var{target} must be a string, currently one of
@code{"cpu"}, @code{"opencl"}, or @code{"cuda"}.
@c FIXME: Update when OpenCL support is ready.

@end table
Here is an example:

@cartouche
@smallexample
#define __output  __attribute__ ((output))

static void matmul (const float *A, const float *B,
                    __output float *C,
                    size_t nx, size_t ny, size_t nz)
  __attribute__ ((task));

static void matmul_cpu (const float *A, const float *B,
                        __output float *C,
                        size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("cpu", matmul)));


static void
matmul_cpu (const float *A, const float *B, __output float *C,
            size_t nx, size_t ny, size_t nz)
@{
  size_t i, j, k;

  for (j = 0; j < ny; j++)
    for (i = 0; i < nx; i++)
      @{
        for (k = 0; k < nz; k++)
          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
      @}
@}
@end smallexample
@end cartouche
@noindent
A @code{matmul} task is defined; it has only one implementation,
@code{matmul_cpu}, which runs on the CPU.  Variables @var{A} and
@var{B} are input buffers, whereas @var{C} is considered an input/output
buffer.

CUDA and OpenCL implementations can be declared in a similar way:
@cartouche
@smallexample
static void matmul_cuda (const float *A, const float *B, float *C,
                         size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("cuda", matmul)));

static void matmul_opencl (const float *A, const float *B, float *C,
                           size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("opencl", matmul)));
@end smallexample
@end cartouche
@noindent
The CUDA and OpenCL implementations typically either invoke a kernel
written in CUDA or OpenCL (for similar code, @pxref{CUDA Kernel}, and
@pxref{OpenCL Kernel}), or call a library function that uses CUDA or
OpenCL under the hood, such as CUBLAS functions:
@cartouche
@smallexample
static void
matmul_cuda (const float *A, const float *B, float *C,
             size_t nx, size_t ny, size_t nz)
@{
  cublasSgemm ('n', 'n', nx, ny, nz,
               1.0f, A, 0, B, 0,
               0.0f, C, 0);
  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
@}
@end smallexample
@end cartouche
A task can be invoked like a regular C function:

@cartouche
@smallexample
matmul (&A[i * zdim * bydim + k * bzdim * bydim],
        &B[k * xdim * bzdim + j * bxdim * bzdim],
        &C[i * xdim * bydim + j * bxdim * bydim],
        bxdim, bydim, bzdim);
@end smallexample
@end cartouche
@noindent
This leads to an @dfn{asynchronous invocation}, whereby @code{matmul}'s
implementation may run in parallel with the continuation of the caller.
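For instance, a caller that needs the result in @var{C} before
continuing can wait for pending tasks with the @code{wait} pragma, used
the same way in the example of @ref{Conditional Extensions}.  The
fragment below is purely illustrative: it assumes @var{A}, @var{B}, and
@var{C} are registered buffers and that @var{nx}, @var{ny}, and
@var{nz} hold their dimensions.

@cartouche
@smallexample
/* Illustrative sketch: A, B, and C are assumed to be registered
   data buffers, and nx, ny, nz their dimensions.  */
matmul (A, B, C, nx, ny, nz);   /* asynchronous task invocation */

/* ... the caller may do unrelated work here, in parallel
   with the task ... */

#pragma starpu wait             /* block until submitted tasks complete */
@end smallexample
@end cartouche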
The next section describes how memory buffers must be handled in
StarPU-GCC code.
@node Registered Data Buffers
@section Registered Data Buffers

Data buffers such as matrices and vectors that are to be passed to tasks
must be @dfn{registered}.  Registration allows StarPU to handle data
transfers among devices---e.g., transferring an input buffer from the
CPU's main memory to a task scheduled to run on a GPU (@pxref{StarPU Data
Management Library}).

The following pragmas are provided:
@table @code

@item #pragma starpu register @var{ptr} [@var{size}]
Register @var{ptr} as a @var{size}-element buffer.  When @var{ptr} has
an array type whose size is known, @var{size} may be omitted.

@item #pragma starpu unregister @var{ptr}
Unregister the previously-registered memory area pointed to by
@var{ptr}.  As a side-effect, @var{ptr} points to a valid copy in main
memory.

@item #pragma starpu acquire @var{ptr}
Acquire in main memory an up-to-date copy of the previously-registered
memory area pointed to by @var{ptr}, for read-write access.

@item #pragma starpu release @var{ptr}
Release the previously-registered memory area pointed to by @var{ptr},
making it available to the tasks.

@end table
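As a purely illustrative sketch of how these pragmas combine, the
fragment below registers a vector, passes it to a hypothetical
@code{scale} task (assumed to be declared elsewhere with the
@code{task} attribute), then acquires, releases, and finally
unregisters it.  Only the pragmas are part of the StarPU-GCC
interface; the task, its signature, and the sizes are made up for the
example.

@cartouche
@smallexample
/* Hypothetical task, declared elsewhere as:
   static void scale (float *v, size_t n, float factor)
     __attribute__ ((task));  */

float vector[1024];
#pragma starpu register vector   /* size deduced from the array type */

scale (vector, 1024, 2.0f);      /* asynchronous task invocation */

#pragma starpu acquire vector    /* up-to-date copy in main memory,
                                    for read-write access */
vector[0] += 1.0f;
#pragma starpu release vector    /* make it available to tasks again */

#pragma starpu unregister vector /* VECTOR keeps a valid copy
                                    in main memory */
@end smallexample
@end cartouche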
As a substitute for the @code{register} and @code{unregister} pragmas,
the @code{heap_allocated} variable attribute offers a higher-level
mechanism:

@table @code

@item heap_allocated
@cindex @code{heap_allocated} attribute
This attribute applies to local variables with an array type.  Its
effect is to automatically allocate and register the array's storage on
the heap, using @code{starpu_malloc} under the hood (@pxref{Basic Data
Library API, starpu_malloc}).  The heap-allocated array is automatically
freed and unregistered when the variable's scope is left, as with
automatic variables@footnote{This is achieved by using the
@code{cleanup} attribute (@pxref{Variable Attributes,,, gcc, Using the
GNU Compiler Collection (GCC)}).}.

@end table
@noindent
The following example illustrates use of the @code{heap_allocated}
attribute:

@example
extern void cholesky (unsigned nblocks, unsigned size,
                      float mat[nblocks][nblocks][size])
  __attribute__ ((task));

int
main (int argc, char *argv[])
@{
#pragma starpu initialize

  /* ... */

  int nblocks, size;
  parse_args (&nblocks, &size);

  /* Allocate an array of the required size on the heap,
     and register it.  */
  float matrix[nblocks][nblocks][size]
    __attribute__ ((heap_allocated));

  cholesky (nblocks, size, matrix);

#pragma starpu shutdown

  /* MATRIX is automatically freed upon return.  */
  return EXIT_SUCCESS;
@}
@end example
@node Conditional Extensions
@section Using C Extensions Conditionally

The C extensions described in this chapter are only available when GCC
and its StarPU plug-in are in use.  Yet, it is possible to make use of
these extensions when they are available---leading to hybrid CPU/GPU
code---and discard them when they are not available---leading to valid
sequential code.

To that end, the GCC plug-in defines a C preprocessor macro when it is
being used:

@defmac STARPU_GCC_PLUGIN
Defined for code being compiled with the StarPU GCC plug-in.  When
defined, this macro expands to an integer denoting the version of the
supported C extensions.
@end defmac

The code below illustrates how to define a task and its implementations
in a way that allows it to be compiled without the GCC plug-in:
@cartouche
@smallexample
/* The macros below abstract over the attributes specific to
   StarPU-GCC and the name of the CPU implementation.  */
#ifdef STARPU_GCC_PLUGIN
# define __task  __attribute__ ((task))
# define CPU_TASK_IMPL(task)  task ## _cpu
#else
# define __task
# define CPU_TASK_IMPL(task)  task
#endif

#include <stdlib.h>

static void matmul (const float *A, const float *B, float *C,
                    size_t nx, size_t ny, size_t nz) __task;

#ifdef STARPU_GCC_PLUGIN
static void matmul_cpu (const float *A, const float *B, float *C,
                        size_t nx, size_t ny, size_t nz)
  __attribute__ ((task_implementation ("cpu", matmul)));
#endif


static void
CPU_TASK_IMPL (matmul) (const float *A, const float *B, float *C,
                        size_t nx, size_t ny, size_t nz)
@{
  /* Code of the CPU kernel here...  */
@}

int
main (int argc, char *argv[])
@{
  /* The pragmas below are simply ignored when StarPU-GCC
     is not used.  */
#pragma starpu initialize

  float A[123][42][7], B[123][42][7], C[123][42][7];

#pragma starpu register A
#pragma starpu register B
#pragma starpu register C

  /* When StarPU-GCC is used, the call below is asynchronous;
     otherwise, it is synchronous.  */
  matmul (A, B, C, 123, 42, 7);

#pragma starpu wait

#pragma starpu shutdown

  return EXIT_SUCCESS;
@}
@end smallexample
@end cartouche
Note that attributes such as @code{task} are simply ignored by GCC when
the StarPU plug-in is not loaded, so the @code{__task} macro could be
omitted altogether.  However, @command{gcc -Wall} emits a warning for
unknown attributes, which can be inconvenient, and other compilers may
be unable to parse the attribute syntax.  Thus, using macros such as
@code{__task} above is recommended.

@c Local Variables:
@c TeX-master: "../starpu.texi"
@c ispell-local-dictionary: "american"
@c End: