
doc: Clarify CUDA and OpenCL task implementations.

* doc/chapters/c-extensions.texi (Defining Tasks): Mention how
  CUDA/OpenCL task implementations may be defined.  Add an example using
  CUBLAS.
Ludovic Courtès 13 years ago
commit a027ccf2b8
1 changed file with 33 additions and 1 deletion

doc/chapters/c-extensions.texi

@@ -121,7 +121,39 @@ matmul_cpu (const float *A, const float *B, float *C,
 A @code{matmul} task is defined; it has only one implementation,
 @code{matmul_cpu}, which runs on the CPU.  Variables @var{A} and
 @var{B} are input buffers, whereas @var{C} is considered an input/output
-buffer.  The task can be invoked like a regular C function:
+buffer.
+
+CUDA and OpenCL implementations can be declared in a similar way:
+
+@example
+static void matmul_cuda (const float *A, const float *B, float *C,
+                         size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("cuda", matmul)));
+
+static void matmul_opencl (const float *A, const float *B, float *C,
+                           size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("opencl", matmul)));
+@end example
+
+@noindent
+The CUDA and OpenCL implementations typically either invoke a kernel
+written in CUDA or OpenCL (for examples of such code,
+@pxref{CUDA Kernel} and @pxref{OpenCL Kernel}), or call a library
+function that uses CUDA or OpenCL under the hood, such as a CUBLAS
+function:
+
+@example
+static void
+matmul_cuda (const float *A, const float *B, float *C,
+             size_t nx, size_t ny, size_t nz)
+@{
+  cublasSgemm ('n', 'n', nx, ny, nz,
+               1.0f, A, nx, B, nz,
+               0.0f, C, nx);
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+@}
+@end example
+
+A task can be invoked like a regular C function:
 
 @example
 matmul (&A[i * zdim * bydim + k * bzdim * bydim],