|
@@ -70,6 +70,7 @@ was last updated on @value{UPDATED}.
|
|
|
* Performance optimization:: How to optimize performance with StarPU
|
|
|
* Performance feedback:: Performance debugging tools
|
|
|
* StarPU MPI support:: How to combine StarPU with MPI
|
|
|
+* Tips and Tricks:: Tips and tricks to know about
|
|
|
* Configuring StarPU:: How to configure StarPU
|
|
|
* StarPU API:: The API to use StarPU
|
|
|
* Advanced Topics:: Advanced use of StarPU
|
|
@@ -2656,6 +2657,62 @@ starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
|
|
|
|
|
|
|
|
|
@c ---------------------------------------------------------------------
|
|
|
+@c Tips and Tricks
|
|
|
+@c ---------------------------------------------------------------------
|
|
|
+
|
|
|
+@node Tips and Tricks
|
|
|
+@chapter Tips and Tricks to know about
|
|
|
+
|
|
|
+@menu
|
|
|
+* Per-worker library initialization:: How to initialize a computation library once for each worker?
|
|
|
+@end menu
|
|
|
+
|
|
|
+@node Per-worker library initialization
|
|
|
+@section How to initialize a computation library once for each worker?
|
|
|
+
|
|
|
+Some libraries need to be initialized once for each concurrent instance that
|
|
|
+may run on the machine. This is the case, for instance, of a C++ computation class which is not
|
|
|
+thread-safe by itself, but for which several instantiated objects of that class
|
|
|
+can be used concurrently. This can be used in StarPU by initializing one such
|
|
|
+object per worker. For instance, the libstarpufft example does the following to be able to use FFTW.
|
|
|
+
|
|
|
+Some global array stores the instantiated objects:
|
|
|
+
|
|
|
+@smallexample
|
|
|
+fftw_plan plan_cpu[STARPU_NMAXWORKERS];
|
|
|
+@end smallexample
|
|
|
+
|
|
|
+At initialization time of libstarpu, the objects are initialized:
|
|
|
+
|
|
|
+@smallexample
|
|
|
+int workerid;
|
|
|
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) @{
|
|
|
+ switch (starpu_worker_get_type(workerid)) @{
|
|
|
+ case STARPU_CPU_WORKER:
|
|
|
+ plan_cpu[workerid] = fftw_plan(...);
|
|
|
+ break;
|
|
|
+ @}
|
|
|
+@}
|
|
|
+@end smallexample
|
|
|
+
|
|
|
+And in the codelet body, they are used:
|
|
|
+
|
|
|
+@smallexample
|
|
|
+static void fft(void *descr[], void *_args)
|
|
|
+@{
|
|
|
+ int workerid = starpu_worker_get_id();
|
|
|
+ fftw_plan plan = plan_cpu[workerid];
|
|
|
+ ...
|
|
|
+
|
|
|
+ fftw_execute(plan, ...);
|
|
|
+@}
|
|
|
+@end smallexample
|
|
|
+
|
|
|
+To also deal with the CUDA CUFFT implementation, the @code{fftw_plan} type can
|
|
|
+be replaced with a union of @code{fftw_plan} and @code{cufftHandle}, and the
|
|
|
+@code{switch} statement extended with @code{STARPU_CUDA_WORKER}.
|
|
|
+
|
|
|
+@c ---------------------------------------------------------------------
|
|
|
@c Configuration options
|
|
|
@c ---------------------------------------------------------------------
|
|
|
|