	props = starpu_cuda_get_device_properties(workerid);
	if (props->major >= 2 || props->minor >= 3)
		/* At least compute capability 1.3, supports doubles */
		return 1;

	/* Old card, does not support doubles */
	return 0;
}

struct starpu_codelet cl = {
	.can_execute = can_execute,
	.cpu_funcs = { cpu_func },
	.cpu_funcs_name = { "cpu_func" },
	.cuda_funcs = { gpu_func },
	.nbuffers = 1,
	.modes = { STARPU_RW }
};
\endcode
This can be essential e.g. when running on a machine which mixes various
models of CUDA devices, to benefit from the new models without crashing
on old models.

Note: the function starpu_codelet::can_execute is called by the
scheduler each time it tries to match a task with a worker, and should
thus be very fast. The function starpu_cuda_get_device_properties()
provides quick access to CUDA properties of CUDA devices to achieve
such efficiency.
Another example is to compile CUDA code for various compute capabilities,
resulting in two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
provided to StarPU by using starpu_codelet::cuda_funcs, and
starpu_codelet::can_execute can then be used to rule out the
<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
\code{.c}
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
	const struct cudaDeviceProp *props;
	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
		return 1;

	/* This is a CUDA worker */
	if (nimpl == 0)
		/* The 1.3 variant, any CUDA device can execute it */
		return 1;

	/* The 2.0 variant, check the compute capability of the device */
	props = starpu_cuda_get_device_properties(workerid);
	if (props->major >= 2)
		/* At least compute capability 2.0, can run it */
		return 1;

	/* Old card, will not be able to execute the 2.0 variant */
	return 0;
}

struct starpu_codelet cl = {
	.can_execute = can_execute,
	.cpu_funcs = { cpu_func },
	.cpu_funcs_name = { "cpu_func" },
	.cuda_funcs = { scal_gpu_13, scal_gpu_20 },
	.nbuffers = 1,
	.modes = { STARPU_RW }
};
\endcode
Another example is to have specialized implementations for some given common
sizes. For instance, here we have a specialized implementation for 1024x1024
matrices:
\code{.c}
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
	const struct cudaDeviceProp *props;
	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
		return 1;

	/* This is a CUDA worker */
	switch (nimpl)
	{
		case 0:
			/* The generic variant, can always be executed */
			return 1;
		case 1:
		{
			/* The 1024x1024 variant, check the size of the matrix */
			struct starpu_matrix_interface *interface = starpu_data_get_interface_on_node(task->handles[0], STARPU_MAIN_RAM);
			return STARPU_MATRIX_GET_NX(interface) == 1024 && STARPU_MATRIX_GET_NY(interface) == 1024;
		}
	}
	return 0;
}

struct starpu_codelet cl = {
	.can_execute = can_execute,
	.cpu_funcs = { cpu_func },
	.cpu_funcs_name = { "cpu_func" },
	.cuda_funcs = { potrf_gpu_generic, potrf_gpu_1024 },
	.nbuffers = 1,
	.modes = { STARPU_RW }
};
\endcode
Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.

\section InsertTaskUtility Insert Task Utility

StarPU provides the wrapper function starpu_task_insert() to ease
the creation and submission of tasks.

Here is the implementation of the codelet:
\code{.c}
void func_cpu(void *descr[], void *_args)
{
	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
	float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
	int ifactor;
	float ffactor;

	starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
	*x0 = *x0 * ifactor;
	*x1 = *x1 * ffactor;
}

struct starpu_codelet mycodelet = {
	.cpu_funcs = { func_cpu },
	.cpu_funcs_name = { "func_cpu" },
	.nbuffers = 2,
	.modes = { STARPU_RW, STARPU_RW }
};
\endcode
And the call to the function starpu_task_insert():

\code{.c}
starpu_task_insert(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                   0);
\endcode
The call to starpu_task_insert() is equivalent to the following
code:

\code{.c}
struct starpu_task *task = starpu_task_create();
task->cl = &mycodelet;
task->handles[0] = data_handles[0];
task->handles[1] = data_handles[1];

char *arg_buffer;
size_t arg_buffer_size;
starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
                         STARPU_VALUE, &ifactor, sizeof(ifactor),
                         STARPU_VALUE, &ffactor, sizeof(ffactor),
                         0);
task->cl_arg = arg_buffer;
task->cl_arg_size = arg_buffer_size;

int ret = starpu_task_submit(task);
\endcode
Here is a similar call using ::STARPU_DATA_ARRAY.

\code{.c}
starpu_task_insert(&mycodelet,
                   STARPU_DATA_ARRAY, data_handles, 2,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   0);
\endcode
If some part of the task insertion depends on the value of some computation,
the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For
instance, assuming that the index variable <c>i</c> was registered as handle
<c>i_handle</c>:

\code{.c}
/* Compute which portion we will work on */
starpu_task_insert(&which_index, STARPU_W, i_handle, 0);

/* And submit the corresponding task */
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
\endcode
The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
acquiring data <c>i</c> for the main application, and will execute the code
given as third parameter when it is acquired. In other words, as soon as the
value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
be executed, and is allowed to read from <c>i</c> to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
the compiler <c>gcc</c>.

\section ParallelTasks Parallel Tasks

StarPU can leverage existing parallel computation libraries by the means of
parallel tasks. A parallel task is a task which is worked on by a set of CPUs
(called a parallel or combined worker) at the same time, by using an existing
parallel CPU implementation of the computation to be achieved. This can also be
useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
work collectively on a single task, the completion time of tasks on CPUs becomes
comparable to the completion time on GPUs, thus relieving granularity
discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
good performance, otherwise StarPU will not know how to better group
cores.

Two modes of execution exist to accommodate existing usages.

\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks

In the Fork mode, StarPU will call the codelet function on one
of the CPUs of the combined worker. The codelet function can use
starpu_combined_worker_get_size() to get the number of threads it is
allowed to start to perform the computation. The CPU binding mask for the whole
set of CPUs is already enforced, so that threads created by the function will
inherit the mask, and thus execute where StarPU expected, the OS being in charge
of choosing how to schedule threads on the corresponding CPUs. The application
can also choose to bind threads by hand, using e.g. sched_getaffinity to know
the CPU binding mask that StarPU chose.

For instance, using OpenMP (full source is available in
<c>examples/openmp/vector_scal.c</c>):

\snippet forkmode.c To be included. You should update doxygen if you see this text.

Other examples include for instance calling a BLAS parallel CPU implementation
(see <c>examples/mult/xgemm.c</c>).
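
To fix the shape of such a codelet in mind, here is a minimal sketch of a
fork-mode implementation, assuming the same vector scaling computation as in
the SPMD example below; the kernel and codelet names are illustrative, not
taken from the StarPU examples:

\code{.c}
/* Called on one CPU of the combined worker; OpenMP then spreads the
   loop over all the CPUs that StarPU allotted to this task. */
void scal_cpu_func(void *buffers[], void *_args)
{
	unsigned i;
	float *factor = _args;
	struct starpu_vector_interface *vector = buffers[0];
	unsigned n = STARPU_VECTOR_GET_NX(vector);
	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

	/* Ask StarPU how many CPUs we may use, and hand them to OpenMP */
#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
	for (i = 0; i < n; i++)
		val[i] *= *factor;
}

static struct starpu_codelet scal_cl =
{
	.modes = { STARPU_RW },
	.type = STARPU_FORKJOIN, /* fork-mode parallel task */
	.max_parallelism = INT_MAX,
	.cpu_funcs = { scal_cpu_func },
	.cpu_funcs_name = { "scal_cpu_func" },
	.nbuffers = 1,
};
\endcode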

\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks

In the SPMD mode, StarPU will call the codelet function on
each CPU of the combined worker. The codelet function can use
starpu_combined_worker_get_size() to get the total number of CPUs
involved in the combined worker, and thus the number of calls that are made in
parallel to the function, and starpu_combined_worker_get_rank() to get
the rank of the current CPU within the combined worker. For instance:
\code{.c}
static void func(void *buffers[], void *_args)
{
	unsigned i;
	float *factor = _args;
	struct starpu_vector_interface *vector = buffers[0];
	unsigned n = STARPU_VECTOR_GET_NX(vector);
	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

	/* Compute the slice this CPU is responsible for */
	unsigned m = starpu_combined_worker_get_size();
	unsigned j = starpu_combined_worker_get_rank();
	unsigned slice = (n+m-1)/m;

	for (i = j * slice; i < (j+1) * slice && i < n; i++)
		val[i] *= *factor;
}

static struct starpu_codelet cl =
{
	.modes = { STARPU_RW },
	.type = STARPU_SPMD,
	.max_parallelism = INT_MAX,
	.cpu_funcs = { func },
	.cpu_funcs_name = { "func" },
	.nbuffers = 1,
};
\endcode
Of course, this trivial example will not really benefit from parallel task
execution, and was only meant to be simple to understand. The benefit comes
when the computation to be done is such that threads have to e.g. exchange
intermediate results, or write to the data in a complex but safe way in the same
buffer.

\subsection ParallelTasksPerformance Parallel Tasks Performance

To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
(parallel eager) will indeed also try to execute tasks with
several CPUs. They will automatically try the various available combined
worker sizes (making several measurements for each worker size) and
will thus be able to avoid choosing a large combined worker if the codelet
does not actually scale so well.
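
For instance, a minimal sketch of selecting such a scheduler
programmatically, through the field starpu_conf::sched_policy_name rather
than through the \ref STARPU_SCHED environment variable:

\code{.c}
struct starpu_conf conf;
starpu_conf_init(&conf);
/* Request the parallel-heft scheduler instead of the default one */
conf.sched_policy_name = "pheft";

int ret = starpu_init(&conf);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
\endcode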

\subsection CombinedWorkers Combined Workers

By default, StarPU creates combined workers according to the architecture
structure as detected by <c>hwloc</c>. It means that for each object of the <c>hwloc</c>
topology (NUMA node, socket, cache, ...) a combined worker will be created. If
some nodes of the hierarchy have a big arity (e.g. many cores in a socket
without a hierarchy of shared caches), StarPU will create combined workers of
intermediate sizes. The variable \ref
STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits tuning the maximum
arity between levels of combined workers.

The combined workers actually produced can be seen in the output of the
tool <c>starpu_machine_display</c> (the environment variable \ref
STARPU_SCHED has to be set to a combined worker-aware scheduler such
as <c>pheft</c> or <c>peager</c>), e.g. by running
<c>STARPU_SCHED=pheft starpu_machine_display</c>.

\subsection ConcurrentParallelTasks Concurrent Parallel Tasks

Unfortunately, many environments and libraries do not support concurrent
calls.

For instance, most OpenMP implementations (including the main ones) do not
support concurrent <c>pragma omp parallel</c> statements without nesting them in
another <c>pragma omp parallel</c> statement, but StarPU does not yet support
creating its CPU workers by using such pragma.

Other parallel libraries are also not safe when being invoked concurrently
from different threads, due to the use of global variables in their sequential
sections for instance.

The solution is then to use only one combined worker at a time. This can be
done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
to <c>1</c>. StarPU will then run only one parallel task at a time (but other
CPU and GPU tasks are not affected and can be run concurrently). The parallel
task scheduler will however still try varying combined worker
sizes to look for the most efficient ones.
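
For instance, a minimal sketch of the programmatic variant, assuming the
standard initialization sequence:

\code{.c}
struct starpu_conf conf;
starpu_conf_init(&conf);
/* Run at most one parallel task at a time */
conf.single_combined_worker = 1;

int ret = starpu_init(&conf);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
\endcode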
*/