|
@@ -0,0 +1,351 @@
|
|
|
+/*
|
|
|
+ * This file is part of the StarPU Handbook.
|
|
|
+ * Copyright (C) 2014 Inria
|
|
|
+ * See the file version.doxy for copying conditions.
|
|
|
+ */
|
|
|
+
|
|
|
+/*! \page OpenMPRuntimeSupport The StarPU OpenMP Runtime Support (SORS)
|
|
|
+
|
|
|
+StarPU provides the necessary routines and support to implement an <a
|
|
|
+href="http://www.openmp.org/">OpenMP</a> runtime compliant with the
|
|
|
+revision 3.1 of the language specification, and compliant with the
|
|
|
+task-related data dependency functionalities introduced in the revision
|
|
|
+4.0 of the language. This StarPU OpenMP Runtime Support (SORS) has been
|
|
|
+designed to be targeted by OpenMP compilers such as the Klang-OMP
|
|
|
+compiler. Most supported OpenMP directives can be implemented either
|
|
|
+inline or as outlined functions.
|
|
|
+
|
|
|
+All functions are defined in \ref API_OpenMP_Runtime_Support.
|
|
|
+
|
|
|
+\section Implementation Implementation Details and Specificities
|
|
|
+
|
|
|
+\subsection MainThread Main Thread
|
|
|
+
|
|
|
+When using the SORS, the main thread gets involved in executing OpenMP tasks
|
|
|
+just like every other thread, in order to be compliant with the
|
|
|
+specification execution model. This contrasts with StarPU's usual
|
|
|
+execution model where the main thread submits tasks but does not take
|
|
|
+part in executing them.
|
|
|
+
|
|
|
+\subsection TaskSemantics Extended Task Semantics
|
|
|
+
|
|
|
+The semantics of tasks generated by the SORS are extended with respect
|
|
|
+to regular StarPU tasks in that SORS' tasks may block and be preempted
|
|
|
+by SORS calls, whereas regular StarPU tasks cannot. SORS tasks may
|
|
|
+coexist with regular StarPU tasks. However, only the tasks created using
|
|
|
+SORS API functions benefit from these extended semantics.
|
|
|
+
|
|
|
+\section Configuration Configuration
|
|
|
+
|
|
|
+The SORS can be compiled into <c>libstarpu</c>
|
|
|
+by providing the <c>--enable-openmp</c> flag to StarPU's
|
|
|
+<c>configure</c>. Conditionally compiled source code may check for the
|
|
|
+availability of the OpenMP Runtime Support by testing whether the C
|
|
|
+preprocessor macro <c>STARPU_OPENMP</c> is defined or not.
|
|
|
+
|
|
|
+\section InitExit Initialization and Shutdown
|
|
|
+
|
|
|
+The SORS needs to be initialized/terminated by calling
|
|
|
+starpu_omp_init()/starpu_omp_shutdown() instead of
|
|
|
+starpu_init()/starpu_shutdown(). This requirement is necessary to make
|
|
|
+sure that the main thread gets the proper execution environment to run
|
|
|
+OpenMP tasks. These calls will usually be performed by a compiler
|
|
|
+runtime. Thus, they can be executed from a constructor/destructor such
|
|
|
+as this:
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+__attribute__((constructor))
|
|
|
+static void omp_constructor(void)
|
|
|
+{
|
|
|
+ int ret = starpu_omp_init();
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
|
|
|
+}
|
|
|
+
|
|
|
+__attribute__((destructor))
|
|
|
+static void omp_destructor(void)
|
|
|
+{
|
|
|
+ starpu_omp_shutdown();
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+\section Parallel Parallel Regions and Worksharing
|
|
|
+
|
|
|
+The SORS provides functions to create OpenMP parallel regions as well as
|
|
|
+to map work onto participating workers. The current implementation does
|
|
|
+not provide nested active parallel regions: Parallel regions may be
|
|
|
+created recursively, however only the first level parallel region may
|
|
|
+have more than one worker. From an internal point-of-view, the SORS'
|
|
|
+parallel regions are implemented as a set of implicit, extended semantics
|
|
|
+StarPU tasks, following the execution model of the OpenMP specification.
|
|
|
+Thus the SORS' parallel region tasks may block and be preempted, by
|
|
|
+SORS calls, enabling constructs such as barriers.
|
|
|
+
|
|
|
+\subsection OMPParallel Parallel Regions
|
|
|
+
|
|
|
+Parallel regions can be created with the function
|
|
|
+starpu_omp_parallel_region() which accepts a set of attributes as
|
|
|
+parameter. The execution of the calling task is suspended until the
|
|
|
+parallel region completes. The <c>attr.cl</c> field is a regular StarPU
|
|
|
+codelet. However only CPU codelets are supported for parallel regions.
|
|
|
+Here is an example of use:
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ pthread_t tid = pthread_self();
|
|
|
+ int worker_id = starpu_worker_get_id();
|
|
|
+ printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
|
|
|
+}
|
|
|
+
|
|
|
+void f(void)
|
|
|
+{
|
|
|
+ struct starpu_omp_parallel_region_attr attr;
|
|
|
+ memset(&attr, 0, sizeof(attr));
|
|
|
+ attr.cl.cpu_funcs[0] = parallel_region_f;
|
|
|
+ attr.cl.where = STARPU_CPU;
|
|
|
+ attr.if_clause = 1;
|
|
|
+ starpu_omp_parallel_region(&attr);
|
|
|
+ return;
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+\subsection OMPFor Parallel For
|
|
|
+
|
|
|
+OpenMP <c>for</c> loops are provided by the starpu_omp_for() group of
|
|
|
+functions. Variants are available for inline or outlined
|
|
|
+implementations. The SORS supports <c>static</c>, <c>dynamic</c>, and
|
|
|
+<c>guided</c> loop scheduling clauses. The <c>auto</c> scheduling clause
|
|
|
+is implemented as <c>static</c>. The <c>runtime</c> scheduling clause
|
|
|
+honors the scheduling mode selected through the environment variable
|
|
|
+OMP_SCHEDULE or the starpu_omp_set_schedule() function. For loops with
|
|
|
+the <c>ordered</c> clause are also supported. An implicit barrier can be
|
|
|
+enforced or skipped at the end of the worksharing construct, according
|
|
|
+to the value of the <c>nowait</c> parameter.
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void for_g(unsigned long long i, unsigned long long nb_i, void *arg)
|
|
|
+{
|
|
|
+ (void) arg;
|
|
|
+ for (; nb_i > 0; i++, nb_i--)
|
|
|
+ {
|
|
|
+ array[i] = 1;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ starpu_omp_for(for_g, NULL, NB_ITERS, CHUNK, starpu_omp_sched_static, 0, 0);
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+\subsection OMPSections Sections
|
|
|
+OpenMP <c>sections</c> worksharing constructs are supported using the
|
|
|
+set of starpu_omp_sections() variants. The general principle is either
|
|
|
+to provide an array of per-section functions or a single function that
|
|
|
+will redirect the execution to the suitable per-section function. An
|
|
|
+implicit barrier can be enforced or skipped at the end of the
|
|
|
+worksharing construct, according to the value of the <c>nowait</c>
|
|
|
+parameter.
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+
|
|
|
+ section_funcs[0] = f;
|
|
|
+ section_funcs[1] = g;
|
|
|
+ section_funcs[2] = h;
|
|
|
+ section_funcs[3] = i;
|
|
|
+
|
|
|
+ section_args[0] = arg_f;
|
|
|
+ section_args[1] = arg_g;
|
|
|
+ section_args[2] = arg_h;
|
|
|
+ section_args[3] = arg_i;
|
|
|
+
|
|
|
+ starpu_omp_sections(4, section_funcs, section_args, 0);
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+\subsection OMPSingle Single
|
|
|
+OpenMP <c>single</c> worksharing constructs are supported using the set
|
|
|
+of starpu_omp_single() variants. An
|
|
|
+implicit barrier can be enforced or skipped at the end of the
|
|
|
+worksharing construct, according to the value of the <c>nowait</c>
|
|
|
+parameter.
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void single_f(void *arg)
|
|
|
+{
|
|
|
+ (void) arg;
|
|
|
+ pthread_t tid = pthread_self();
|
|
|
+ int worker_id = starpu_worker_get_id();
|
|
|
+ printf("[tid %p] task thread = %d -- single\n", (void *)tid, worker_id);
|
|
|
+}
|
|
|
+
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ starpu_omp_single(single_f, NULL, 0);
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+The SORS also provides dedicated support for <c>single</c> sections
|
|
|
+with <c>copyprivate</c> clauses through the
|
|
|
+starpu_omp_single_copyprivate() function variants. The OpenMP
|
|
|
+<c>master</c> directive is supported as well using the
|
|
|
+starpu_omp_master() function variants.
|
|
|
+
|
|
|
+\section Task Tasks
|
|
|
+
|
|
|
+The SORS implements the necessary support of OpenMP 3.1 and OpenMP 4.0's
|
|
|
+so-called explicit tasks, together with OpenMP 4.0's data dependency
|
|
|
+management.
|
|
|
+
|
|
|
+\subsection OMPTask Explicit Tasks
|
|
|
+Explicit OpenMP tasks are created with the SORS using the
|
|
|
+starpu_omp_task_region() function. The implementation supports
|
|
|
+<c>if</c>, <c>final</c>, <c>untied</c> and <c>mergeable</c> clauses
|
|
|
+as defined in the OpenMP specification. Unless specified otherwise by
|
|
|
+the appropriate clause(s), the created task may be executed by any
|
|
|
+participating worker of the current parallel region.
|
|
|
+
|
|
|
+The current SORS implementation requires explicit tasks to be created
|
|
|
+within the context of an active parallel region. In particular, an
|
|
|
+explicit task cannot be created by the main thread outside of a parallel
|
|
|
+region. Explicit OpenMP tasks created using starpu_omp_task_region() are
|
|
|
+implemented as StarPU tasks with extended semantics, and may as such be
|
|
|
+blocked and preempted by SORS routines.
|
|
|
+
|
|
|
+The current SORS implementation supports recursive explicit task
|
|
|
+creation, to ensure compliance with the OpenMP specification. However,
|
|
|
+it should be noted that StarPU is not designed nor optimized for
|
|
|
+efficient scheduling of recursive task applications.
|
|
|
+
|
|
|
+The code below shows how to create 4 explicit tasks within a parallel
|
|
|
+region.
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void task_region_g(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ pthread_t tid = pthread_self();
|
|
|
+ int worker_id = starpu_worker_get_id();
|
|
|
+ printf("[tid %p] task thread = %d: explicit task \"g\"\n", (void *)tid, worker_id);
|
|
|
+}
|
|
|
+
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ struct starpu_omp_task_region_attr attr;
|
|
|
+
|
|
|
+ memset(&attr, 0, sizeof(attr));
|
|
|
+ attr.cl.cpu_funcs[0] = task_region_g;
|
|
|
+ attr.cl.where = STARPU_CPU;
|
|
|
+ attr.if_clause = 1;
|
|
|
+ attr.final_clause = 0;
|
|
|
+ attr.untied_clause = 1;
|
|
|
+ attr.mergeable_clause = 0;
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+\subsection DataDependencies Data Dependencies
|
|
|
+The SORS implements inter-task data dependencies as specified in OpenMP
|
|
|
+4.0. Data dependencies are expressed using regular StarPU data handles
|
|
|
+(<c>starpu_data_handle_t</c>) plugged into the task's <c>attr.cl</c>
|
|
|
+codelet. The family of starpu_vector_data_register() -like functions and the
|
|
|
+starpu_data_lookup() function may be used to register a memory area and
|
|
|
+to retrieve the current data handle associated with a pointer
|
|
|
+respectively. The testcase <c>./tests/openmp/task_02.c</c> gives a
|
|
|
+detailed example of using OpenMP 4.0 task dependencies with the SORS
|
|
|
+implementation.
|
|
|
+
|
|
|
+Note: the OpenMP 4.0 specification only supports data dependencies
|
|
|
+between sibling tasks, that is tasks created by the same implicit or
|
|
|
+explicit parent task. The current SORS implementation also only supports data
|
|
|
+dependencies between sibling tasks. Consequently the behaviour is
|
|
|
+unspecified if dependencies are expressed between tasks that have not
|
|
|
+been created by the same parent task.
|
|
|
+
|
|
|
+\subsection TaskSyncs TaskWait and TaskGroup
|
|
|
+The SORS implements both the <c>taskwait</c> and <c>taskgroup</c> OpenMP
|
|
|
+task synchronization constructs specified in OpenMP 4.0, with the
|
|
|
+starpu_omp_taskwait() and starpu_omp_taskgroup() functions respectively.
|
|
|
+
|
|
|
+An example of starpu_omp_taskwait() use, creating two explicit tasks and
|
|
|
+waiting for their completion:
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void task_region_g(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ printf("Hello, World!\n");
|
|
|
+}
|
|
|
+
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ struct starpu_omp_task_region_attr attr;
|
|
|
+ memset(&attr, 0, sizeof(attr));
|
|
|
+ attr.cl.cpu_funcs[0] = task_region_g;
|
|
|
+ attr.cl.where = STARPU_CPU;
|
|
|
+ attr.if_clause = 1;
|
|
|
+ attr.final_clause = 0;
|
|
|
+ attr.untied_clause = 1;
|
|
|
+ attr.mergeable_clause = 0;
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+ starpu_omp_taskwait();
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+An example of starpu_omp_taskgroup() use, creating a task group of two explicit tasks:
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void task_region_g(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ printf("Hello, World!\n");
|
|
|
+}
|
|
|
+
|
|
|
+void taskgroup_f(void *arg)
|
|
|
+{
|
|
|
+ (void)arg;
|
|
|
+ struct starpu_omp_task_region_attr attr;
|
|
|
+ memset(&attr, 0, sizeof(attr));
|
|
|
+ attr.cl.cpu_funcs[0] = task_region_g;
|
|
|
+ attr.cl.where = STARPU_CPU;
|
|
|
+ attr.if_clause = 1;
|
|
|
+ attr.final_clause = 0;
|
|
|
+ attr.untied_clause = 1;
|
|
|
+ attr.mergeable_clause = 0;
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+ starpu_omp_task_region(&attr);
|
|
|
+}
|
|
|
+
|
|
|
+void parallel_region_f(void *buffers[], void *args)
|
|
|
+{
|
|
|
+ (void) buffers;
|
|
|
+ (void) args;
|
|
|
+ starpu_omp_taskgroup(taskgroup_f, (void *)NULL);
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+\section Synchronization Synchronization Support
|
|
|
+
|
|
|
+Synchronization objects and methods.
|
|
|
+
|
|
|
+*/
|