Browse Source

merge trunk

Nathalie Furmento 10 years ago
parent
commit
836c1e148c
64 changed files with 9394 additions and 87 deletions
  1. 2 0
      Makefile.am
  2. 21 1
      configure.ac
  3. 3 0
      doc/doxygen/Makefile.am
  4. 463 0
      doc/doxygen/chapters/22openmp_runtime_support.doxy
  5. 1 0
      doc/doxygen/chapters/45files.doxy
  6. 955 0
      doc/doxygen/chapters/api/openmp_runtime_support.doxy
  7. 1 0
      doc/doxygen/doxygen-config.cfg.in
  8. 1 0
      doc/doxygen/doxygen.cfg
  9. 7 0
      doc/doxygen/refman.tex
  10. 2 0
      include/starpu.h
  11. 3 0
      include/starpu_config.h.in
  12. 169 0
      include/starpu_openmp.h
  13. 7 1
      include/starpu_task.h
  14. 5 1
      src/Makefile.am
  15. 7 1
      src/core/dependencies/task_deps.c
  16. 189 49
      src/core/jobs.c
  17. 48 1
      src/core/jobs.h
  18. 7 1
      src/core/simgrid.c
  19. 91 6
      src/core/task.c
  20. 18 1
      src/core/task.h
  21. 6 0
      src/core/workers.c
  22. 4 0
      src/datawizard/coherency.c
  23. 5 0
      src/datawizard/coherency.h
  24. 129 18
      src/datawizard/interfaces/data_interface.c
  25. 10 0
      src/datawizard/interfaces/data_interface.h
  26. 15 2
      src/drivers/cpu/driver_cpu.c
  27. 11 1
      src/drivers/cuda/driver_cuda.c
  28. 60 4
      src/drivers/driver_common/driver_common.c
  29. 2403 0
      src/util/openmp_runtime_support.c
  30. 378 0
      src/util/openmp_runtime_support.h
  31. 941 0
      src/util/openmp_runtime_support_environment.c
  32. 279 0
      src/util/openmp_runtime_support_omp_api.c
  33. 124 0
      tests/Makefile.am
  34. 135 0
      tests/openmp/api_01.c
  35. 48 0
      tests/openmp/environment.c
  36. 34 0
      tests/openmp/init_exit_01.c
  37. 44 0
      tests/openmp/init_exit_02.c
  38. 62 0
      tests/openmp/parallel_01.c
  39. 79 0
      tests/openmp/parallel_02.c
  40. 63 0
      tests/openmp/parallel_03.c
  41. 71 0
      tests/openmp/parallel_barrier_01.c
  42. 83 0
      tests/openmp/parallel_critical_01.c
  43. 86 0
      tests/openmp/parallel_critical_inline_01.c
  44. 93 0
      tests/openmp/parallel_critical_named_01.c
  45. 86 0
      tests/openmp/parallel_critical_named_inline_01.c
  46. 189 0
      tests/openmp/parallel_for_01.c
  47. 90 0
      tests/openmp/parallel_for_02.c
  48. 207 0
      tests/openmp/parallel_for_ordered_01.c
  49. 83 0
      tests/openmp/parallel_master_01.c
  50. 77 0
      tests/openmp/parallel_master_inline_01.c
  51. 117 0
      tests/openmp/parallel_nested_lock_01.c
  52. 106 0
      tests/openmp/parallel_sections_01.c
  53. 100 0
      tests/openmp/parallel_sections_combined_01.c
  54. 107 0
      tests/openmp/parallel_simple_lock_01.c
  55. 91 0
      tests/openmp/parallel_single_copyprivate_01.c
  56. 88 0
      tests/openmp/parallel_single_copyprivate_inline_01.c
  57. 94 0
      tests/openmp/parallel_single_inline_01.c
  58. 83 0
      tests/openmp/parallel_single_nowait_01.c
  59. 83 0
      tests/openmp/parallel_single_wait_01.c
  60. 87 0
      tests/openmp/task_01.c
  61. 196 0
      tests/openmp/task_02.c
  62. 119 0
      tests/openmp/taskgroup_01.c
  63. 123 0
      tests/openmp/taskgroup_02.c
  64. 105 0
      tests/openmp/taskwait_01.c

+ 2 - 0
Makefile.am

@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2009-2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2014  Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -65,6 +66,7 @@ versinclude_HEADERS = 				\
 	include/starpu_fxt.h			\
 	include/starpu_cuda.h			\
 	include/starpu_opencl.h			\
+	include/starpu_openmp.h			\
 	include/starpu_sink.h			\
 	include/starpu_mic.h			\
 	include/starpu_scc.h			\

+ 21 - 1
configure.ac

@@ -3,7 +3,7 @@
 # Copyright (C) 2009-2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
+# Copyright (C) 2011, 2012, 2014  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -2031,6 +2031,25 @@ AM_CONDITIONAL([RUN_GCC_PLUGIN_TESTS],
 
 ###############################################################################
 #                                                                             #
+#                            OpenMP runtime support                           #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(openmp, [AS_HELP_STRING([--enable-openmp],
+			[build the OpenMP runtime support (experimental)])],
+			enable_openmp=$enableval, enable_openmp=no)
+
+AC_MSG_CHECKING(for OpenMP runtime support)
+
+if test x$enable_openmp = xyes; then
+	AC_DEFINE(STARPU_OPENMP, 1, [Define this to enable OpenMP runtime support])
+fi
+
+AM_CONDITIONAL([STARPU_OPENMP], [test "x$enable_openmp" = "xyes"])
+AC_MSG_RESULT($enable_openmp)
+
+###############################################################################
+#                                                                             #
 #                               SOCL interface                                #
 #                                                                             #
 ###############################################################################
@@ -2602,6 +2621,7 @@ AC_MSG_NOTICE([
 	       FFT Support:                                 $fft_support
 	       GCC plug-in:                                 $build_gcc_plugin
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
+	       OpenMP runtime support enabled:              $enable_openmp
 	       SOCL enabled:                                $build_socl
                SOCL test suite:                             $run_socl_check
                Scheduler Hypervisor:                        $build_sc_hypervisor

+ 3 - 0
doc/doxygen/Makefile.am

@@ -46,6 +46,7 @@ chapters =	\
 	chapters/19c_extensions.doxy \
 	chapters/20socl_opencl_extensions.doxy \
 	chapters/21simgrid.doxy \
+	chapters/22openmp_runtime_support.doxy \
 	chapters/40environment_variables.doxy \
 	chapters/41configure_options.doxy \
 	chapters/45files.doxy \
@@ -86,6 +87,7 @@ chapters =	\
 	chapters/api/mpi.doxy \
 	chapters/api/multiformat_data_interface.doxy \
 	chapters/api/opencl_extensions.doxy \
+	chapters/api/openmp_runtime_support.doxy \
 	chapters/api/mic_extensions.doxy \
 	chapters/api/scc_extensions.doxy \
 	chapters/api/parallel_tasks.doxy \
@@ -202,6 +204,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_cuda.h		\
 	$(top_srcdir)/include/starpu_opencl.h		\
+	$(top_srcdir)/include/starpu_openmp.h		\
 	$(top_srcdir)/include/starpu_sink.h		\
 	$(top_srcdir)/include/starpu_mic.h		\
 	$(top_srcdir)/include/starpu_scc.h		\

+ 463 - 0
doc/doxygen/chapters/22openmp_runtime_support.doxy

@@ -0,0 +1,463 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2014 Inria
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page OpenMPRuntimeSupport The StarPU OpenMP Runtime Support (SORS)
+
+StarPU provides the necessary routines and support to implement an <a
+href="http://www.openmp.org/">OpenMP</a> runtime compliant with the
+revision 3.1 of the language specification, and compliant with the
+task-related data dependency functionalities introduced in the revision
+4.0 of the language. This StarPU OpenMP Runtime Support (SORS) has been
+designed to be targetted by OpenMP compilers such as the Klang-OMP
+compiler. Most supported OpenMP directives can both be implemented
+inline or as outlined functions.
+
+All functions are defined in \ref API_OpenMP_Runtime_Support.
+
+\section Implementation Implementation Details and Specificities
+
+\subsection MainThread Main Thread
+
+When using the SORS, the main thread gets involved in executing OpenMP tasks
+just like every other threads, in order to be compliant with the
+specification execution model. This contrasts with StarPU's usual
+execution model where the main thread submit tasks but does not take
+part in executing them.
+
+\subsection TaskSemantics Extended Task Semantics
+
+The semantics of tasks generated by the SORS are extended with respect
+to regular StarPU tasks in that SORS' tasks may block and be preempted
+by SORS call, whereas regular StarPU tasks cannot. SORS tasks may
+coexist with regular StarPU tasks. However, only the tasks created using
+SORS API functions inherit from extended semantics.
+
+\section Configuration Configuration
+
+The SORS can be compiled into <c>libstarpu</c>
+by providing the <c>--enable-openmp</c> flag to StarPU's
+<c>configure</c>. Conditional compiled source codes may check for the
+availability of the OpenMP Runtime Support by testing whether the C
+preprocessor macro <c>STARPU_OPENMP</c> is defined or not.
+
+\section InitExit Initialization and Shutdown
+
+The SORS needs to be executed/terminated by the
+starpu_omp_init()/starpu_omp_shutdown() instead of
+starpu_init()/starpu_shutdown(). This requirement is necessary to make
+sure that the main thread gets the proper execution environment to run
+OpenMP tasks. These calls will usually be performed by a compiler
+runtime. Thus, they can be executed from a constructor/destructor such
+as this:
+
+\code{.c}
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+\endcode
+
+\sa starpu_omp_init()
+\sa starpu_omp_shutdown()
+
+\section Parallel Parallel Regions and Worksharing
+
+The SORS provides functions to create OpenMP parallel regions as well as
+mapping work on participating workers. The current implementation does
+not provide nested active parallel regions: Parallel regions may be
+created recursively, however only the first level parallel region may
+have more than one worker. From an internal point-of-view, the SORS'
+parallel regions are implemented as a set of implicit, extended semantics
+StarPU tasks, following the execution model of the OpenMP specification.
+Thus the SORS' parallel region tasks may block and be preempted, by
+SORS calls, enabling constructs such as barriers.
+
+\subsection OMPParallel Parallel Regions
+
+Parallel regions can be created with the function
+starpu_omp_parallel_region() which accepts a set of attributes as
+parameter. The execution of the calling task is suspended until the
+parallel region completes. The <c>attr.cl</c> field is a regular StarPU
+codelet. However only CPU codelets are supported for parallel regions.
+Here is an example of use:
+
+\code{.c}
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	pthread_t tid = pthread_self();
+	int worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+}
+
+void f(void)
+{
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+\endcode
+
+\sa struct starpu_omp_parallel_region_attr
+\sa starpu_omp_parallel_region()
+
+\subsection OMPFor Parallel For
+
+OpenMP <c>for</c> loops are provided by the starpu_omp_for() group of
+functions. Variants are available for inline or outlined
+implementations. The SORS supports <c>static</c>, <c>dynamic</c>, and
+<c>guided</c> loop scheduling clauses. The <c>auto</c> scheduling clause
+is implemented as <c>static</c>. The <c>runtime</c> scheduling clause
+honors the scheduling mode selected through the environment variable
+OMP_SCHEDULE or the starpu_omp_set_schedule() function. For loops with
+the <c>ordered</c> clause are also supported. An implicit barrier can be
+enforced or skipped at the end of the worksharing construct, according
+to the value of the <c>nowait</c> parameter.
+
+The canonical family of starpu_omp_for() functions provide each instance
+with the first iteration number and the number of iterations (possibly
+zero) to perform. The alternate family of starpu_omp_for_alt() functions
+provide each instance with the (possibly empty) range of iterations to
+perform, including the first and excluding the last.
+
+The family of starpu_omp_ordered() functions enable to implement
+OpenMP's ordered construct, a region with a parallel for loop that is
+guaranteed to be executed in the sequential order of the loop
+iterations.
+
+\code{.c}
+void for_g(unsigned long long i, unsigned long long nb_i, void *arg)
+{
+	(void) arg;
+	for (; nb_i > 0; i++, nb_i--)
+	{
+		array[i] = 1;
+	}
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	starpu_omp_for(for_g, NULL, NB_ITERS, CHUNK, starpu_omp_sched_static, 0, 0);
+}
+\endcode
+
+\sa starpu_omp_for()
+\sa starpu_omp_for_inline_first()
+\sa starpu_omp_for_inline_next()
+\sa starpu_omp_for_alt()
+\sa starpu_omp_for_inline_first_alt()
+\sa starpu_omp_for_inline_next_alt()
+\sa starpu_omp_ordered()
+\sa starpu_omp_ordered_inline_begin()
+\sa starpu_omp_ordered_inline_end()
+
+
+\subsection OMPSections Sections
+OpenMP <c>sections</c> worksharing constructs are supported using the
+set of starpu_omp_sections() variants. The general principle is either
+to provide an array of per-section functions or a single function that
+will redirect to execution to the suitable per-section functions. An
+implicit barrier can be enforced or skipped at the end of the
+worksharing construct, according to the value of the <c>nowait</c>
+parameter.
+
+\code{.c}
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+
+	section_funcs[0] = f;
+	section_funcs[1] = g;
+	section_funcs[2] = h;
+	section_funcs[3] = i;
+
+	section_args[0] = arg_f;
+	section_args[1] = arg_g;
+	section_args[2] = arg_h;
+	section_args[3] = arg_i;
+
+	starpu_omp_sections(4, section_f, section_args, 0);
+}
+\endcode
+
+\sa starpu_omp_sections()
+\sa starpu_omp_sections_combined()
+
+\subsection OMPSingle Single
+OpenMP <c>single</c> workharing constructs are supported using the set
+of starpu_omp_single() variants. An
+implicit barrier can be enforced or skipped at the end of the
+worksharing construct, according to the value of the <c>nowait</c>
+parameter.
+
+\code{.c}
+void single_f(void *arg)
+{
+	(void) arg;
+	pthread_t tid = pthread_self();
+	int worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- single\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	starpu_omp_single(single_f, NULL, 0);
+}
+\endcode
+
+The SORS also provides dedicated support for  <c>single</c> sections
+with <c>copyprivate</c> clauses through the
+starpu_omp_single_copyprivate() function variants. The OpenMP
+<c>master</c> directive is supported as well using the
+starpu_omp_master() function variants.
+
+\sa starpu_omp_master()
+\sa starpu_omp_master_inline()
+\sa starpu_omp_single()
+\sa starpu_omp_single_inline()
+\sa starpu_omp_single_copyprivate()
+\sa starpu_omp_single_copyprivate_inline_begin()
+\sa starpu_omp_single_copyprivate_inline_end()
+
+\section Task Tasks
+
+The SORS implements the necessary support of OpenMP 3.1 and OpenMP 4.0's
+so-called explicit tasks, together with OpenMP 4.0's data dependency
+management.
+
+\subsection OMPTask Explicit Tasks
+Explicit OpenMP tasks are created with the SORS using the
+starpu_omp_task_region() function. The implementation supports
+<c>if</c>, <c>final</c>, <c>untied</c> and <c>mergeable</c> clauses
+as defined in the OpenMP specification. Unless specified otherwise by
+the appropriate clause(s), the created task may be executed by any
+participating worker of the current parallel region.
+
+The current SORS implementation requires explicit tasks to be created
+within the context of an active parallel region. In particular, an
+explicit task cannot be created by the main thread outside of a parallel
+region. Explicit OpenMP tasks created using starpu_omp_task_region() are
+implemented as StarPU tasks with extended semantics, and may as such be
+blocked and preempted by SORS routines.
+
+The current SORS implementation supports recursive explicit tasks
+creation, to ensure compliance with the OpenMP specification. However,
+it should be noted that StarPU is not designed nor optimized for
+efficiently scheduling of recursive task applications.
+
+The code below shows how to create 4 explicit tasks within a parallel
+region.
+
+\code{.c}
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	pthread tid = pthread_self();
+	int worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: explicit task \"g\"\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	struct starpu_omp_task_region_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+}
+\endcode
+
+\sa struct starpu_omp_task_region_attr
+\sa starpu_omp_task_region()
+
+\subsection DataDependencies Data Dependencies
+The SORS implements inter-tasks data dependencies as specified in OpenMP
+4.0. Data dependencies are expressed using regular StarPU data handles
+(<c>starpu_data_handle_t</c>) plugged into the task's <c>attr.cl</c>
+codelet. The family of starpu_vector_data_register() -like functions and the
+starpu_data_lookup() function may be used to register a memory area and
+to retrieve the current data handle associated with a pointer
+respectively. The testcase <c>./tests/openmp/task_02.c</c> gives a
+detailed example of using OpenMP 4.0 tasks dependencies with the SORS
+implementation.
+
+Note: the OpenMP 4.0 specification only supports data dependencies
+between sibling tasks, that is tasks created by the same implicit or
+explicit parent task. The current SORS implementation also only supports data
+dependencies between sibling tasks. Consequently the behaviour is
+unspecified if dependencies are expressed beween tasks that have not
+been created by the same parent task.
+
+\subsection TaskSyncs TaskWait and TaskGroup
+The SORS implements both the <c>taskwait</c> and <c>taskgroup</c> OpenMP
+task synchronization constructs specified in OpenMP 4.0, with the
+starpu_omp_taskwait() and starpu_omp_taskgroup() functions respectively.
+
+An example of starpu_omp_taskwait() use, creating two explicit tasks and
+waiting for their completion:
+
+\code{.c}
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	printf("Hello, World!\n");
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	struct starpu_omp_task_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+	starpu_omp_taskwait();
+\endcode
+
+An example of starpu_omp_taskgroup() use, creating a task group of two explicit tasks:
+
+\code{.c}
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	printf("Hello, World!\n");
+}
+
+void taskgroup_f(void *arg)
+{
+	(void)arg;
+	struct starpu_omp_task_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	starpu_omp_taskgroup(taskgroup_f, (void *)NULL);
+}
+\endcode
+
+\sa starpu_omp_task_region()
+\sa starpu_omp_taskwait()
+\sa starpu_omp_taskgroup()
+\sa starpu_omp_taskgroup_inline_begin()
+\sa starpu_omp_taskgroup_inline_end()
+
+\section Synchronization Synchronization Support
+
+The SORS implements objects and method to build common OpenMP
+synchronization constructs.
+
+\subsection SimpleLock Simple Locks
+
+The SORS Simple Locks are opaque starpu_omp_lock_t objects enabling multiple
+tasks to synchronize with each others, following the Simple Lock
+constructs defined by the OpenMP specification. In accordance with such
+specification, simple locks may not by acquired multiple times by the
+same task, without being released in-between; otherwise, deadlocks may
+result. Codes requiring the possibility to lock multiple times
+recursively should use Nestable Locks (\ref NestableLock). Codes NOT
+requiring the possibility to lock multiple times recursively should use
+Simple Locks as they incur less processing overhead than Nestable Locks.
+
+\sa starpu_omp_lock_t
+\sa starpu_omp_init_lock()
+\sa starpu_omp_destroy_lock()
+\sa starpu_omp_set_lock()
+\sa starpu_omp_unset_lock()
+\sa starpu_omp_test_lock()
+
+\subsection NestableLock Nestable Locks
+
+The SORS Nestable Locks are opaque starpu_omp_nest_lock_t objects enabling
+multiple tasks to synchronize with each others, following the Nestable
+Lock constructs defined by the OpenMP specification. In accordance with
+such specification, nestable locks may by acquired multiple times
+recursively by the same task without deadlocking. Nested locking and
+unlocking operations must be well parenthesized at any time, otherwise
+deadlock and/or undefined behaviour may occur.  Codes requiring the
+possibility to lock multiple times recursively should use Nestable
+Locks. Codes NOT requiring the possibility to lock multiple times
+recursively should use Simple Locks (\ref SimpleLock) instead, as they
+incur less processing overhead than Nestable Locks.
+
+\sa starpu_omp_nest_lock_t
+\sa starpu_omp_init_nest_lock()
+\sa starpu_omp_destroy_nest_lock()
+\sa starpu_omp_set_nest_lock()
+\sa starpu_omp_unset_nest_lock()
+\sa starpu_omp_test_nest_lock()
+
+\subsection Critical Critical Sections
+
+The SORS implements support for OpenMP critical sections through the
+family of starpu_omp_critical functions. Critical sections may optionally
+be named. There is a single, common anonymous critical section. Mutual
+exclusion only occur within the scope of single critical section, either
+a named one or the anonymous one.
+
+\sa starpu_omp_critical()
+\sa starpu_omp_critical_inline_begin()
+\sa starpu_omp_critical_inline_end()
+
+\subsection Barrier Barriers
+
+The SORS provides the starpu_omp_barrier() function to implement
+barriers over parallel region teams. In accordance with the OpenMP
+specification, the starpu_omp_barrier() function waits for every
+implicit task of the parallel region to reach the barrier and every
+explicit task launched by the parallel region to complete, before
+returning.
+
+\sa starpu_omp_barrier()
+
+*/

+ 1 - 0
doc/doxygen/chapters/45files.doxy

@@ -26,6 +26,7 @@
 \file starpu_hash.h
 \file starpu_mic.h
 \file starpu_opencl.h
+\file starpu_openmp.h
 \file starpu_perfmodel.h
 \file starpu_profiling.h
 \file starpu_rand.h

+ 955 - 0
doc/doxygen/chapters/api/openmp_runtime_support.doxy

@@ -0,0 +1,955 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2014 Inria
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_OpenMP_Runtime_Support OpenMP Runtime Support
+
+\brief This section describes the interface provided for implementing OpenMP runtimes on top of StarPU.
+
+
+\struct starpu_omp_lock_t
+\ingroup API_OpenMP_Runtime_Support
+Opaque Simple Lock object (\ref SimpleLock) for inter-task synchronization operations.
+
+\sa starpu_omp_init_lock()
+\sa starpu_omp_destroy_lock()
+\sa starpu_omp_set_lock()
+\sa starpu_omp_unset_lock()
+\sa starpu_omp_test_lock()
+
+\var starpu_omp_lock_t::internal
+Is an opaque pointer for internal use.
+
+
+\struct starpu_omp_nest_lock_t
+\ingroup API_OpenMP_Runtime_Support
+Opaque Nestable Lock object (\ref NestableLock) for inter-task synchronization operations.
+
+\sa starpu_omp_init_nest_lock()
+\sa starpu_omp_destroy_nest_lock()
+\sa starpu_omp_set_nest_lock()
+\sa starpu_omp_unset_nest_lock()
+\sa starpu_omp_test_nest_lock()
+\var starpu_omp_nest_lock_t::internal
+Is an opaque pointer for internal use.
+
+
+\enum starpu_omp_sched_value
+\ingroup API_OpenMP_Runtime_Support
+Set of constants for selecting the for loop iteration scheduling algorithm
+(\ref OMPFor) as defined by the OpenMP specification.
+
+\var starpu_omp_sched_value::starpu_omp_sched_undefined
+\ingroup API_OpenMP_Runtime_Support
+Undefined iteration scheduling algorithm.
+
+\var starpu_omp_sched_value::starpu_omp_sched_static
+\ingroup API_OpenMP_Runtime_Support
+\b Static iteration scheduling algorithm.
+
+\var starpu_omp_sched_value::starpu_omp_sched_dynamic
+\ingroup API_OpenMP_Runtime_Support
+\b Dynamic iteration scheduling algorithm.
+
+\var starpu_omp_sched_value::starpu_omp_sched_guided
+\ingroup API_OpenMP_Runtime_Support
+\b Guided iteration scheduling algorithm.
+
+\var starpu_omp_sched_value::starpu_omp_sched_auto
+\ingroup API_OpenMP_Runtime_Support
+\b Automatically choosen iteration scheduling algorithm.
+
+\var starpu_omp_sched_value::starpu_omp_sched_runtime
+\ingroup API_OpenMP_Runtime_Support
+Choice of iteration scheduling algorithm deferred at \b runtime.
+
+\sa starpu_omp_for()
+\sa starpu_omp_for_inline_first()
+\sa starpu_omp_for_inline_next()
+\sa starpu_omp_for_alt()
+\sa starpu_omp_for_inline_first_alt()
+\sa starpu_omp_for_inline_next_alt()
+
+
+\enum starpu_omp_proc_bind_value
+\ingroup API_OpenMP_Runtime_Support
+Set of constants for selecting the processor binding method, as defined in the
+OpenMP specification.
+
+\var starpu_omp_proc_bind_value::starpu_omp_proc_bind_undefined
+\ingroup API_OpenMP_Runtime_Support
+Undefined processor binding method.
+
+\var starpu_omp_proc_bind_value::starpu_omp_proc_bind_false
+\ingroup API_OpenMP_Runtime_Support
+Team threads may be moved between places at any time.
+
+\var starpu_omp_proc_bind_value::starpu_omp_proc_bind_true
+\ingroup API_OpenMP_Runtime_Support
+Team threads may not be moved between places.
+
+\var starpu_omp_proc_bind_value::starpu_omp_proc_bind_master
+\ingroup API_OpenMP_Runtime_Support
+Assign every thread in the team to the same place as the \b master thread.
+
+\var starpu_omp_proc_bind_value::starpu_omp_proc_bind_close
+\ingroup API_OpenMP_Runtime_Support
+Assign every thread in the team to a place \b close to the parent thread.
+
+\var starpu_omp_proc_bind_value::starpu_omp_proc_bind_spread
+\ingroup API_OpenMP_Runtime_Support
+Assign team threads as a sparse distribution over the selected places.
+
+\sa starpu_omp_get_proc_bind()
+
+
+\struct starpu_omp_parallel_region_attr
+\ingroup API_OpenMP_Runtime_Support
+Set of attributes used for creating a new parallel region.
+
+\sa starpu_omp_parallel_region()
+
+\var starpu_omp_parallel_region_attr::cl
+
+Is a ::starpu_codelet (\ref API_Codelet_And_Tasks) to use for the parallel region
+implicit tasks. The codelet must provide a CPU implementation function.
+
+\var starpu_omp_parallel_region_attr::handles
+
+Is an array of zero or more ::starpu_data_handle_t data handle to be passed to
+the parallel region implicit tasks.
+
+\var starpu_omp_parallel_region_attr::cl_arg
+
+Is an optional pointer to an inline argument to be passed to the region implicit tasks.
+
+\var starpu_omp_parallel_region_attr::cl_arg_size
+
+Is the size of the optional inline argument to be passed to the region implicit tasks, or 0 if unused.
+
+\var starpu_omp_parallel_region_attr::cl_arg_free
+
+Is a boolean indicating whether the optional inline argument should be automatically freed (true), or not (false).
+
+\var starpu_omp_parallel_region_attr::if_clause
+
+Is a boolean indicating whether the \b if clause of the corresponding <c>pragma
+omp parallel</c> is true or false.
+
+\var starpu_omp_parallel_region_attr::num_threads
+
+Is an integer indicating the requested number of threads in the team of the
+newly created parallel region, or 0 to let the runtime choose the number of
+threads alone. This attribute may be ignored by the runtime system if the
+requested number of threads is higher than the number of threads that the
+runtime can create.
+
+\struct starpu_omp_task_region_attr
+\ingroup API_OpenMP_Runtime_Support
+Set of attributes used for creating a new task region.
+
+\sa starpu_omp_task_region()
+
+\var starpu_omp_task_region_attr::cl
+
+Is a ::starpu_codelet (\ref API_Codelet_And_Tasks) to use for the task region
+explicit task. The codelet must provide a CPU implementation function or an
+accelerator implementation for offloaded target regions.
+
+\var starpu_omp_task_region_attr::handles
+
+Is an array of zero or more ::starpu_data_handle_t data handle to be passed to
+the task region explicit tasks.
+
+\var starpu_omp_task_region_attr::cl_arg
+
+Is an optional pointer to an inline argument to be passed to the region implicit tasks.
+
+\var starpu_omp_task_region_attr::cl_arg_size
+
+Is the size of the optional inline argument to be passed to the region implicit
+tasks, or 0 if unused.
+
+\var starpu_omp_task_region_attr::cl_arg_free
+
+Is a boolean indicating whether the optional inline argument should be
+automatically freed (true), or not (false).
+
+\var starpu_omp_task_region_attr::if_clause
+
+Is a boolean indicating whether the \b if clause of the corresponding <c>pragma
+omp task</c> is true or false.
+
+\var starpu_omp_task_region_attr::final_clause
+
+Is a boolean indicating whether the \b final clause of the corresponding <c>pragma
+omp task</c> is true or false.
+
+\var starpu_omp_task_region_attr::untied_clause
+
+Is a boolean indicating whether the \b untied clause of the corresponding <c>pragma
+omp task</c> is true or false.
+
+\var starpu_omp_task_region_attr::mergeable_clause
+
+Is a boolean indicating whether the \b mergeable clause of the corresponding <c>pragma
+omp task</c> is true or false.
+
+@name Initialisation
+\ingroup API_OpenMP_Runtime_Support
+
+\def STARPU_OPENMP
+\ingroup API_OpenMP_Runtime_Support
+This macro is defined when StarPU has been installed with OpenMP Runtime
+support. It should be used in your code to detect the availability of
+the runtime support for OpenMP.
+
+\fn int starpu_omp_init(void)
+\ingroup API_OpenMP_Runtime_Support
+Initializes StarPU and its OpenMP Runtime support.
+
+\fn int starpu_omp_shutdown(void)
+\ingroup API_OpenMP_Runtime_Support
+Shutdown StarPU and its OpenMP Runtime support.
+
+@name Parallel
+\anchor ORS_Parallel
+\ingroup API_OpenMP_Runtime_Support
+
+\fn void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *attr)
+\ingroup API_OpenMP_Runtime_Support
+Generates and launch an OpenMP parallel region and return after its
+completion. \p attr specifies the attributes for the generated parallel region.
+If this function is called from inside another, generating, parallel region, the
+generated parallel region is nested within the generating parallel region.
+
+This function can be used to implement <c>\#pragma omp parallel</c>.
+
+\fn void starpu_omp_master(void (*f)(void *arg), void *arg)
+\ingroup API_OpenMP_Runtime_Support
+Executes a function only on the master thread of the OpenMP
+parallel region it is called from. When called from a thread that is not the
+master of the parallel region it is called from, this function does nothing. \p
+f is the function to be called. \p arg is an argument passed to function \p f.
+
+This function can be used to implement <c>\#pragma omp master</c>.
+
+\fn int starpu_omp_master_inline(void)
+\ingroup API_OpenMP_Runtime_Support
+Determines whether the calling thread is the master of the OpenMP parallel region
+it is called from or not.
+
+This function can be used to implement <c>\#pragma omp master</c> without code
+outlining.
+\return <c>!0</c> if called by the region's master thread.
+\return <c>0</c> if not called by the region's master thread.
+
+@name Synchronization
+\anchor ORS_Synchronization
+\ingroup API_OpenMP_Runtime_Support
+
+\fn void starpu_omp_barrier(void)
+\ingroup API_OpenMP_Runtime_Support
+Waits until each participating thread of the innermost OpenMP parallel region
+has reached the barrier and each explicit OpenMP task bound to this region has
+completed its execution.
+
+This function can be used to implement <c>\#pragma omp barrier</c>.
+
+\fn void starpu_omp_critical(void (*f)(void *arg), void *arg, const char *name)
+\ingroup API_OpenMP_Runtime_Support
+Waits until no other thread is executing within the context of the selected
+critical section, then proceeds to the exclusive execution of a function within
+the critical section. \p f is the function to be executed in the critical
+section. \p arg is an argument passed to function \p f. \p name is the name of
+the selected critical section. If <c>name == NULL</c>, the selected critical
+section is the unique anonymous critical section.
+
+This function can be used to implement <c>\#pragma omp critical</c>.
+
+\fn void starpu_omp_critical_inline_begin(const char *name)
+\ingroup API_OpenMP_Runtime_Support
+Waits until execution can proceed exclusively within the context of the
+selected critical section. \p name is the name of the selected critical
+section. If <c>name == NULL</c>, the selected critical section is the unique
+anonymous critical section.
+
+This function together with #starpu_omp_critical_inline_end can be used to
+implement <c>\#pragma omp critical</c> without code outlining.
+
+\fn void starpu_omp_critical_inline_end(const char *name)
+\ingroup API_OpenMP_Runtime_Support
+Ends the exclusive execution within the context of the selected critical
+section. \p name is the name of the selected critical section. If
+<c>name==NULL</c>, the selected critical section is the unique anonymous
+critical section.
+
+This function together with #starpu_omp_critical_inline_begin can be used to
+implement <c>\#pragma omp critical</c> without code outlining.
+
+@name Worksharing
+\anchor ORS_Worksharing
+\ingroup API_OpenMP_Runtime_Support
+
+\fn void starpu_omp_single(void (*f)(void *arg), void *arg, int nowait)
+\ingroup API_OpenMP_Runtime_Support
+Ensures that a single participating thread of the innermost OpenMP parallel
+region executes a function. \p f is the function to be executed by a single
+thread. \p arg is an argument passed to function \p f. \p nowait is a flag
+indicating whether an implicit barrier is requested after the single section
+(<c>nowait==0</c>) or not (<c>nowait==!0</c>).
+
+This function can be used to implement <c>\#pragma omp single</c>.
+
+\fn int starpu_omp_single_inline(void)
+\ingroup API_OpenMP_Runtime_Support
+Decides whether the current thread is elected to run the following single
+section among the participating threads of the innermost OpenMP parallel
+region.
+
+This function can be used to implement <c>\#pragma omp single</c> without code
+outlining.
+\return <c>!0</c> if the calling thread has won the election.
+\return <c>0</c> if the calling thread has lost the election.
+
+\fn void starpu_omp_single_copyprivate(void (*f)(void *arg, void *data, unsigned long long data_size), void *arg, void *data, unsigned long long data_size)
+\ingroup API_OpenMP_Runtime_Support
+This function executes \p f on a single task of the current parallel region
+task, and then broadcast the contents of the memory block pointed by the
+copyprivate pointer \p data and of size \p data_size to the corresponding \p
+data pointed memory blocks of all the other participating region tasks. This
+function can be used to implement <c>\#pragma omp single</c> with a copyprivate
+clause.
+
+\sa starpu_omp_single_copyprivate_inline
+\sa starpu_omp_single_copyprivate_inline_begin
+\sa starpu_omp_single_copyprivate_inline_end
+
+\fn void *starpu_omp_single_copyprivate_inline_begin(void *data)
+\ingroup API_OpenMP_Runtime_Support
+This function elects one task among the tasks of the current parallel region
+task to execute the following single section, and then broadcast the
+copyprivate pointer \p data to all the other participating region tasks. This
+function can be used to implement <c>\#pragma omp single</c> with a copyprivate
+clause without code outlining.
+
+\sa starpu_omp_single_copyprivate_inline
+\sa starpu_omp_single_copyprivate_inline_end
+
+\fn void starpu_omp_single_copyprivate_inline_end(void)
+\ingroup API_OpenMP_Runtime_Support
+This function completes the execution of a single section and returns the
+broadcasted copyprivate pointer for tasks that lost the election and NULL for
+the task that won the election. This function can be used to implement
+<c>\#pragma omp single</c> with a copyprivate clause without code outlining.
+
+\return the copyprivate pointer for tasks that lost the election and therefore did not execute the code of the single section.
+\return NULL for the task that won the election and executed the code of the single section.
+
+\sa starpu_omp_single_copyprivate_inline
+\sa starpu_omp_single_copyprivate_inline_begin
+
+\fn void starpu_omp_for(void (*f)(unsigned long long _first_i, unsigned long long _nb_i, void *arg), void *arg, unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, int nowait)
+\ingroup API_OpenMP_Runtime_Support
+Executes a parallel loop together with the other threads participating to the
+innermost parallel region. \p f is the function to be executed iteratively. \p
+arg is an argument passed to function \p f. \p nb_iterations is the number of
+iterations to be performed by the parallel loop. \p chunk is the number of
+consecutive iterations that should be affected to the same thread when
+scheduling the loop workshares, it follows the semantics of the \c modifier
+argument in OpenMP <c>\#pragma omp for</c> specification. \p schedule is the
+scheduling mode according to the OpenMP specification. \p ordered is a flag
+indicating whether the loop region may contain an ordered section
+(<c>ordered==!0</c>) or not (<c>ordered==0</c>). \p nowait is a flag
+indicating whether an implicit barrier is requested after the for section
+(<c>nowait==0</c>) or not (<c>nowait==!0</c>).
+
+The function \p f will be called with arguments \p _first_i, the first iteration
+to perform, \p _nb_i, the number of consecutive iterations to perform before
+returning, \p arg, the free \p arg argument.
+
+This function can be used to implement <c>\#pragma omp for</c>.
+
+\fn int starpu_omp_for_inline_first(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_first_i, unsigned long long *_nb_i)
+\ingroup API_OpenMP_Runtime_Support
+Decides whether the current thread should start to execute a parallel loop
+section. See #starpu_omp_for for the argument description.
+
+This function together with #starpu_omp_for_inline_next can be used to
+implement <c>\#pragma omp for</c> without code outlining.
+
+\return <c>!0</c> if the calling thread participates to the loop region and
+should execute a first chunk of iterations. In that case, \p *_first_i will be
+set to the first iteration of the chunk to perform and \p *_nb_i will be set to
+the number of iterations of the chunk to perform.
+
+\return <c>0</c> if the calling thread does not participate to the loop region
+because all the available iterations have been affected to the other threads of
+the parallel region.
+
+\sa starpu_omp_for
+
+\fn int starpu_omp_for_inline_next(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_first_i, unsigned long long *_nb_i)
+\ingroup API_OpenMP_Runtime_Support
+Decides whether the current thread should continue to execute a parallel loop
+section. See #starpu_omp_for for the argument description.
+
+This function together with #starpu_omp_for_inline_first can be used to
+implement <c>\#pragma omp for</c> without code outlining.
+
+\return <c>!0</c> if the calling thread should execute a next chunk of
+iterations. In that case, \p *_first_i will be set to the first iteration of the
+chunk to perform and \p *_nb_i will be set to the number of iterations of the
+chunk to perform.
+
+\return <c>0</c> if the calling thread does not participate anymore to the loop
+region because all the available iterations have been affected to the other
+threads of the parallel region.
+
+\sa starpu_omp_for
+
+\fn void starpu_omp_for_alt(void (*f)(unsigned long long _begin_i, unsigned long long _end_i, void *arg), void *arg, unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, int nowait)
+\ingroup API_OpenMP_Runtime_Support
+Alternative implementation of a parallel loop. This function differs from
+#starpu_omp_for in the expected arguments of the loop function \c f.
+
+The function \p f will be called with arguments \p _begin_i, the first iteration
+to perform, \p _end_i, the first iteration not to perform before
+returning, \p arg, the free \p arg argument.
+
+This function can be used to implement <c>\#pragma omp for</c>.
+
+\sa starpu_omp_for
+
+\fn int starpu_omp_for_inline_first_alt(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_begin_i, unsigned long long *_end_i)
+\ingroup API_OpenMP_Runtime_Support
+Inline version of the alternative implementation of a parallel loop.
+
+This function together with #starpu_omp_for_inline_next_alt can be used to
+implement <c>\#pragma omp for</c> without code outlining.
+
+\sa starpu_omp_for
+\sa starpu_omp_for_alt
+\sa starpu_omp_for_inline_first
+
+\fn int starpu_omp_for_inline_next_alt(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_begin_i, unsigned long long *_end_i)
+\ingroup API_OpenMP_Runtime_Support
+Inline version of the alternative implementation of a parallel loop.
+
+This function together with #starpu_omp_for_inline_first_alt can be used to
+implement <c>\#pragma omp for</c> without code outlining.
+
+\sa starpu_omp_for
+\sa starpu_omp_for_alt
+\sa starpu_omp_for_inline_next
+
+\fn void starpu_omp_ordered(void (*f)(void *arg), void *arg)
+\ingroup API_OpenMP_Runtime_Support
+Ensures that a function is sequentially executed once for each iteration in
+order within a parallel loop, by the thread that own the iteration. \p f is the
+function to be executed by the thread that own the current iteration. \p arg is
+an argument passed to function \p f.
+
+This function can be used to implement <c>\#pragma omp ordered</c>.
+
+\fn void starpu_omp_ordered_inline_begin(void)
+\ingroup API_OpenMP_Runtime_Support
+Waits until all the iterations of a parallel loop below the iteration owned by
+the current thread have been executed.
+
+This function together with #starpu_omp_ordered_inline_end can be used to
+implement <c>\#pragma omp ordered</c> without code code outlining.
+
+\fn void starpu_omp_ordered_inline_end(void)
+\ingroup API_OpenMP_Runtime_Support
+Notifies that the ordered section for the current iteration has been completed.
+
+This function together with #starpu_omp_ordered_inline_begin can be used to
+implement <c>\#pragma omp ordered</c> without code code outlining.
+
+\fn void starpu_omp_sections(unsigned long long nb_sections, void (**section_f)(void *arg), void **section_arg, int nowait)
+\ingroup API_OpenMP_Runtime_Support
+Ensures that each function of a given array of functions is executed by one and
+only one thread. \p nb_sections is the number of functions in the array \p
+section_f. \p section_f is the array of functions to be executed as sections. \p
+section_arg is an array of arguments to be passed to the corresponding function.
+\p nowait is a flag indicating whether an implicit barrier is requested after
+the execution of all the sections (<c>nowait==0</c>) or not (<c>nowait==!0</c>).
+
+This function can be used to implement <c>\#pragma omp sections</c> and <c>\#pragma omp section</c>.
+
+\fn void starpu_omp_sections_combined(unsigned long long nb_sections, void (*section_f)(unsigned long long section_num, void *arg), void **section_arg, int nowait)
+\ingroup API_OpenMP_Runtime_Support
+Alternative implementation of sections. This function differs from
+#starpu_omp_sections in that all the sections are combined within a single
+function in this version. \p section_f is the function implementing the combined
+sections.
+
+The function \p section_f will be called with arguments \p section_num, the
+section number to be executed, \p arg, the entry of \p section_arg corresponding
+to this section.
+
+This function can be used to implement <c>\#pragma omp sections</c> and <c>\#pragma omp section</c>.
+
+\sa starpu_omp_sections
+
+@name Task
+\anchor ORS_Task
+\ingroup API_OpenMP_Runtime_Support
+
+\fn void starpu_omp_task_region(const struct starpu_omp_task_region_attr *attr)
+\ingroup API_OpenMP_Runtime_Support
+Generates an explicit child task. The execution of the generated task is
+asynchronous with respect to the calling code unless specified otherwise.
+\p attr specifies the attributes for the generated task region.
+
+This function can be used to implement <c>\#pragma omp task</c>.
+
+\fn void starpu_omp_taskwait(void)
+\ingroup API_OpenMP_Runtime_Support
+Waits for the completion of the tasks generated by the current task. This
+function does not wait for the descendants of the tasks generated by the current
+task.
+
+This function can be used to implement <c>\#pragma omp taskwait</c>.
+
+\fn void starpu_omp_taskgroup(void (*f)(void *arg), void *arg)
+\ingroup API_OpenMP_Runtime_Support
+Launches a function and wait for the completion of every descendant task
+generated during the execution of the function.
+
+This function can be used to implement <c>\#pragma omp taskgroup</c>.
+
+\sa starpu_omp_taskgroup_inline_begin
+\sa starpu_omp_taskgroup_inline_end
+
+\fn void starpu_omp_taskgroup_inline_begin(void)
+\ingroup API_OpenMP_Runtime_Support
+Launches a function and gets ready to wait for the completion of every descendant task
+generated during the dynamic scope of the taskgroup.
+
+This function can be used to implement <c>\#pragma omp taskgroup</c> without code outlining.
+
+\sa starpu_omp_taskgroup
+\sa starpu_omp_taskgroup_inline_end
+
+\fn void starpu_omp_taskgroup_inline_end(void)
+\ingroup API_OpenMP_Runtime_Support
+Waits for the completion of every descendant task
+generated during the dynamic scope of the taskgroup.
+
+This function can be used to implement <c>\#pragma omp taskgroup</c> without code outlining.
+
+\sa starpu_omp_taskgroup
+\sa starpu_omp_taskgroup_inline_begin
+
+
+@name API
+\anchor ORS_API
+\ingroup API_OpenMP_Runtime_Support
+
+\fn void starpu_omp_set_num_threads(int threads)
+\ingroup API_OpenMP_Runtime_Support
+This function sets ICVS nthreads_var for the parallel regions to be created
+with the current region.
+
+Note: The StarPU OpenMP runtime support currently ignores
+this setting for nested parallel regions.
+
+\sa starpu_omp_get_num_threads
+\sa starpu_omp_get_thread_num
+\sa starpu_omp_get_max_threads
+\sa starpu_omp_get_num_procs
+
+\fn int starpu_omp_get_num_threads()
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of threads of the current region.
+
+\return the number of threads of the current region.
+
+\sa starpu_omp_set_num_threads
+\sa starpu_omp_get_thread_num
+\sa starpu_omp_get_max_threads
+\sa starpu_omp_get_num_procs
+
+\fn int starpu_omp_get_thread_num()
+\ingroup API_OpenMP_Runtime_Support
+This function returns the rank of the current thread among the threads
+of the current region.
+
+\return the rank of the current thread in the current region.
+
+\sa starpu_omp_set_num_threads
+\sa starpu_omp_get_num_threads
+\sa starpu_omp_get_max_threads
+\sa starpu_omp_get_num_procs
+
+\fn int starpu_omp_get_max_threads()
+\ingroup API_OpenMP_Runtime_Support
+This function returns the maximum number of threads that can be used to
+create a region from the current region.
+
+\return the maximum number of threads that can be used to create a region from the current region.
+
+\sa starpu_omp_set_num_threads
+\sa starpu_omp_get_num_threads
+\sa starpu_omp_get_thread_num
+\sa starpu_omp_get_num_procs
+
+\fn int starpu_omp_get_num_procs (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of StarPU CPU workers.
+
+\return the number of StarPU CPU workers.
+
+\sa starpu_omp_set_num_threads
+\sa starpu_omp_get_num_threads
+\sa starpu_omp_get_thread_num
+\sa starpu_omp_get_max_threads
+
+\fn int starpu_omp_in_parallel (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns whether it is called from the scope of a parallel region or not.
+
+\return <c>!0</c> if called from a parallel region scope.
+\return <c>0</c> otherwise.
+
+\fn void starpu_omp_set_dynamic (int dynamic_threads)
+\ingroup API_OpenMP_Runtime_Support
+This function enables (1) or disables (0) dynamically adjusting the number of parallel threads.
+
+Note: The StarPU OpenMP runtime support currently ignores the argument of this function.
+
+\sa starpu_omp_get_dynamic
+
+\fn int starpu_omp_get_dynamic (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the state of dynamic thread number adjustment.
+
+\return <c>!0</c> if dynamic thread number adjustment is enabled.
+\return <c>0</c> otherwise.
+
+\sa starpu_omp_set_dynamic
+
+\fn void starpu_omp_set_nested (int nested)
+\ingroup API_OpenMP_Runtime_Support
+This function enables (1) or disables (0) nested parallel regions.
+
+Note: The StarPU OpenMP runtime support currently ignores the argument of this function.
+
+\sa starpu_omp_get_nested
+\sa starpu_omp_get_max_active_levels
+\sa starpu_omp_set_max_active_levels
+\sa starpu_omp_get_level
+\sa starpu_omp_get_active_level
+
+\fn int starpu_omp_get_nested (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns whether nested parallel sections are enabled or not.
+
+\return <c>!0</c> if nested parallel sections are enabled.
+\return <c>0</c> otherwise.
+
+\sa starpu_omp_set_nested
+\sa starpu_omp_get_max_active_levels
+\sa starpu_omp_set_max_active_levels
+\sa starpu_omp_get_level
+\sa starpu_omp_get_active_level
+
+\fn int starpu_omp_get_cancellation(void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the state of the cancel ICVS var.
+
+\fn void starpu_omp_set_schedule (enum starpu_omp_sched_value kind, int modifier)
+\ingroup API_OpenMP_Runtime_Support
+This function sets the default scheduling kind for upcoming loops within the
+current parallel section. \p kind is the scheduler kind, \p modifier
+complements the scheduler kind with informations such as the chunk size,
+in accordance with the OpenMP specification.
+
+\sa starpu_omp_get_schedule
+
+\fn void starpu_omp_get_schedule (enum starpu_omp_sched_value *kind, int *modifier)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the current selected default loop scheduler.
+
+\return the kind and the modifier of the current default loop scheduler.
+
+\sa starpu_omp_set_schedule
+
+\fn int starpu_omp_get_thread_limit (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of StarPU CPU workers.
+
+\return the number of StarPU CPU workers.
+
+\fn void starpu_omp_set_max_active_levels (int max_levels)
+\ingroup API_OpenMP_Runtime_Support
+This function sets the maximum number of allowed active parallel section levels.
+
+Note: The StarPU OpenMP runtime support currently ignores the argument of this function and assume \p max_levels equals <c>1</c> instead.
+
+\sa starpu_omp_set_nested
+\sa starpu_omp_get_nested
+\sa starpu_omp_get_max_active_levels
+\sa starpu_omp_get_level
+\sa starpu_omp_get_active_level
+
+\fn int starpu_omp_get_max_active_levels (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the current maximum number of allowed active parallel section levels
+
+\return the current maximum number of allowed active parallel section levels.
+
+\sa starpu_omp_set_nested
+\sa starpu_omp_get_nested
+\sa starpu_omp_set_max_active_levels
+\sa starpu_omp_get_level
+\sa starpu_omp_get_active_level
+
+\fn int starpu_omp_get_level (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the nesting level of the current parallel section.
+
+\return the nesting level of the current parallel section.
+
+\sa starpu_omp_set_nested
+\sa starpu_omp_get_nested
+\sa starpu_omp_get_max_active_levels
+\sa starpu_omp_set_max_active_levels
+\sa starpu_omp_get_active_level
+
+\fn int starpu_omp_get_ancestor_thread_num (int level)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of the ancestor of the current parallel section.
+
+\return the number of the ancestor of the current parallel section.
+
+\fn int starpu_omp_get_team_size (int level)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the size of the team of the current parallel section.
+
+\return the size of the team of the current parallel section.
+
+\fn int starpu_omp_get_active_level (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the nestinglevel of the current innermost active parallel section.
+
+\return the nestinglevel of the current innermost active parallel section.
+
+\sa starpu_omp_set_nested
+\sa starpu_omp_get_nested
+\sa starpu_omp_get_max_active_levels
+\sa starpu_omp_set_max_active_levels
+\sa starpu_omp_get_level
+
+\fn int starpu_omp_in_final(void)
+\ingroup API_OpenMP_Runtime_Support
+This function checks whether the current task is final or not.
+
+\return <c>!0</c> if called from a final task.
+\return <c>0</c> otherwise.
+
+\fn enum starpu_omp_proc_bind_value starpu_omp_get_proc_bind(void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the proc_bind setting of the current parallel region.
+
+\return the proc_bind setting of the current parallel region.
+
+\fn void starpu_omp_set_default_device(int device_num)
+\ingroup API_OpenMP_Runtime_Support
+This function sets the number of the device to use as default.
+
+Note: The StarPU OpenMP runtime support currently ignores the argument of this function.
+
+\sa starpu_omp_get_default_device
+\sa starpu_omp_is_initial_device
+
+\fn int starpu_omp_get_default_device(void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of the device used as default.
+
+\return the number of the device used as default.
+
+\sa starpu_omp_set_default_device
+\sa starpu_omp_is_initial_device
+
+\fn int starpu_omp_get_num_devices(void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of the devices.
+
+\return the number of the devices.
+
+\fn int starpu_omp_get_num_teams(void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the number of teams in the current teams region.
+
+\return the number of teams in the current teams region.
+
+\sa starpu_omp_get_num_teams
+
+\fn int starpu_omp_get_team_num(void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the team number of the calling thread.
+
+\return the team number of the calling thread.
+
+\sa starpu_omp_get_num_teams
+
+\fn int starpu_omp_is_initial_device(void)
+\ingroup API_OpenMP_Runtime_Support
+This function checks whether the current device is the initial device or not.
+
+\return <c>!0</c> if called from the host device.
+\return <c>0</c> otherwise.
+
+\sa starpu_omp_set_default_device
+\sa starpu_omp_get_default_device
+
+\fn void starpu_omp_init_lock (starpu_omp_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function initializes an opaque lock object.
+
+\sa starpu_omp_destroy_lock
+\sa starpu_omp_set_lock
+\sa starpu_omp_unset_lock
+\sa starpu_omp_test_lock
+
+\fn void starpu_omp_destroy_lock (starpu_omp_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function destroys an opaque lock object.
+
+\sa starpu_omp_init_lock
+\sa starpu_omp_set_lock
+\sa starpu_omp_unset_lock
+\sa starpu_omp_test_lock
+
+\fn void starpu_omp_set_lock (starpu_omp_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function locks an opaque lock object. If the lock is already locked, the
+function will block until it succeeds in exclusively acquiring the lock.
+
+\sa starpu_omp_init_lock
+\sa starpu_omp_destroy_lock
+\sa starpu_omp_unset_lock
+\sa starpu_omp_test_lock
+
+\fn void starpu_omp_unset_lock (starpu_omp_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function unlocks a previously locked lock object. The behaviour of this
+function is unspecified if it is called on an unlocked lock object.
+
+\sa starpu_omp_init_lock
+\sa starpu_omp_destroy_lock
+\sa starpu_omp_set_lock
+\sa starpu_omp_test_lock
+
+\fn int starpu_omp_test_lock (starpu_omp_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function unblockingly attempts to lock a lock object and returns whether
+it succeeded or not.
+
+\return <c>!0</c> if the function succeeded in acquiring the lock.
+\return <c>0</c> if the lock was already locked.
+
+\sa starpu_omp_init_lock
+\sa starpu_omp_destroy_lock
+\sa starpu_omp_set_lock
+\sa starpu_omp_unset_lock
+
+\fn void starpu_omp_init_nest_lock (starpu_omp_nest_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function initializes an opaque lock object supporting nested locking operations.
+
+\sa starpu_omp_destroy_nest_lock
+\sa starpu_omp_set_nest_lock
+\sa starpu_omp_unset_nest_lock
+\sa starpu_omp_test_nest_lock
+
+\fn void starpu_omp_destroy_nest_lock (starpu_omp_nest_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function destroys an opaque lock object supporting nested locking operations.
+
+\sa starpu_omp_init_nest_lock
+\sa starpu_omp_set_nest_lock
+\sa starpu_omp_unset_nest_lock
+\sa starpu_omp_test_nest_lock
+
+\fn void starpu_omp_set_nest_lock (starpu_omp_nest_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function locks an opaque lock object supporting nested locking operations.
+If the lock is already locked by another task, the function will block until
+it succeeds in exclusively acquiring the lock. If the lock is already taken by
+the current task, the function will increase the nested locking level of the
+lock object.
+
+\sa starpu_omp_init_nest_lock
+\sa starpu_omp_destroy_nest_lock
+\sa starpu_omp_unset_nest_lock
+\sa starpu_omp_test_nest_lock
+
+\fn void starpu_omp_unset_nest_lock (starpu_omp_nest_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function unlocks a previously locked lock object supporting nested locking
+operations. If the lock has been locked multiple times in nested fashion, the
+nested locking level is decreased and the lock remains locked. Otherwise, if
+the lock has only been locked once, it becomes unlocked. The behaviour of this
+function is unspecified if it is called on an unlocked lock object. The
+behaviour of this function is unspecified if it is called from a different task
+than the one that locked the lock object.
+
+\sa starpu_omp_init_nest_lock
+\sa starpu_omp_destroy_nest_lock
+\sa starpu_omp_set_nest_lock
+\sa starpu_omp_test_nest_lock
+
+\fn int starpu_omp_test_nest_lock (starpu_omp_nest_lock_t *lock)
+\ingroup API_OpenMP_Runtime_Support
+This function unblocking attempts to lock an opaque lock object supporting
+nested locking operations and returns whether it succeeded or not. If the lock
+is already locked by another task, the function will return without having
+acquired the lock. If the lock is already taken by the current task, the
+function will increase the nested locking level of the lock object.
+
+\return <c>!0</c> if the function succeeded in acquiring the lock.
+\return <c>0</c> if the lock was already locked.
+
+\sa starpu_omp_init_nest_lock
+\sa starpu_omp_destroy_nest_lock
+\sa starpu_omp_set_nest_lock
+\sa starpu_omp_unset_nest_lock
+
+\fn void starpu_omp_atomic_fallback_inline_begin(void)
+\ingroup API_OpenMP_Runtime_Support
+This function implements the entry point of a fallback global atomic region. It
+blocks until it succeeds in acquiring exclusive access to the global atomic
+region.
+
+\sa starpu_omp_atomic_fallback_inline_end
+
+\fn void starpu_omp_atomic_fallback_inline_end(void)
+\ingroup API_OpenMP_Runtime_Support
+This function implements the exit point of a fallback global atomic region. It
+release the exclusive access to the global atomic region.
+
+\sa starpu_omp_atomic_fallback_inline_begin
+
+\fn double starpu_omp_get_wtime (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the elapsed wallclock time in seconds.
+
+\return the elapsed wallclock time in seconds.
+
+\sa starpu_omp_get_wtick
+
+\fn double starpu_omp_get_wtick (void)
+\ingroup API_OpenMP_Runtime_Support
+This function returns the precision of the time used by \p starpu_omp_get_wtime.
+
+\return the precision of the time used by \p starpu_omp_get_wtime.
+
+\sa starpu_omp_get_wtime
+
+*/

+ 1 - 0
doc/doxygen/doxygen-config.cfg.in

@@ -36,6 +36,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_hash.h \
 			 @top_srcdir@/include/starpu_mic.h \
 			 @top_srcdir@/include/starpu_opencl.h \
+			 @top_srcdir@/include/starpu_openmp.h \
 			 @top_srcdir@/include/starpu_perfmodel.h \
 			 @top_srcdir@/include/starpu_profiling.h \
 			 @top_srcdir@/include/starpu_rand.h \

+ 1 - 0
doc/doxygen/doxygen.cfg

@@ -1623,6 +1623,7 @@ PREDEFINED             = STARPU_USE_OPENCL=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \
 			 STARPU_SIMGRID=1 \
+			 STARPU_OPENMP=1 \
                          __GCC__
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+ 7 - 0
doc/doxygen/refman.tex

@@ -154,6 +154,11 @@ Documentation License”.
 \hypertarget{SimGridSupport}{}
 \input{SimGridSupport}
 
+\chapter{OpenMP Runtime Support}
+\label{OpenMPRuntimeSupport}
+\hypertarget{OpenMPRuntimeSupport}{}
+\input{OpenMPRuntimeSupport}
+
 \part{StarPU Reference API}
 
 \chapter{Execution Configuration Through Environment Variables}
@@ -194,6 +199,7 @@ Documentation License”.
 \input{group__API__Theoretical__Lower__Bound__on__Execution__Time}
 \input{group__API__CUDA__Extensions}
 \input{group__API__OpenCL__Extensions}
+\input{group__API__OpenMP__Runtime__Support}
 \input{group__API__MIC__Extensions}
 \input{group__API__SCC__Extensions}
 \input{group__API__Miscellaneous__Helpers}
@@ -236,6 +242,7 @@ Documentation License”.
 \input{starpu__hash_8h}
 \input{starpu__mic_8h}
 \input{starpu__opencl_8h}
+\input{starpu__openmp_8h}
 \input{starpu__perfmodel_8h}
 \input{starpu__profiling_8h}
 \input{starpu__rand_8h}

+ 2 - 0
include/starpu.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -66,6 +67,7 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_fxt.h>
 #include <starpu_driver.h>
 #include <starpu_tree.h>
+#include <starpu_openmp.h>
 #include <starpu_simgrid_wrap.h>
 
 #ifdef __cplusplus

+ 3 - 0
include/starpu_config.h.in

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,6 +36,8 @@
 #undef STARPU_USE_MIC
 #undef STARPU_USE_SCC
 
+#undef STARPU_OPENMP
+
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 

+ 169 - 0
include/starpu_openmp.h

@@ -0,0 +1,169 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_OPENMP_H__
+#define __STARPU_OPENMP_H__
+
+#include <starpu_config.h>
+
+#if defined STARPU_OPENMP
+typedef struct { void *internal; } starpu_omp_lock_t;
+typedef struct { void *internal; } starpu_omp_nest_lock_t;
+
+enum starpu_omp_sched_value
+{
+	starpu_omp_sched_undefined = 0,
+	starpu_omp_sched_static    = 1,
+	starpu_omp_sched_dynamic   = 2,
+	starpu_omp_sched_guided    = 3,
+	starpu_omp_sched_auto      = 4,
+	starpu_omp_sched_runtime   = 5
+};
+
+enum starpu_omp_proc_bind_value
+{
+	starpu_omp_proc_bind_undefined  = -1,
+	starpu_omp_proc_bind_false  = 0,
+	starpu_omp_proc_bind_true   = 1,
+	starpu_omp_proc_bind_master = 2,
+	starpu_omp_proc_bind_close  = 3,
+	starpu_omp_proc_bind_spread = 4
+};
+
+struct starpu_omp_parallel_region_attr
+{
+	struct starpu_codelet  cl;
+	starpu_data_handle_t  *handles;
+	void     *cl_arg;
+	size_t    cl_arg_size;
+	unsigned  cl_arg_free;
+
+	int if_clause;
+	int num_threads;
+};
+
+struct starpu_omp_task_region_attr
+{
+	struct starpu_codelet  cl;
+	starpu_data_handle_t  *handles;
+	void     *cl_arg;
+	size_t    cl_arg_size;
+	unsigned  cl_arg_free;
+
+	int if_clause;
+	int final_clause;
+	int untied_clause;
+	int mergeable_clause;
+};
+
+#ifdef __cplusplus
+extern "C"
+{
+#define __STARPU_OMP_NOTHROW throw ()
+#else
+#define __STARPU_OMP_NOTHROW __attribute__((__nothrow__))
+#endif
+
+extern int starpu_omp_init(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_shutdown(void) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_parallel_region(const struct starpu_omp_parallel_region_attr *attr) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_barrier(void) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_master(void (*f)(void *arg), void *arg) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_master_inline(void) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_single(void (*f)(void *arg), void *arg, int nowait) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_single_inline(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_single_copyprivate(void (*f)(void *arg, void *data, unsigned long long data_size), void *arg, void *data, unsigned long long data_size) __STARPU_OMP_NOTHROW;
+extern void *starpu_omp_single_copyprivate_inline_begin(void *data) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_single_copyprivate_inline_end(void) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_critical(void (*f)(void *arg), void *arg, const char *name) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_critical_inline_begin(const char *name) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_critical_inline_end(const char *name) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_task_region(const struct starpu_omp_task_region_attr *attr) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_taskwait(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_taskgroup(void (*f)(void *arg), void *arg) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_taskgroup_inline_begin(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_taskgroup_inline_end(void) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_for(void (*f)(unsigned long long _first_i, unsigned long long _nb_i, void *arg), void *arg, unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, int nowait) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_for_inline_first(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_first_i, unsigned long long *_nb_i) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_for_inline_next(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_first_i, unsigned long long *_nb_i) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_for_alt(void (*f)(unsigned long long _begin_i, unsigned long long _end_i, void *arg), void *arg, unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, int nowait) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_for_inline_first_alt(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_begin_i, unsigned long long *_end_i) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_for_inline_next_alt(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_begin_i, unsigned long long *_end_i) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_ordered_inline_begin(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_ordered_inline_end(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_ordered(void (*f)(void *arg), void *arg) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_sections(unsigned long long nb_sections, void (**section_f)(void *arg), void **section_arg, int nowait) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_sections_combined(unsigned long long nb_sections, void (*section_f)(unsigned long long section_num, void *arg), void *section_arg, int nowait) __STARPU_OMP_NOTHROW;
+
+extern void starpu_omp_set_num_threads(int threads) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_num_threads() __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_thread_num() __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_max_threads() __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_num_procs (void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_in_parallel (void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_dynamic (int dynamic_threads) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_dynamic (void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_nested (int nested) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_nested (void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_cancellation(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_schedule (enum starpu_omp_sched_value kind, int modifier) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_get_schedule (enum starpu_omp_sched_value *kind, int *modifier) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_thread_limit (void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_max_active_levels (int max_levels) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_max_active_levels (void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_level (void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_ancestor_thread_num (int level) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_team_size (int level) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_active_level (void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_in_final(void) __STARPU_OMP_NOTHROW;
+extern enum starpu_omp_proc_bind_value starpu_omp_get_proc_bind(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_default_device(int device_num) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_default_device(void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_num_devices(void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_num_teams(void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_get_team_num(void) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_is_initial_device(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_init_lock (starpu_omp_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_destroy_lock (starpu_omp_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_lock (starpu_omp_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_unset_lock (starpu_omp_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_test_lock (starpu_omp_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_init_nest_lock (starpu_omp_nest_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_destroy_nest_lock (starpu_omp_nest_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_set_nest_lock (starpu_omp_nest_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_unset_nest_lock (starpu_omp_nest_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern int starpu_omp_test_nest_lock (starpu_omp_nest_lock_t *lock) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_atomic_fallback_inline_begin(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_atomic_fallback_inline_end(void) __STARPU_OMP_NOTHROW;
+extern double starpu_omp_get_wtime (void) __STARPU_OMP_NOTHROW;
+extern double starpu_omp_get_wtick (void) __STARPU_OMP_NOTHROW;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* STARPU_USE_OPENMP && !STARPU_DONT_INCLUDE_OPENMP_HEADERS */
+#endif /* __STARPU_OPENMP_H__ */

+ 7 - 1
include/starpu_task.h

@@ -3,7 +3,7 @@
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2011, 2014  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -62,6 +62,9 @@ enum starpu_task_status
 	STARPU_TASK_BLOCKED_ON_TAG,
 	STARPU_TASK_BLOCKED_ON_TASK,
 	STARPU_TASK_BLOCKED_ON_DATA
+#ifdef STARPU_OPENMP
+	, STARPU_TASK_STOPPED
+#endif
 };
 
 typedef uint64_t starpu_tag_t;
@@ -194,6 +197,9 @@ struct starpu_task
 	struct starpu_task *prev;
 	struct starpu_task *next;
 	void *starpu_private;
+#ifdef STARPU_OPENMP
+	struct starpu_omp_task *omp_task;
+#endif
 };
 
 #define STARPU_TASK_INITIALIZER 			\

+ 5 - 1
src/Makefile.am

@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2009-2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-# Copyright (C) 2011  INRIA
+# Copyright (C) 2011, 2014  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -127,6 +127,7 @@ noinst_HEADERS = 						\
 	debug/traces/starpu_fxt.h				\
 	profiling/bound.h					\
 	profiling/profiling.h					\
+	util/openmp_runtime_support.h				\
 	util/starpu_task_insert_utils.h				\
 	util/starpu_data_cpy.h					\
 	starpu_parameters.h					\
@@ -225,6 +226,9 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	util/starpu_create_sync_task.c				\
 	util/file.c						\
 	util/misc.c						\
+	util/openmp_runtime_support.c				\
+	util/openmp_runtime_support_environment.c		\
+	util/openmp_runtime_support_omp_api.c			\
 	util/starpu_data_cpy.c					\
 	util/starpu_task_insert.c				\
 	util/starpu_task_insert_utils.c				\

+ 7 - 1
src/core/dependencies/task_deps.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -67,7 +68,12 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 
 	STARPU_PTHREAD_MUTEX_LOCK(&job->sync_mutex);
 	if (check)
-		STARPU_ASSERT_MSG(!job->submitted || !task->destroy || task->detach, "Task dependencies have to be set before submission (submitted %u destroy %d detach %d)", job->submitted, task->destroy, task->detach);
+		STARPU_ASSERT_MSG(
+				!job->submitted || !task->destroy || task->detach
+#ifdef STARPU_OPENMP
+				|| job->continuation
+#endif
+				, "Task dependencies have to be set before submission (submitted %u destroy %d detach %d)", job->submitted, task->destroy, task->detach);
 	else
 		STARPU_ASSERT_MSG(job->terminated <= 1, "Task dependencies have to be set before termination (terminated %u)", job->terminated);
 

+ 189 - 49
src/core/jobs.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2011, 2014  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -156,22 +156,70 @@ void _starpu_wait_job(struct _starpu_job *j)
         _STARPU_LOG_OUT();
 }
 
+#ifdef STARPU_OPENMP
+int _starpu_test_job_termination(struct _starpu_job *j)
+{
+	STARPU_ASSERT(j->task);
+	STARPU_ASSERT(!j->task->detach);
+	return (j->terminated == 2);
+}
+void _starpu_job_prepare_for_continuation_ext(struct _starpu_job *j, unsigned continuation_resubmit,
+		void (*continuation_callback_on_sleep)(void *arg), void *continuation_callback_on_sleep_arg)
+{
+	STARPU_ASSERT(!j->continuation);
+	/* continuation are not supported for parallel tasks for now */
+	STARPU_ASSERT(j->task_size == 1);
+	j->continuation = 1;
+	j->continuation_resubmit = continuation_resubmit;
+	j->continuation_callback_on_sleep = continuation_callback_on_sleep;
+	j->continuation_callback_on_sleep_arg = continuation_callback_on_sleep_arg;
+	j->job_successors.ndeps = 0;
+}
+/* Prepare a currently running job for accepting a new set of
+ * dependencies in anticipation of becoming a continuation. */
+void _starpu_job_prepare_for_continuation(struct _starpu_job *j)
+{
+	_starpu_job_prepare_for_continuation_ext(j, 1, NULL, NULL);
+}
+void _starpu_job_set_omp_cleanup_callback(struct _starpu_job *j,
+		void (*omp_cleanup_callback)(void *arg), void *omp_cleanup_callback_arg)
+{
+	j->omp_cleanup_callback = omp_cleanup_callback;
+	j->omp_cleanup_callback_arg = omp_cleanup_callback_arg;
+}
+#endif
+
 void _starpu_handle_job_termination(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
 	unsigned sched_ctx = task->sched_ctx;
 	int workerid = starpu_worker_get_id();
 	double flops = task->flops;
+	const unsigned continuation =
+#ifdef STARPU_OPENMP
+		j->continuation
+#else
+		0
+#endif
+		;
+
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-	
-	task->status = STARPU_TASK_FINISHED;
-	
-	/* We must have set the j->terminated flag early, so that it is
-	 * possible to express task dependencies within the callback
-	 * function. A value of 1 means that the codelet was executed but that
-	 * the callback is not done yet. */
-	j->terminated = 1;
-		
+#ifdef STARPU_OPENMP
+	if (continuation)
+	{
+		task->status = STARPU_TASK_STOPPED;
+	}
+	else
+#endif
+	{
+		task->status = STARPU_TASK_FINISHED;
+
+		/* We must have set the j->terminated flag early, so that it is
+		 * possible to express task dependencies within the callback
+		 * function. A value of 1 means that the codelet was executed but that
+		 * the callback is not done yet. */
+		j->terminated = 1;
+	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 
@@ -180,7 +228,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 #endif //STARPU_USE_SC_HYPERVISOR
 
 	/* We release handle reference count */
-	if (task->cl)
+	if (task->cl && !continuation)
 	{
 		unsigned i;
 		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
@@ -202,13 +250,24 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 				_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
-	/* Tell other tasks that we don't exist any more, thus no need for
-	 * implicit dependencies any more.  */
-	_starpu_release_task_enforce_sequential_consistency(j);
+	/* If this is a continuation, we do not release task dependencies now.
+	 * Task dependencies will be released only when the continued task
+	 * fully completes */
+	if (!continuation)
+	{
+		/* Tell other tasks that we don't exist any more, thus no need for
+		 * implicit dependencies any more.  */
+		_starpu_release_task_enforce_sequential_consistency(j);
+	}
+
 	/* Task does not have a cl, but has explicit data dependencies, we need
 	 * to tell them that we will not exist any more before notifying the
-	 * tasks waiting for us */
-	if (j->implicit_dep_handle) {
+	 * tasks waiting for us
+	 *
+	 * For continuations, implicit dependency handles are only released 
+	 * when the task fully completes */
+	if (j->implicit_dep_handle && !continuation)
+	{
 		starpu_data_handle_t handle = j->implicit_dep_handle;
 		_starpu_release_data_enforce_sequential_consistency(j->task, &j->implicit_dep_slot, handle);
 		/* Release reference taken while setting implicit_dep_handle */
@@ -218,46 +277,68 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 
-	/* in case there are dependencies, wake up the proper tasks */
-	_starpu_notify_dependencies(j);
+	/* If this is a continuation, we do not notify task/tag dependencies
+	 * now. Task/tag dependencies will be notified only when the continued
+	 * task fully completes */
+	if (!continuation)
+	{
+		/* in case there are dependencies, wake up the proper tasks */
+		_starpu_notify_dependencies(j);
+	}
 
-	/* the callback is executed after the dependencies so that we may remove the tag
- 	 * of the task itself */
-	if (task->callback_func)
+	/* If this is a continuation, we do not execute the callback
+	 * now. The callback will be executed only when the continued
+	 * task fully completes */
+	if (!continuation)
 	{
-		int profiling = starpu_profiling_status_get();
-		if (profiling && task->profiling_info)
-			_starpu_clock_gettime(&task->profiling_info->callback_start_time);
+		/* the callback is executed after the dependencies so that we may remove the tag
+		 * of the task itself */
+		if (task->callback_func)
+		{
+			int profiling = starpu_profiling_status_get();
+			if (profiling && task->profiling_info)
+				_starpu_clock_gettime(&task->profiling_info->callback_start_time);
 
-		/* so that we can check whether we are doing blocking calls
-		 * within the callback */
-		_starpu_set_local_worker_status(STATUS_CALLBACK);
+			/* so that we can check whether we are doing blocking calls
+			 * within the callback */
+			_starpu_set_local_worker_status(STATUS_CALLBACK);
 
 
-		/* Perhaps we have nested callbacks (eg. with chains of empty
-		 * tasks). So we store the current task and we will restore it
-		 * later. */
-		struct starpu_task *current_task = starpu_task_get_current();
+			/* Perhaps we have nested callbacks (eg. with chains of empty
+			 * tasks). So we store the current task and we will restore it
+			 * later. */
+			struct starpu_task *current_task = starpu_task_get_current();
 
-		_starpu_set_current_task(task);
+			_starpu_set_current_task(task);
 
-		_STARPU_TRACE_START_CALLBACK(j);
-		task->callback_func(task->callback_arg);
-		_STARPU_TRACE_END_CALLBACK(j);
+			_STARPU_TRACE_START_CALLBACK(j);
+			task->callback_func(task->callback_arg);
+			_STARPU_TRACE_END_CALLBACK(j);
 
-		_starpu_set_current_task(current_task);
+			_starpu_set_current_task(current_task);
 
-		_starpu_set_local_worker_status(STATUS_UNKNOWN);
+			_starpu_set_local_worker_status(STATUS_UNKNOWN);
 
-		if (profiling && task->profiling_info)
-			_starpu_clock_gettime(&task->profiling_info->callback_end_time);
+			if (profiling && task->profiling_info)
+				_starpu_clock_gettime(&task->profiling_info->callback_end_time);
+		}
 	}
 
 	/* If the job was executed on a combined worker there is no need for the
 	 * scheduler to process it : the task structure doesn't contain any valuable
 	 * data as it's not linked to an actual worker */
 	/* control task should not execute post_exec_hook */
-	if(j->task_size == 1 && task->cl != NULL && !j->internal)
+	if(j->task_size == 1 && task->cl != NULL && !j->internal
+#ifdef STARPU_OPENMP
+	/* If this is a continuation, we do not execute the post_exec_hook. The
+	 * post_exec_hook will be run only when the continued task fully
+	 * completes.
+	 *
+	 * Note: If needed, a specific hook could be added to handle stopped
+	 * tasks */
+	&& !continuation
+#endif
+			)
 	{
 		_starpu_sched_post_exec_hook(task);
 #ifdef STARPU_USE_SC_HYPERVISOR
@@ -266,6 +347,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 	}
 
+	/* Note: For now, we keep the TASK_DONE trace event for continuation,
+	 * however we could add a specific event for stopped tasks if needed.
+	 */
 	_STARPU_TRACE_TASK_DONE(j);
 
 	/* NB: we do not save those values before the callback, in case the
@@ -278,9 +362,20 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	/* we do not desallocate the job structure if some is going to
 	 * wait after the task */
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-	/* A value of 2 is put to specify that not only the codelet but
-	 * also the callback were executed. */
-	j->terminated = 2;
+	if (!continuation)
+	{
+#ifdef STARPU_OPENMP
+		if (j->omp_cleanup_callback)
+		{
+			j->omp_cleanup_callback(j->omp_cleanup_callback_arg);
+			j->omp_cleanup_callback = NULL;
+			j->omp_cleanup_callback_arg = NULL;
+		}
+#endif
+		/* A value of 2 is put to specify that not only the codelet but
+		 * also the callback were executed. */
+		j->terminated = 2;
+	}
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 
 #ifdef HAVE_AYUDAME_H
@@ -289,7 +384,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
-	if (detach)
+	if (detach && !continuation)
 	{
 		/* no one is going to synchronize with that task so we release
 		 * the data structures now. In case the job was already locked
@@ -299,9 +394,12 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 			_starpu_task_destroy(task);
 	}
 
-	if (regenerate)
+	/* A continuation is not much different from a regenerated task. */
+	if (regenerate || continuation)
 	{
-		STARPU_ASSERT_MSG(detach && !destroy && !task->synchronous, "Regenerated task must be detached (was %d), and not have detroy=1 (was %d) or synchronous=1 (was %d)", detach, destroy, task->synchronous);
+		STARPU_ASSERT_MSG((detach && !destroy && !task->synchronous)
+				|| continuation
+				, "Regenerated task must be detached (was %d), and not have detroy=1 (was %d) or synchronous=1 (was %d)", detach, destroy, task->synchronous);
 
 #ifdef HAVE_AYUDAME_H
 		if (AYU_event)
@@ -311,9 +409,28 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		}
 #endif
 
-		/* We reuse the same job structure */
-		int ret = _starpu_submit_job(j);
-		STARPU_ASSERT(!ret);
+		{
+#ifdef STARPU_OPENMP
+			unsigned continuation_resubmit = j->continuation_resubmit;
+			void (*continuation_callback_on_sleep)(void *arg) = j->continuation_callback_on_sleep;
+			void *continuation_callback_on_sleep_arg = j->continuation_callback_on_sleep_arg;
+			j->continuation_resubmit = 1;
+			j->continuation_callback_on_sleep = NULL;
+			j->continuation_callback_on_sleep_arg = NULL;
+			if (!continuation || continuation_resubmit)
+#endif
+			{
+				/* We reuse the same job structure */
+				int ret = _starpu_submit_job(j);
+				STARPU_ASSERT(!ret);
+			}
+#ifdef STARPU_OPENMP
+			if (continuation && continuation_callback_on_sleep != NULL)
+			{
+				continuation_callback_on_sleep(continuation_callback_on_sleep_arg);
+			}
+#endif
+		}
 	}
 
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
@@ -461,6 +578,29 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j)
 	return ret;
 }
 
+#ifdef STARPU_OPENMP
+/* When waking up a continuation, we only enforce new task dependencies */
+unsigned _starpu_reenforce_task_deps_and_schedule(struct _starpu_job *j)
+{
+	unsigned ret;
+        _STARPU_LOG_IN();
+	STARPU_ASSERT(j->discontinuous);
+
+	/* enfore task dependencies */
+	if (_starpu_not_all_task_deps_are_fulfilled(j))
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+		_STARPU_LOG_OUT_TAG("not_all_task_deps_are_fulfilled");
+		return 0;
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	ret = _starpu_push_task(j);
+
+	_STARPU_LOG_OUT();
+	return ret;
+}
+#endif
+
 /* Ordered tasks are simply recorded as they arrive in the local_ordered_tasks
  * ring buffer, indexed by order, and pulled from its head. */
 /* TODO: replace with perhaps a heap */

+ 48 - 1
src/core/jobs.h

@@ -3,6 +3,7 @@
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -112,6 +113,36 @@ LIST_TYPE(_starpu_job,
 	 */
 	unsigned terminated:2;
 
+#ifdef STARPU_OPENMP
+	/* Job is a continuation or a regular task. */
+	unsigned continuation;
+
+	/* If 0, the prepared continuation is not resubmitted automatically
+	 * when going to sleep, if 1, the prepared continuation is immediately
+	 * resubmitted when going to sleep. */
+	unsigned continuation_resubmit;
+
+	/* Callback function called when:
+	 * - The continuation starpu task is ready to be submitted again if
+	 *   continuation_resubmit = 0;
+	 * - The continuation starpu task has just been re-submitted if
+	 *   continuation_resubmit = 1. */
+	void (*continuation_callback_on_sleep)(void *arg);
+	void *continuation_callback_on_sleep_arg;
+
+	void (*omp_cleanup_callback)(void *arg);
+	void *omp_cleanup_callback_arg;
+
+	/* Job has been stopped at least once. */
+	unsigned discontinuous;
+
+	/* Cumulated execution time for discontinuous jobs */
+	struct timespec cumulated_ts;
+
+	/* Cumulated power consumption for discontinuous jobs */
+	double cumulated_power_consumed;
+#endif
+
 	/* The value of the footprint that identifies the job may be stored in
 	 * this structure. */
 	uint32_t footprint;
@@ -167,13 +198,29 @@ int _starpu_job_finished(struct _starpu_job *j);
 /* Wait for the termination of the job */
 void _starpu_wait_job(struct _starpu_job *j);
 
+#ifdef STARPU_OPENMP
+/* Test for the termination of the job */
+int _starpu_test_job_termination(struct _starpu_job *j);
+
+/* Prepare the job for accepting new dependencies before becoming a continuation. */
+
+void _starpu_job_prepare_for_continuation_ext(struct _starpu_job *j, unsigned continuation_resubmit,
+		void (*continuation_callback_on_sleep)(void *arg), void *continuation_callback_on_sleep_arg);
+void _starpu_job_prepare_for_continuation(struct _starpu_job *j);
+void _starpu_job_set_omp_cleanup_callback(struct _starpu_job *j,
+		void (*omp_cleanup_callback)(void *arg), void *omp_cleanup_callback_arg);
+#endif
+
 /* Specify that the task should not appear in the DAG generated by debug tools. */
 void _starpu_exclude_task_from_dag(struct starpu_task *task);
 
 /* try to submit job j, enqueue it if it's not schedulable yet. The job's sync mutex is supposed to be held already */
 unsigned _starpu_enforce_deps_and_schedule(struct _starpu_job *j);
 unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
-
+#ifdef STARPU_OPENMP
+/* When waking up a continuation, we only enforce new task dependencies */
+unsigned _starpu_reenforce_task_deps_and_schedule(struct _starpu_job *j);
+#endif
 
 /* This function must be called after the execution of a job, this triggers all
  * job's dependencies and perform the callback function if any. */

+ 7 - 1
src/core/simgrid.c

@@ -27,6 +27,7 @@
 #ifdef STARPU_SIMGRID
 #include <msg/msg.h>
 #include <smpi/smpif.h>
+#include <sys/resource.h>
 
 #define STARPU_MPI_AS_PREFIX "StarPU-MPI"
 
@@ -178,7 +179,12 @@ int main(int argc, char **argv)
 #endif
 	/* Simgrid uses tiny stacks by default.  This comes unexpected to our users.  */
 	extern xbt_cfg_t _sg_cfg_set;
-	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", 8192);
+	unsigned stack_size = 8192;
+	struct rlimit rlim;
+	if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur != 0 && rlim.rlim_cur != RLIM_INFINITY)
+		stack_size = rlim.rlim_cur / 1024;
+
+	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", stack_size);
 
 	/* Load XML platform */
 	_starpu_simgrid_get_platform_path(path, sizeof(path));

+ 91 - 6
src/core/task.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2011, 2014  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -223,6 +223,33 @@ int starpu_task_wait(struct starpu_task *task)
 	return 0;
 }
 
+#ifdef STARPU_OPENMP
+int _starpu_task_test_termination(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	STARPU_ASSERT_MSG(!task->detach, "starpu_task_wait can only be called on tasks with detach = 0");
+
+	if (task->detach || task->synchronous)
+	{
+		_STARPU_DEBUG("Task is detached or synchronous\n");
+		_STARPU_LOG_OUT_TAG("einval");
+		return -EINVAL;
+	}
+
+	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
+
+	int ret = _starpu_test_job_termination(j);
+
+	if (ret)
+	{
+		if (task->destroy)
+			_starpu_task_destroy(task);
+	}
+
+	return ret;
+}
+#endif
+
 struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
@@ -240,8 +267,13 @@ struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
  * already counted. */
 int _starpu_submit_job(struct _starpu_job *j)
 {
-
 	struct starpu_task *task = j->task;
+	int ret;
+#ifdef STARPU_OPENMP
+	const unsigned continuation = j->continuation;
+#else
+	const unsigned continuation = 0;
+#endif
 
 	_STARPU_LOG_IN();
 	/* notify bound computation of a new task */
@@ -282,7 +314,7 @@ int _starpu_submit_job(struct _starpu_job *j)
 #endif//STARPU_USE_SC_HYPERVISOR
 
 	/* We retain handle reference count */
-	if (task->cl)
+	if (task->cl && !continuation)
 	{
 		unsigned i;
 		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
@@ -300,12 +332,29 @@ int _starpu_submit_job(struct _starpu_job *j)
 	/* Need to atomically set submitted to 1 and check dependencies, since
 	 * this is concucrent with _starpu_notify_cg */
 	j->terminated = 0;
+#ifdef STARPU_OPENMP
+	if (continuation)
+	{
+		j->discontinuous = 1;
+		j->continuation  = 0;
+	}
+#endif
+
 	if (!j->submitted)
 		j->submitted = 1;
 	else
 		j->submitted = 2;
 
-	int ret = _starpu_enforce_deps_and_schedule(j);
+#ifdef STARPU_OPENMP
+	if (continuation)
+	{
+		ret = _starpu_reenforce_task_deps_and_schedule(j);
+	}
+	else
+#endif
+	{
+		ret = _starpu_enforce_deps_and_schedule(j);
+	}
 
 	_STARPU_LOG_OUT();
 	return ret;
@@ -414,11 +463,18 @@ int starpu_task_submit(struct starpu_task *task)
 	int ret;
 	unsigned is_sync = task->synchronous;
 	starpu_task_bundle_t bundle = task->bundle;
-
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	* task structure, it is possible that this job structure was already
 	* allocated. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
+	const unsigned continuation =
+#ifdef STARPU_OPENMP
+		j->continuation
+#else
+		0
+#endif
+		;
+
 
 	if (j->internal)
 	{
@@ -483,7 +539,11 @@ int starpu_task_submit(struct starpu_task *task)
 			return -ENODEV;
 		}
 
-		_starpu_detect_implicit_data_deps(task);
+		/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
+		if (!continuation)
+		{
+			_starpu_detect_implicit_data_deps(task);
+		}
 
 		if (task->cl->model)
 			_starpu_init_and_load_perfmodel(task->cl->model);
@@ -890,6 +950,31 @@ void _starpu_set_current_task(struct starpu_task *task)
 	STARPU_PTHREAD_SETSPECIFIC(current_task_key, task);
 }
 
+#ifdef STARPU_OPENMP
+/* Prepare the fields of the currentl task for accepting a new set of
+ * dependencies in anticipation of becoming a continuation.
+ *
+ * When the task becomes 'continued', it will only be queued again when the new
+ * set of dependencies is fulfilled. */
+void _starpu_task_prepare_for_continuation(void)
+{
+	_starpu_job_prepare_for_continuation(_starpu_get_job_associated_to_task(starpu_task_get_current()));
+}
+
+void _starpu_task_prepare_for_continuation_ext(unsigned continuation_resubmit,
+		void (*continuation_callback_on_sleep)(void *arg), void *continuation_callback_on_sleep_arg)
+{
+	_starpu_job_prepare_for_continuation_ext(_starpu_get_job_associated_to_task(starpu_task_get_current()),
+		continuation_resubmit, continuation_callback_on_sleep, continuation_callback_on_sleep_arg);
+}
+
+void _starpu_task_set_omp_cleanup_callback(struct starpu_task *task, void (*omp_cleanup_callback)(void *arg), void *omp_cleanup_callback_arg)
+{
+	_starpu_job_set_omp_cleanup_callback(_starpu_get_job_associated_to_task(task),
+		omp_cleanup_callback, omp_cleanup_callback_arg);
+}
+#endif
+
 /*
  * Returns 0 if tasks does not use any multiformat handle, 1 otherwise.
  */

+ 18 - 1
src/core/task.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011 INRIA
+ * Copyright (C) 2011, 2014 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,12 @@
 /* Internal version of starpu_task_destroy: don't check task->destroy flag */
 void _starpu_task_destroy(struct starpu_task *task);
 
+#ifdef STARPU_OPENMP
+/* Test for the termination of the task.
+ * Call starpu_task_destroy if required and the task is terminated. */
+int _starpu_task_test_termination(struct starpu_task *task);
+#endif
+
 /* A pthread key is used to store the task currently executed on the thread.
  * _starpu_initialize_current_task_key initializes this pthread key and
  * _starpu_set_current_task updates its current value. */
@@ -53,6 +59,17 @@ int
 _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 				     enum starpu_node_kind node_kind);
 
+#ifdef STARPU_OPENMP
+/* Prepare the current task for accepting new dependencies before becoming a continuation. */
+void _starpu_task_prepare_for_continuation_ext(unsigned continuation_resubmit,
+		void (*continuation_callback_on_sleep)(void *arg), void *continuation_callback_on_sleep_arg);
+
+void _starpu_task_prepare_for_continuation(void);
+
+void _starpu_task_set_omp_cleanup_callback(struct starpu_task *task, void (*omp_cleanup_callback)(void *arg),
+		void *omp_cleanup_callback_arg);
+#endif
+
 int _starpu_task_uses_multiformat_handles(struct starpu_task *task);
 
 int _starpu_task_submit_conversion_task(struct starpu_task *task,

+ 6 - 0
src/core/workers.c

@@ -983,6 +983,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	int ret;
 
+#ifdef STARPU_OPENMP
+	_starpu_omp_dummy_init();
+#endif
 #ifdef STARPU_SIMGRID
 	_starpu_simgrid_init();
 #else
@@ -1382,6 +1385,9 @@ void starpu_shutdown(void)
 	/* Drop all remaining tags */
 	_starpu_tag_clear();
 
+#ifdef STARPU_OPENMP
+	_starpu_omp_dummy_shutdown();
+#endif
 	_starpu_close_debug_logfile();
 
 	STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);

+ 4 - 0
src/datawizard/coherency.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -867,6 +868,9 @@ enomem:
 
 void _starpu_push_task_output(struct _starpu_job *j)
 {
+#ifdef STARPU_OPENMP
+	STARPU_ASSERT(!j->continuation);
+#endif
 	_STARPU_TRACE_START_PUSH_OUTPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();

+ 5 - 0
src/datawizard/coherency.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -217,6 +218,10 @@ struct _starpu_data_state
 
 	unsigned lazy_unregister;
 
+#ifdef STARPU_OPENMP
+	unsigned removed_from_context_hash;
+#endif
+
         /* Used for MPI */
         int rank;
 	int tag;

+ 129 - 18
src/datawizard/interfaces/data_interface.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,6 +25,9 @@
 #include <core/task.h>
 #include <core/workers.h>
 #include <datawizard/memstats.h>
+#ifdef STARPU_OPENMP
+#include <util/openmp_runtime_support.h>
+#endif
 
 /* Entry in the `registered_handles' hash table.  */
 struct handle_entry
@@ -90,6 +94,34 @@ void _starpu_data_interface_shutdown()
 	registered_tag_handles = NULL;
 }
 
+#ifdef STARPU_OPENMP
+void _starpu_omp_unregister_region_handles(struct starpu_omp_region *region)
+{
+	_starpu_spin_lock(&region->registered_handles_lock);
+	struct handle_entry *entry, *tmp;
+	HASH_ITER(hh, (region->registered_handles), entry, tmp)
+	{
+		entry->handle->removed_from_context_hash = 1;
+		HASH_DEL(region->registered_handles, entry);
+		starpu_data_unregister_submit(entry->handle);
+		free(entry);
+	}
+	_starpu_spin_unlock(&region->registered_handles_lock);
+}
+
+void _starpu_omp_unregister_task_handles(struct starpu_omp_task *task)
+{
+	struct handle_entry *entry, *tmp;
+	HASH_ITER(hh, task->registered_handles, entry, tmp)
+	{
+		entry->handle->removed_from_context_hash = 1;
+		HASH_DEL(task->registered_handles, entry);
+		starpu_data_unregister_submit(entry->handle);
+		free(entry);
+	}
+}
+#endif
+
 struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interface_id)
 {
 	switch (interface_id)
@@ -136,26 +168,80 @@ void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 	entry->pointer = ptr;
 	entry->handle = handle;
 
-	_starpu_spin_lock(&registered_handles_lock);
-	HASH_ADD_PTR(registered_handles, pointer, entry);
-	_starpu_spin_unlock(&registered_handles_lock);
+#ifdef STARPU_OPENMP
+	struct starpu_omp_task *task = _starpu_omp_get_task();
+	if (task)
+	{
+		if (task->is_implicit)
+		{
+			struct starpu_omp_region *parallel_region = task->owner_region;
+			_starpu_spin_lock(&parallel_region->registered_handles_lock);
+			HASH_ADD_PTR(parallel_region->registered_handles, pointer, entry);
+			_starpu_spin_unlock(&parallel_region->registered_handles_lock);
+		}
+		else
+		{
+			HASH_ADD_PTR(task->registered_handles, pointer, entry);
+		}
+	}
+	else
+#endif
+	{
+		_starpu_spin_lock(&registered_handles_lock);
+		HASH_ADD_PTR(registered_handles, pointer, entry);
+		_starpu_spin_unlock(&registered_handles_lock);
+	}
 }
 
 starpu_data_handle_t starpu_data_lookup(const void *ptr)
 {
 	starpu_data_handle_t result;
 
-	_starpu_spin_lock(&registered_handles_lock);
+#ifdef STARPU_OPENMP
+	struct starpu_omp_task *task = _starpu_omp_get_task();
+	if (task)
 	{
-		struct handle_entry *entry;
+		if (task->is_implicit)
+		{
+			struct starpu_omp_region *parallel_region = task->owner_region;
+			_starpu_spin_lock(&parallel_region->registered_handles_lock);
+			{
+				struct handle_entry *entry;
 
-		HASH_FIND_PTR(registered_handles, &ptr, entry);
-		if(STARPU_UNLIKELY(entry == NULL))
-			result = NULL;
+				HASH_FIND_PTR(parallel_region->registered_handles, &ptr, entry);
+				if(STARPU_UNLIKELY(entry == NULL))
+					result = NULL;
+				else
+					result = entry->handle;
+			}
+			_starpu_spin_unlock(&parallel_region->registered_handles_lock);
+		}
 		else
-			result = entry->handle;
+		{
+			struct handle_entry *entry;
+
+			HASH_FIND_PTR(task->registered_handles, &ptr, entry);
+			if(STARPU_UNLIKELY(entry == NULL))
+				result = NULL;
+			else
+				result = entry->handle;
+		}
+	}
+	else
+#endif
+	{
+		_starpu_spin_lock(&registered_handles_lock);
+		{
+			struct handle_entry *entry;
+
+			HASH_FIND_PTR(registered_handles, &ptr, entry);
+			if(STARPU_UNLIKELY(entry == NULL))
+				result = NULL;
+			else
+				result = entry->handle;
+		}
+		_starpu_spin_unlock(&registered_handles_lock);
 	}
-	_starpu_spin_unlock(&registered_handles_lock);
 
 	return result;
 }
@@ -519,21 +605,46 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
 void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 {
 	const void *ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
+#ifdef STARPU_OPENMP
+	if (handle->removed_from_context_hash)
+		return;
+#endif
 	if (ram_ptr != NULL)
 	{
 		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
 		 * to another handle existed before (e.g., when using
 		 * filters), it becomes visible again.  */
 		struct handle_entry *entry;
+#ifdef STARPU_OPENMP
+		struct starpu_omp_task *task = _starpu_omp_get_task();
+		if (task)
+		{
+			if (task->is_implicit)
+			{
+				struct starpu_omp_region *parallel_region = task->owner_region;
+				_starpu_spin_lock(&parallel_region->registered_handles_lock);
+				HASH_FIND_PTR(parallel_region->registered_handles, &ram_ptr, entry);
+				STARPU_ASSERT(entry != NULL);
+				HASH_DEL(registered_handles, entry);
+				_starpu_spin_unlock(&parallel_region->registered_handles_lock);
+			}
+			else
+			{
+				HASH_FIND_PTR(task->registered_handles, &ram_ptr, entry);
+				STARPU_ASSERT(entry != NULL);
+				HASH_DEL(task->registered_handles, entry);
+			}
+		}
+		else
+#endif
+		{
 
-		_starpu_spin_lock(&registered_handles_lock);
-		HASH_FIND_PTR(registered_handles, &ram_ptr, entry);
-		STARPU_ASSERT(entry != NULL);
-
-		HASH_DEL(registered_handles, entry);
-
-		_starpu_spin_unlock(&registered_handles_lock);
-
+			_starpu_spin_lock(&registered_handles_lock);
+			HASH_FIND_PTR(registered_handles, &ram_ptr, entry);
+			STARPU_ASSERT(entry != NULL);
+			HASH_DEL(registered_handles, entry);
+			_starpu_spin_unlock(&registered_handles_lock);
+		}
 		free(entry);
 	}
 }

+ 10 - 0
src/datawizard/interfaces/data_interface.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +21,10 @@
 
 #include <starpu.h>
 #include <common/config.h>
+#include <common/uthash.h>
+#ifdef STARPU_OPENMP
+#include <util/openmp_runtime_support.h>
+#endif
 
 /* Generic type representing an interface, for now it's only used before
  * execution on message-passing devices but it can be useful in other cases.
@@ -57,6 +62,11 @@ extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
 extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 
+#ifdef STARPU_OPENMP
+void _starpu_omp_unregister_region_handles(struct starpu_omp_region *region);
+void _starpu_omp_unregister_task_handles(struct starpu_omp_task *task);
+#endif
+
 struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interface_id);
 
 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,

+ 15 - 2
src/drivers/cpu/driver_cpu.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,10 +58,17 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
+#ifdef STARPU_OPENMP
+	/* At this point, j->continuation as been cleared as the task is being
+	 * woken up, thus we use j->discontinuous instead for the check */
+	const unsigned continuation_wake_up = j->discontinuous;
+#else
+	const unsigned continuation_wake_up = 0;
+#endif
 
 	STARPU_ASSERT(cl);
 
-	if (rank == 0)
+	if (rank == 0 && !continuation_wake_up)
 	{
 		ret = _starpu_fetch_task_input(j);
 		if (ret != 0)
@@ -116,7 +124,12 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 	{
 		_starpu_driver_update_job_feedback(j, cpu_args,
 				perf_arch, &codelet_start, &codelet_end, profiling);
-		_starpu_push_task_output(j);
+#ifdef STARPU_OPENMP
+		if (!j->continuation)
+#endif
+		{
+			_starpu_push_task_output(j);
+		}
 	}
 
 	return 0;

+ 11 - 1
src/drivers/cuda/driver_cuda.c

@@ -333,7 +333,17 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 
 	/* cleanup the runtime API internal stuffs (which CUBLAS is using) */
 	cures = cudaThreadExit();
-	if (cures)
+	if (cures
+#ifdef STARPU_OPENMP
+		/* When StarPU is used as Open Runtime support,
+		 * starpu_omp_shutdown() will usually be called from a
+		 * destructor, in which case cudaThreadExit() reports a
+		 * cudaErrorCudartUnloading here. There should not
+		 * be any remaining tasks running at this point so
+		 * we can probably ignore it without much consequences. */
+		&& cures != cudaErrorCudartUnloading
+#endif /* STARPU_OPENMP */
+	)
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 #endif /* !SIMGRID */

+ 60 - 4
src/drivers/driver_common/driver_common.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2014  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -205,9 +206,40 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 		}
 
 		if (calibrate_model)
-			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker->devid, measured,j->nimpl);
-
-
+		{
+#ifdef STARPU_OPENMP
+			double time_consumed = measured;
+			unsigned do_update_time_model;
+			if (j->continuation)
+			{
+				/* The job is only paused, thus we accumulate
+				 * its timing, but we don't update its
+				 * perfmodel now. */
+				starpu_timespec_accumulate(&j->cumulated_ts, &measured_ts);
+				do_update_time_model = 0;
+			}
+			else
+			{
+				if (j->discontinuous)
+				{
+					/* The job was paused at least once but is now
+					 * really completing. We need to take into
+					 * account its past execution time in its
+					 * perfmodel. */
+					starpu_timespec_accumulate(&measured_ts, &j->cumulated_ts);
+					time_consumed = starpu_timing_timespec_to_us(&measured_ts);
+				}
+				do_update_time_model = 1;
+			}
+#else
+			const unsigned do_update_time_model = 1;
+			const double time_consumed = measured;
+#endif
+			if (do_update_time_model)
+			{
+				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl);
+			}
+		}
 	}
 
 	if (!updated)
@@ -215,7 +247,31 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
 	{
-		_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker->devid, profiling_info->power_consumed,j->nimpl);
+#ifdef STARPU_OPENMP
+		double power_consumed = profiling_info->power_consumed;
+		unsigned do_update_power_model;
+		if (j->continuation)
+		{
+			j->cumulated_power_consumed += power_consumed;
+			do_update_power_model = 0;
+		}
+		else 
+		{
+			if (j->discontinuous)
+			{
+				power_consumed += j->cumulated_power_consumed;
+			}
+			do_update_power_model = 1;
+		}
+#else
+		const double power_consumed = profiling_info->power_consumed;
+		const unsigned do_update_power_model = 1;
+#endif
+
+		if (do_update_power_model)
+		{
+			_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker->devid, power_consumed, j->nimpl);
+		}
 	}
 }
 

File diff suppressed because it is too large
+ 2403 - 0
src/util/openmp_runtime_support.c


+ 378 - 0
src/util/openmp_runtime_support.h

@@ -0,0 +1,378 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __OPENMP_RUNTIME_SUPPORT_H__
+#define __OPENMP_RUNTIME_SUPPORT_H__
+
+#include <starpu.h>
+
+#ifdef STARPU_OPENMP
+#include <common/list.h>
+#include <common/starpu_spinlock.h>
+#include <common/uthash.h>
+
+/* ucontexts have been deprecated as of POSIX 1-2004
+ * _XOPEN_SOURCE required at least on OS/X
+ * 
+ * TODO: add detection in configure.ac
+ */
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE
+#endif
+#include <ucontext.h>
+
+/*
+ * Arbitrary limit on the number of nested parallel sections
+ */
+#define STARPU_OMP_MAX_ACTIVE_LEVELS 1
+
+/*
+ * Possible abstract names for OpenMP places
+ */
+enum starpu_omp_place_name
+{
+	starpu_omp_place_undefined = 0,
+	starpu_omp_place_threads   = 1,
+	starpu_omp_place_cores     = 2,
+	starpu_omp_place_sockets   = 3,
+	starpu_omp_place_numerical = 4 /* place specified numerically */
+};
+
+struct starpu_omp_numeric_place
+{
+	int excluded_place;
+	int *included_numeric_items;
+	int nb_included_numeric_items;
+	int *excluded_numeric_items;
+	int nb_excluded_numeric_items;
+};
+
+/*
+ * OpenMP place for thread afinity, defined by the OpenMP spec
+ */
+struct starpu_omp_place
+{
+	int abstract_name;
+	int abstract_excluded;
+	int abstract_length;
+	struct starpu_omp_numeric_place *numeric_places;
+	int nb_numeric_places;
+};
+
+/* 
+ * Internal Control Variables (ICVs) declared following
+ * OpenMP 4.0.0 spec section 2.3.1
+ */
+struct starpu_omp_data_environment_icvs
+{
+	/* parallel region icvs */
+	int dyn_var;
+	int nest_var;
+	int *nthreads_var; /* nthreads_var ICV is a list */
+	int thread_limit_var;
+
+	int active_levels_var;
+	int levels_var;
+	int *bind_var; /* bind_var ICV is a list */
+
+	/* loop region icvs */
+	int run_sched_var;
+	unsigned long long run_sched_chunk_var;
+
+	/* program execution icvs */
+	int default_device_var;
+};
+
+struct starpu_omp_device_icvs
+{
+	/* parallel region icvs */
+	int max_active_levels_var;
+
+	/* loop region icvs */
+	int def_sched_var;
+	unsigned long long def_sched_chunk_var;
+
+	/* program execution icvs */
+	int stacksize_var;
+	int wait_policy_var;
+};
+
+struct starpu_omp_implicit_task_icvs
+{
+	/* parallel region icvs */
+	int place_partition_var;
+};
+
+struct starpu_omp_global_icvs
+{
+	/* program execution icvs */
+	int cancel_var;
+};
+
+struct starpu_omp_initial_icv_values
+{
+	int dyn_var;
+	int nest_var;
+	int *nthreads_var;
+	int run_sched_var;
+	unsigned long long run_sched_chunk_var;
+	int def_sched_var;
+	unsigned long long def_sched_chunk_var;
+	int *bind_var;
+	int stacksize_var;
+	int wait_policy_var;
+	int thread_limit_var;
+	int max_active_levels_var;
+	int active_levels_var;
+	int levels_var;
+	int place_partition_var;
+	int cancel_var;
+	int default_device_var;
+
+	/* not a real ICV, but needed to store the contents of OMP_PLACES */
+	struct starpu_omp_place places;
+};
+
+struct starpu_omp_task_group
+{
+	int descendent_task_count;
+	struct starpu_omp_task *leader_task;
+	struct starpu_omp_task_group *p_previous_task_group;
+};
+
+struct starpu_omp_task_link
+{
+	struct starpu_omp_task *task;
+	struct starpu_omp_task_link *next;
+};
+
+struct starpu_omp_condition
+{
+	struct starpu_omp_task_link *contention_list_head;
+};
+
+struct starpu_omp_critical
+{
+	UT_hash_handle hh;
+	struct _starpu_spinlock lock;
+	unsigned state;
+	struct starpu_omp_task_link *contention_list_head;
+	const char *name;
+};
+
+enum starpu_omp_task_state
+{
+	starpu_omp_task_state_clear      = 0,
+	starpu_omp_task_state_preempted  = 1,
+	starpu_omp_task_state_terminated = 2,
+	starpu_omp_task_state_zombie     = 3,
+};
+
+enum starpu_omp_task_wait_on
+{
+	starpu_omp_task_wait_on_task_childs  = 1 << 0,
+	starpu_omp_task_wait_on_region_tasks = 1 << 1,
+	starpu_omp_task_wait_on_barrier      = 1 << 2,
+	starpu_omp_task_wait_on_group        = 1 << 3,
+	starpu_omp_task_wait_on_critical     = 1 << 4,
+	starpu_omp_task_wait_on_condition    = 1 << 5
+};
+
+LIST_TYPE(starpu_omp_task,
+	struct starpu_omp_implicit_task_icvs icvs;
+	struct starpu_omp_task *parent_task;
+	struct starpu_omp_thread *owner_thread;
+	struct starpu_omp_region *owner_region;
+	struct starpu_omp_region *nested_region;
+	int is_implicit;
+	int is_undeferred;
+	int is_final;
+	int is_untied;
+	int rank;
+	int child_task_count;
+	struct starpu_omp_task_group *task_group;
+	struct _starpu_spinlock lock;
+	int wait_on;
+	int barrier_count;
+	int single_id;
+	int single_first;
+	int loop_id;
+	unsigned long long ordered_first_i;
+	unsigned long long ordered_nb_i;
+	int sections_id;
+	struct starpu_omp_data_environment_icvs data_env_icvs;
+	struct starpu_omp_implicit_task_icvs implicit_task_icvs;
+	struct handle_entry *registered_handles;
+
+	struct starpu_task *starpu_task;
+	struct starpu_codelet cl;
+	void **starpu_buffers;
+	void *starpu_cl_arg;
+
+	/* actual task function to be run */
+	void (*cpu_f)(void **starpu_buffers, void *starpu_cl_arg);
+#if STARPU_USE_CUDA
+	void (*cuda_f)(void **starpu_buffers, void *starpu_cl_arg);
+#endif
+#if STARPU_USE_OPENCL
+	void (*opencl_f)(void **starpu_buffers, void *starpu_cl_arg);
+#endif
+
+	enum starpu_omp_task_state state;
+
+	/* 
+	 * context to store the processing state of the task
+	 * in case of blocking/recursive task operation
+	 */
+	ucontext_t ctx;
+
+	/*
+	 * stack to execute the task over, to be able to switch
+	 * in case blocking/recursive task operation
+	 */
+	void *stack;
+
+	size_t stacksize;
+)
+
+LIST_TYPE(starpu_omp_thread,
+
+	UT_hash_handle hh;
+	struct starpu_omp_task *current_task;
+	struct starpu_omp_region *owner_region;
+
+	/*
+	 * stack to execute the initial thread over
+	 * when preempting the initial task
+	 * note: should not be used for other threads
+	 */
+	void *initial_thread_stack;
+
+	/*
+	 * context to store the 'scheduler' state of the thread,
+	 * to which the execution of thread comes back upon a
+	 * blocking/recursive task operation
+	 */
+	ucontext_t ctx;
+
+	struct starpu_driver starpu_driver;
+	struct _starpu_worker *worker;
+)
+
+struct _starpu_omp_lock_internal
+{
+	struct _starpu_spinlock lock;
+	struct starpu_omp_condition cond;
+	unsigned state;
+};
+
+struct _starpu_omp_nest_lock_internal
+{
+	struct _starpu_spinlock lock;
+	struct starpu_omp_condition cond;
+	unsigned state;
+	struct starpu_omp_task *owner_task;
+	unsigned nesting;
+};
+
+struct starpu_omp_loop
+{
+	int id;
+	unsigned long long next_iteration;
+	int nb_completed_threads;
+	struct starpu_omp_loop *next_loop;
+	struct _starpu_spinlock ordered_lock;
+	struct starpu_omp_condition ordered_cond;
+	unsigned long long ordered_iteration;
+};
+
+struct starpu_omp_sections
+{
+	int id;
+	unsigned long long next_section_num;
+	int nb_completed_threads;
+	struct starpu_omp_sections *next_sections;
+};
+
+struct starpu_omp_region
+{
+	struct starpu_omp_data_environment_icvs icvs;
+	struct starpu_omp_region *parent_region;
+	struct starpu_omp_device *owner_device;
+	struct starpu_omp_thread *master_thread;
+	/* note: the list of threads does not include the master_thread */
+	struct starpu_omp_thread_list *thread_list;
+	/* list of implicit omp tasks created to run the region */
+	struct starpu_omp_task_list *implicit_task_list;
+	/* include both the master thread and the region own threads */
+	int nb_threads;
+	struct _starpu_spinlock lock;
+	struct starpu_omp_task *waiting_task;
+	int barrier_count;
+	int bound_explicit_task_count;
+	int single_id;
+	void *copy_private_data;
+	int level;
+	struct starpu_omp_loop *loop_list;
+	struct starpu_omp_sections *sections_list;
+	struct starpu_task *continuation_starpu_task;
+	struct handle_entry *registered_handles;
+	struct _starpu_spinlock registered_handles_lock;
+};
+
+struct starpu_omp_device
+{
+	struct starpu_omp_device_icvs icvs;
+
+	/* atomic fallback implementation lock */
+	struct _starpu_spinlock atomic_lock;
+};
+
+struct starpu_omp_global
+{
+	struct starpu_omp_global_icvs icvs;
+	struct starpu_omp_task *initial_task;
+	struct starpu_omp_thread *initial_thread;
+	struct starpu_omp_region *initial_region;
+	struct starpu_omp_device *initial_device;
+	struct starpu_omp_critical *default_critical;
+	struct starpu_omp_critical *named_criticals;
+	struct _starpu_spinlock named_criticals_lock;
+	struct starpu_omp_thread *hash_workers;
+	struct _starpu_spinlock hash_workers_lock;
+	int nb_starpu_cpu_workers;
+	int *starpu_cpu_worker_ids;
+};
+
+/* 
+ * internal global variables
+ */
+extern struct starpu_omp_initial_icv_values *_starpu_omp_initial_icv_values;
+extern struct starpu_omp_global *_starpu_omp_global_state;
+extern double _starpu_omp_clock_ref;
+
+/* 
+ * internal API
+ */
+void _starpu_omp_environment_init(void);
+void _starpu_omp_environment_exit(void);
+struct starpu_omp_thread *_starpu_omp_get_thread(void);
+struct starpu_omp_task *_starpu_omp_get_task(void);
+void _starpu_omp_dummy_init(void);
+void _starpu_omp_dummy_shutdown(void);
+#endif // STARPU_OPENMP
+
+#endif // __OPENMP_RUNTIME_SUPPORT_H__

+ 941 - 0
src/util/openmp_runtime_support_environment.c

@@ -0,0 +1,941 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#ifdef STARPU_OPENMP
+#include <util/openmp_runtime_support.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <strings.h>
+
+#define _STARPU_INITIAL_PLACES_LIST_SIZE      4
+#define _STARPU_INITIAL_PLACE_ITEMS_LIST_SIZE 4
+#define _STARPU_DEFAULT_STACKSIZE 2097152
+
+static struct starpu_omp_initial_icv_values _initial_icv_values =
+{
+	.dyn_var = 0,
+	.nest_var = 0,
+	.nthreads_var = NULL,
+	.run_sched_var = starpu_omp_sched_static,
+	.run_sched_chunk_var = 0,
+	.def_sched_var = starpu_omp_sched_static,
+	.def_sched_chunk_var = 0,
+	.bind_var = NULL,
+	.stacksize_var = _STARPU_DEFAULT_STACKSIZE,
+	.wait_policy_var = 0,
+	.max_active_levels_var = STARPU_OMP_MAX_ACTIVE_LEVELS,
+	.active_levels_var = 0,
+	.levels_var = 0,
+	.place_partition_var = 0,
+	.cancel_var = 0,
+	.default_device_var = 0
+};
+
+struct starpu_omp_initial_icv_values *_starpu_omp_initial_icv_values = NULL;
+
+/* TODO: move to utils */
+static void remove_spaces(char *str)
+{
+	int i = 0;
+	int j = 0;
+
+	while (str[j] != '\0')
+	{
+		if (isspace(str[j]))
+		{
+			j++;
+			continue;
+		}
+		if (j > i)
+		{
+			str[i] = str[j];
+		}
+		i++;
+		j++;
+	}
+	if (j > i)
+	{
+		str[i] = str[j];
+	}
+}
+/* TODO: move to utils */
+static int strings_cmp(const char *strings[], const char *str)
+{
+	int mode = 0;
+	while (strings[mode])
+	{
+		if (strcasecmp(str, strings[mode]) == 0)
+			break;
+		mode++;
+	}
+	if (strings[mode] == NULL)
+		return -1;
+	return mode;
+}
+/* TODO: move to utils */
+static int stringsn_cmp(const char *strings[], const char *str, size_t n)
+{
+	int mode = 0;
+	while (strings[mode])
+	{
+		if (strncasecmp(str, strings[mode], n) == 0)
+			break;
+		mode++;
+	}
+	if (strings[mode] == NULL)
+		return -1;
+	return mode;
+}
+
+/* TODO: move to utils */
+static void read_boolean_var(const char *var, int *dest)
+{
+	const char *env = getenv(var);
+	if (env)
+	{
+		char *str = strdup(env);
+		if (str == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		remove_spaces(str);
+		if (str[0] == '\0')
+		{
+			free(str);
+			return;
+		}
+		static const char *strings[] = { "false", "true", NULL };
+		int mode = strings_cmp(strings, str);
+		if (mode < 0)
+			_STARPU_ERROR("parse error in variable %s\n", var);
+		*dest = mode;
+		free(str);
+	}
+}
+
+/* TODO: move to utils */
+static void read_int_var(const char *var, int *dest)
+{
+	const char *env = getenv(var);
+	if (env)
+	{
+		char *str = strdup(env);
+		if (str == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		remove_spaces(str);
+		if (str[0] == '\0')
+		{
+			free(str);
+			return;
+		}
+		errno = 0;
+		int v = (int)strtol(str, NULL, 10);
+		if (errno != 0)
+			_STARPU_ERROR("could not parse environment variable %s, strtol failed with error %s\n", var, strerror(errno));
+		*dest = v;
+		free(str);
+	}
+}
+
+static void read_size_var(const char *var, int *dest)
+{
+	const char *env = getenv(var);
+	if (env)
+	{
+		char *str = strdup(env);
+		if (str == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		remove_spaces(str);
+		if (str[0] == '\0')
+		{
+			free(str);
+			return;
+		}
+		char *endptr = NULL;
+		int mult = 1024;
+		errno = 0;
+		int v = (int)strtol(str, &endptr, 10);
+		if (errno != 0)
+			_STARPU_ERROR("could not parse environment variable %s, strtol failed with error %s\n", var, strerror(errno));
+		if (*endptr != '\0')
+		{
+			switch (*endptr)
+			{
+				case 'b':
+				case 'B': mult = 1; break;
+				case 'k':
+				case 'K': mult = 1024; break;
+				case 'm':
+				case 'M': mult = 1024*1024; break;
+				case 'g':
+				case 'G': mult = 1024*1024*1024; break;
+			default:
+				_STARPU_ERROR("could not parse environment variable %s size suffix invalid\n", var);
+			}
+		}
+		*dest = v*mult;
+		free(str);
+	}
+}
+
+static void read_sched_var(const char *var, int *dest, unsigned long long *dest_chunk)
+{
+	const char *env = getenv(var);
+	if (env)
+	{
+		char *str = strdup(env);
+		if (str == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		remove_spaces(str);
+		if (str[0] == '\0')
+		{
+			free(str);
+			return;
+		}
+		static const char *strings[] = { "static", "dynamic", "guided", "auto", NULL };
+		int mode = strings_cmp(strings, str);
+		if (mode < 0)
+			_STARPU_ERROR("parse error in variable %s\n", var);
+		*dest = mode;
+		int offset = strlen(strings[mode]);
+		if (str[offset] == ',')
+		{
+			offset++;
+			errno = 0;
+			long long v = strtoll(str+offset, NULL, 10);
+			if (errno != 0)
+				_STARPU_ERROR("could not parse environment variable %s, strtol failed with error %s\n", var, strerror(errno));
+			if (v < 0)
+				_STARPU_ERROR("invalid negative modifier in environment variable %s\n", var);
+			unsigned long long uv = (unsigned long long) v;
+			*dest_chunk = uv;
+		}
+		else
+		{
+			*dest_chunk = 1;
+		}
+		free(str);
+	}
+}
+
+static void read_wait_policy_var(const char *var, int *dest)
+{
+	const char *env = getenv(var);
+	if (env)
+	{
+		char *str = strdup(env);
+		if (str == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		remove_spaces(str);
+		if (str[0] == '\0')
+		{
+			free(str);
+			return;
+		}
+		static const char *strings[] = { "passive", "active", NULL };
+		int mode = strings_cmp(strings, str);
+		if (mode < 0)
+			_STARPU_ERROR("parse error in variable %s\n", var);
+		*dest = mode;
+		free(str);
+	}
+}
+
+static void read_display_env_var(const char *var, int *dest)
+{
+	const char *env = getenv(var);
+	if (env)
+	{
+		char *str = strdup(env);
+		if (str == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		remove_spaces(str);
+		if (str[0] == '\0')
+		{
+			free(str);
+			return;
+		}
+		static const char *strings[] = { "false", "true", "verbose", NULL };
+		int mode = strings_cmp(strings, str);
+		if (mode < 0)
+			_STARPU_ERROR("parse error in variable %s\n", var);
+		*dest = mode;
+		free(str);
+	}
+}
+
+static int convert_bind_mode(const char *str, size_t n)
+{
+	static const char *strings[] = { "false", "true", "master", "close", "spread", NULL };
+	int mode = stringsn_cmp(strings, str, n);
+	if (mode < 0)
+		_STARPU_ERROR("proc_bind list parse error\n");
+	return mode;
+}
+
+static void convert_bind_string(const char *_str, int *bind_list, const int max_levels)
+{
+	char *str = strdup(_str);
+	if (str == NULL)
+		_STARPU_ERROR("memory allocation failed\n");
+	remove_spaces(str);
+	if (str[0] == '\0')
+	{
+		free(str);
+		return;
+	}
+	enum { state_split, state_read };
+	int level = 0;
+	int i = 0;
+	int state = state_read;
+	while (1)
+	{
+		if (state == state_split)
+		{
+			if (str[i] == '\0')
+				break;
+			if (str[i] != ',')
+				_STARPU_ERROR("proc_bind list parse error\n");
+			i++;
+			state = state_read;
+		}
+		else if (state == state_read)
+		{
+			int n = 0;
+			while (isalpha(str[i+n]))
+				n++;
+			if (n == 0)
+				_STARPU_ERROR("proc_bind list parse error\n");
+			int mode = convert_bind_mode(str+i,n);
+			STARPU_ASSERT(mode >= starpu_omp_proc_bind_false && mode <= starpu_omp_proc_bind_spread);
+			bind_list[level] = mode;
+			level++;
+			if (level == max_levels)
+				break;
+			i += n;
+			state = state_split;
+		}
+		else
+			_STARPU_ERROR("invalid state in parsing proc_bind list\n");
+	}
+	free(str);
+}
+
+static void convert_num_threads_string(const char *_str, int *num_threads_list, const int max_levels)
+{
+	char *str = strdup(_str);
+	if (str == NULL)
+		_STARPU_ERROR("memory allocation failed\n");
+	remove_spaces(str);
+	if (str[0] == '\0')
+	{
+		free(str);
+		return;
+	}
+	enum { state_split, state_read };
+	int level = 0;
+	int i = 0;
+	int state = state_read;
+	while (1)
+	{
+		/* split a comma separated list of numerical items */
+		if (state == state_split)
+		{
+			if (str[i] == '\0')
+				break;
+			if (str[i] != ',')
+				_STARPU_ERROR("num_threads list parse error\n");
+			i++;
+			state = state_read;
+		}
+		/* read a numerical item */
+		else if (state == state_read)
+		{
+			char *endptr = NULL;
+			errno = 0;
+			int num_threads = (int)strtol(str+i, &endptr, 10);
+			if (errno != 0)
+				_STARPU_ERROR("num_threads list parse error, strtol failed with error %s\n", strerror(errno));
+			if (num_threads < 1)
+				_STARPU_ERROR("num_threads list invalid value\n");
+			num_threads_list[level] = num_threads;
+			level++;
+			if (level == max_levels)
+				break;
+			i = endptr - str;
+			state = state_split;
+		}
+		else
+			_STARPU_ERROR("invalid state in parsing num_threads list\n");
+	}
+	free(str);
+}
+
+static int convert_place_name(const char *str, size_t n)
+{
+	static const char *strings[] = { "threads", "cores", "sockets", NULL };
+	int mode = stringsn_cmp(strings, str, n);
+	if (mode < 0)
+		_STARPU_ERROR("place abstract name parse error\n");
+	return mode+1; /* 0 is for undefined abstract name */
+}
+
+/* Note: this function modifies the string str */
+static void read_a_place_name(char *str, struct starpu_omp_place *places)
+{
+	int i = 0;
+	/* detect exclusion of abstract name expressed as '!' prefix */
+	if (str[i] == '!')
+	{
+		places->abstract_excluded = 1;
+		i++;
+	}
+	else
+	{
+		places->abstract_excluded = 0;
+	}
+	/* detect length value for abstract name expressed as '(length)' suffix) */
+	char *begin_length_spec = strchr(str+i,'(');
+	if (begin_length_spec != NULL)
+	{
+		char *end_length_spec = strrchr(begin_length_spec+1, ')');
+		if (end_length_spec == NULL || end_length_spec <= begin_length_spec+1)
+			_STARPU_ERROR("parse error in places list\n");
+		*begin_length_spec = '\0';
+		*end_length_spec = '\0';
+		errno = 0;
+		int v = (int)strtol(begin_length_spec+1, NULL, 10);
+		if (errno != 0)
+			_STARPU_ERROR("parse error in places list\n");
+		places->abstract_length = v;
+	}
+	else
+	{
+		places->abstract_length = 1;
+	}
+	/* convert abstract place name string to corresponding value */
+	{
+		int mode = convert_place_name(str+i, strlen(str+i));
+		STARPU_ASSERT(mode >= starpu_omp_place_threads && mode <= starpu_omp_place_sockets);
+		places->abstract_name = mode;
+		places->numeric_places = NULL;
+		places->nb_numeric_places = 0;
+	}
+}
+
+static void read_a_places_list(const char *str, struct starpu_omp_place *places)
+{
+	if (str[0] == '\0')
+	{
+		places->numeric_places = NULL;
+		places->nb_numeric_places = 0;
+		places->abstract_name = starpu_omp_place_undefined;
+		return;
+	}
+	enum { state_split,
+		state_read_brace_prefix,
+		state_read_opening_brace,
+		state_read_numeric_prefix,
+		state_read_numeric,
+		state_split_numeric,
+		state_read_closing_brace,
+		state_read_brace_suffix,
+	};
+	struct starpu_omp_numeric_place *places_list = NULL;
+	int places_list_size = 0;
+	int nb_places = 0;
+	int *included_items_list = NULL;
+	int included_items_list_size = 0;
+	int nb_included_items = 0;
+	int *excluded_items_list = NULL;
+	int excluded_items_list_size = 0;
+	int nb_excluded_items = 0;
+	int exclude_place_flag = 0;
+	int exclude_item_flag = 0;
+	int i = 0;
+	int state = state_read_brace_prefix;
+	while (1)
+	{
+		switch (state)
+		{
+			/* split a comma separated list of numerical places */
+			case state_split:
+				if (str[i] == '\0')
+				{
+					goto eol;
+				}
+				else if (str[i] != ',')
+					_STARPU_ERROR("parse error in places list\n");
+				i++;
+				state = state_read_brace_prefix;
+				break;
+			/* read optional exclude flag '!' for numerical place */
+			case state_read_brace_prefix:
+				exclude_place_flag = 0;
+				if (str[i] == '!')
+				{
+					exclude_place_flag = 1;
+					i++;
+				}
+				state = state_read_opening_brace;
+				break;
+			/* read place opening brace */
+			case state_read_opening_brace:
+				if (str[i] != '{')
+					_STARPU_ERROR("parse error in places list\n");
+				i++;
+				state = state_read_numeric_prefix;
+				break;
+			/* read optional exclude flag '!' for numerical item */
+			case state_read_numeric_prefix:
+				exclude_item_flag = 0;
+				if (str[i] == '!')
+				{
+					exclude_item_flag = 1;
+					i++;
+				}
+				state = state_read_numeric;
+				break;
+			/* read numerical item */
+			case state_read_numeric:
+				{
+					char *endptr = NULL;
+					errno = 0;
+					int v = (int)strtol(str+i, &endptr, 10);
+					if (errno != 0)
+						_STARPU_ERROR("parse error in places list, strtol failed with error %s\n", strerror(errno));
+					if (exclude_item_flag)
+					{
+						if (excluded_items_list_size == 0)
+						{
+							excluded_items_list_size = _STARPU_INITIAL_PLACE_ITEMS_LIST_SIZE;
+							excluded_items_list = malloc(excluded_items_list_size * sizeof(int));
+							if (excluded_items_list == NULL)
+								_STARPU_ERROR("memory allocation failed");
+						}
+						else if (nb_excluded_items == excluded_items_list_size)
+						{
+							excluded_items_list_size *= 2;
+							excluded_items_list = realloc(excluded_items_list, excluded_items_list_size * sizeof(int));
+							if (excluded_items_list == NULL)
+								_STARPU_ERROR("memory allocation failed");
+						}
+						excluded_items_list[nb_excluded_items] = v;
+						nb_excluded_items++;
+					}
+					else
+					{
+						if (included_items_list_size == 0)
+						{
+							included_items_list_size = _STARPU_INITIAL_PLACE_ITEMS_LIST_SIZE;
+							included_items_list = malloc(included_items_list_size * sizeof(int));
+							if (included_items_list == NULL)
+								_STARPU_ERROR("memory allocation failed");
+						}
+						else if (nb_included_items == included_items_list_size)
+						{
+							included_items_list_size *= 2;
+							included_items_list = realloc(included_items_list, included_items_list_size * sizeof(int));
+							if (included_items_list == NULL)
+								_STARPU_ERROR("memory allocation failed");
+						}
+						included_items_list[nb_included_items] = v;
+						nb_included_items++;
+					}
+					exclude_item_flag = 0;
+					i = endptr - str;
+					state = state_split_numeric;
+				}
+				break;
+			/* read comma separated or colon separated numerical item list */
+			case state_split_numeric:
+				if (str[i] == ':')
+					/* length and stride colon separated arguments not supported for now */
+					_STARPU_ERROR("colon support unimplemented in numeric place list");
+				if (str[i] == ',')
+				{
+					i++;
+					state = state_read_numeric_prefix;
+				}
+				else
+				{
+					state = state_read_closing_brace;
+				}
+				break;
+			/* read end of numerical item list */
+			case state_read_closing_brace:
+				if (str[i] != '}')
+					_STARPU_ERROR("parse error in places list\n");
+				if (places_list_size == 0)
+				{
+					places_list_size = _STARPU_INITIAL_PLACES_LIST_SIZE;
+					places_list = malloc(places_list_size * sizeof(*places_list));
+					if (places_list == NULL)
+						_STARPU_ERROR("memory allocation failed");
+				}
+				else if (nb_places == places_list_size)
+				{
+					places_list_size *= 2;
+					places_list = realloc(places_list, places_list_size * sizeof(*places_list));
+					if (places_list == NULL)
+						_STARPU_ERROR("memory allocation failed");
+				}
+				places_list[nb_places].excluded_place = exclude_place_flag;
+				places_list[nb_places].included_numeric_items = included_items_list;
+				places_list[nb_places].nb_included_numeric_items = nb_included_items;
+				places_list[nb_places].excluded_numeric_items = excluded_items_list;
+				places_list[nb_places].nb_excluded_numeric_items = nb_excluded_items;
+				nb_places++;
+				exclude_place_flag = 0;
+				included_items_list = NULL;
+				included_items_list_size = 0;
+				nb_included_items = 0;
+				excluded_items_list = NULL;
+				excluded_items_list_size = 0;
+				nb_excluded_items = 0;
+				i++;
+				state = state_read_brace_suffix;
+				break;
+			/* read optional place colon separated suffix */
+			case state_read_brace_suffix:
+				if (str[i] == ':')
+					/* length and stride colon separated arguments not supported for now */
+					_STARPU_ERROR("colon support unimplemented in numeric place list");
+				state = state_split;
+				break;
+			default:
+				_STARPU_ERROR("invalid state in parsing places list\n");
+		}
+	}
+
+eol:
+	places->numeric_places = places_list;
+	places->nb_numeric_places = nb_places;
+	places->abstract_name = starpu_omp_place_numerical;
+}
+
+static void convert_places_string(const char *_str, struct starpu_omp_place *places)
+{
+	char *str = strdup(_str);
+	if (str == NULL)
+		_STARPU_ERROR("memory allocation failed\n");
+	remove_spaces(str);
+	if (str[0] != '\0')
+	{
+		/* check whether this is the start of an abstract name */
+		if (isalpha(str[0]) || (str[0] == '!' && isalpha(str[1])))
+		{
+			read_a_place_name(str, places);
+		}
+		/* else the string must contain a list of braces */
+		else
+		{
+			read_a_places_list(str, places);
+		}
+	}
+	free(str);
+}
+
+static void free_places(struct starpu_omp_place *places)
+{
+	int i;
+	for (i = 0; i < places->nb_numeric_places; i++)
+	{
+		if (places->numeric_places[i].nb_included_numeric_items > 0)
+		{
+			free(places->numeric_places[i].included_numeric_items);
+		}
+		if (places->numeric_places[i].nb_excluded_numeric_items > 0)
+		{
+			free(places->numeric_places[i].excluded_numeric_items);
+		}
+	}
+	if (places->nb_numeric_places > 0)
+	{
+		free(places->numeric_places);
+	}
+}
+
+static void read_omp_environment(void)
+{
+	read_boolean_var("OMP_DYNAMIC", &_initial_icv_values.dyn_var);
+	read_boolean_var("OMP_NESTED", &_initial_icv_values.nest_var);
+	read_sched_var("OMP_SCHEDULE", &_initial_icv_values.run_sched_var, &_initial_icv_values.run_sched_chunk_var);
+	read_size_var("OMP_STACKSIZE", &_initial_icv_values.stacksize_var);
+	read_wait_policy_var("OMP_WAIT_POLICY", &_initial_icv_values.wait_policy_var);
+	read_int_var("OMP_THREAD_LIMIT", &_initial_icv_values.thread_limit_var);
+	read_int_var("OMP_MAX_ACTIVE_LEVELS", &_initial_icv_values.max_active_levels_var);
+	read_boolean_var("OMP_CANCELLATION", &_initial_icv_values.cancel_var);
+	read_int_var("OMP_DEFAULT_DEVICE", &_initial_icv_values.default_device_var);
+
+	const int max_levels = _initial_icv_values.max_active_levels_var;
+
+	/* read OMP_PROC_BIND */
+	{
+		int *bind_list = malloc((1+max_levels) * sizeof(*bind_list));
+		if (bind_list == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		int level;
+		for (level = 0;level < max_levels+1;level++)
+		{
+			/* TODO: check what should be used as default value */
+			bind_list[level] = starpu_omp_proc_bind_undefined;
+		}
+		const char *env = getenv("OMP_PROC_BIND");
+		if (env)
+		{
+			convert_bind_string(env, bind_list, max_levels);
+		}
+		_initial_icv_values.bind_var = bind_list;
+	}
+
+	/* read OMP_NUM_THREADS */
+	{
+		int *num_threads_list = malloc((1+max_levels) * sizeof(*num_threads_list));
+		if (num_threads_list == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		int level;
+		for (level = 0;level < max_levels+1;level++)
+		{
+			/* TODO: check what should be used as default value */
+			num_threads_list[level] = 0;
+		}
+		const char *env = getenv("OMP_NUM_THREADS");
+		if (env)
+		{
+			convert_num_threads_string(env, num_threads_list, max_levels);
+		}
+		_initial_icv_values.nthreads_var = num_threads_list;
+	}
+
+	/* read OMP_PLACES */
+	{
+		memset(&_initial_icv_values.places, 0, sizeof(_initial_icv_values.places));
+		_initial_icv_values.places.abstract_name = starpu_omp_place_undefined;
+		const char *env = getenv("OMP_PLACES");
+		if (env)
+		{
+			convert_places_string(env, &_initial_icv_values.places);
+		}
+	}
+
+	_starpu_omp_initial_icv_values = &_initial_icv_values;
+}
+
+static void free_omp_environment(void)
+{
+	/**/
+	_starpu_omp_initial_icv_values = NULL;
+
+	/* OMP_DYNAMIC */
+	/* OMP_NESTED */
+	/* OMP_SCHEDULE */
+	/* OMP_STACKSIZE */
+	/* OMP_WAIT_POLICY */
+	/* OMP_THREAD_LIMIT */
+	/* OMP_MAX_ACTIVE_LEVELS */
+	/* OMP_CANCELLATION */
+	/* OMP_DEFAULT_DEVICE */
+
+	/* OMP_PROC_BIND */
+	free(_initial_icv_values.bind_var);
+	_initial_icv_values.bind_var = NULL;
+
+	/* OMP_NUM_THREADS */
+	free(_initial_icv_values.nthreads_var);
+	_initial_icv_values.nthreads_var = NULL;
+
+	/* OMP_PLACES */
+	free_places(&_initial_icv_values.places);
+}
+
+static void display_omp_environment(int verbosity_level)
+{
+	if (verbosity_level > 0)
+	{
+		printf("OPENMP DISPLAY ENVIRONMENT BEGIN\n");
+		printf("  _OPENMP='xxxxxx'\n");
+		printf("  [host] OMP_DYNAMIC='%s'\n", _starpu_omp_initial_icv_values->dyn_var?"true":"false");
+		printf("  [host] OMP_NESTED='%s'\n", _starpu_omp_initial_icv_values->nest_var?"true":"false");
+		printf("  [host] OMP_SCHEDULE='");
+		switch (_starpu_omp_initial_icv_values->run_sched_var)
+		{
+			case starpu_omp_sched_static:
+				printf("static, %llu", _starpu_omp_initial_icv_values->run_sched_chunk_var);
+				break;
+			case starpu_omp_sched_dynamic:
+				printf("dynamic, %llu", _starpu_omp_initial_icv_values->run_sched_chunk_var);
+				break;
+			case starpu_omp_sched_guided:
+				printf("guided, %llu", _starpu_omp_initial_icv_values->run_sched_chunk_var);
+				break;
+			case starpu_omp_sched_auto:
+				printf("auto, %llu", _starpu_omp_initial_icv_values->run_sched_chunk_var);
+				break;
+			default:
+				printf("<unknown>");
+				break;
+		}
+		printf("'\n");
+				
+		printf("  [host] OMP_STACKSIZE='%d'\n", _starpu_omp_initial_icv_values->stacksize_var);
+		printf("  [host] OMP_WAIT_POLICY='%s'\n", _starpu_omp_initial_icv_values->wait_policy_var?"active":"passive");
+		printf("  [host] OMP_MAX_ACTIVE_LEVELS='%d'\n", _starpu_omp_initial_icv_values->max_active_levels_var);
+		printf("  [host] OMP_CANCELLATION='%s'\n", _starpu_omp_initial_icv_values->cancel_var?"true":"false");
+		printf("  [host] OMP_DEFAULT_DEVICE='%d'\n", _starpu_omp_initial_icv_values->default_device_var);
+		printf("  [host] OMP_PROC_BIND='");
+		{
+			int level;
+			for (level = 0; level < _starpu_omp_initial_icv_values->max_active_levels_var; level++)
+			{
+				if (level > 0)
+				{
+					printf(", ");
+				}
+				switch (_starpu_omp_initial_icv_values->bind_var[level])
+				{
+					case starpu_omp_proc_bind_false:
+						printf("false");
+						break;
+					case starpu_omp_proc_bind_true:
+						printf("true");
+						break;
+					case starpu_omp_proc_bind_master:
+						printf("master");
+						break;
+					case starpu_omp_proc_bind_close:
+						printf("close");
+						break;
+					case starpu_omp_proc_bind_spread:
+						printf("spread");
+						break;
+					default:
+						printf("<unknown>");
+						break;
+				}
+			}
+		}
+		printf("'\n");
+		printf("  [host] OMP_NUM_THREADS='");
+		{
+			int level;
+			for (level = 0; level < _starpu_omp_initial_icv_values->max_active_levels_var; level++)
+			{
+				if (level > 0)
+				{
+					printf(", ");
+				}
+				printf("%d", _starpu_omp_initial_icv_values->nthreads_var[level]);
+			}
+		}
+		printf("'\n");
+		printf("  [host] OMP_PLACES='");
+		{
+			struct starpu_omp_place *places = &_starpu_omp_initial_icv_values->places;
+			if (places->nb_numeric_places > 0)
+			{
+				int p;
+				for (p = 0; p < places->nb_numeric_places; p++)
+				{
+					if (p > 0)
+					{
+						printf(",");
+					}
+					struct starpu_omp_numeric_place *np = &places->numeric_places[p];
+					if (np->excluded_place)
+					{
+						printf("!");
+					}
+					printf("{");
+					int i;
+					for (i = 0; i < np->nb_included_numeric_items; i++)
+					{
+						if (i > 0)
+						{
+							printf(",");
+						}
+						printf("%d", np->included_numeric_items[i]);
+					}
+					for (i = 0; i < np->nb_excluded_numeric_items; i++)
+					{
+						if (i > 0 || np->nb_included_numeric_items)
+						{
+							printf(",");
+						}
+						printf("!%d", np->excluded_numeric_items[i]);
+					}
+					printf("}");
+					/* TODO: print length/stride suffix */
+				}
+			}
+			else
+			{
+				if (places->abstract_excluded)
+				{
+					printf("!");
+				}
+				switch (places->abstract_name)
+				{
+					case starpu_omp_place_undefined:
+						printf("undefined");
+						break;
+					case starpu_omp_place_threads:
+						printf("threads");
+						break;
+					case starpu_omp_place_cores:
+						printf("cores");
+						break;
+					case starpu_omp_place_sockets:
+						printf("sockets");
+						break;
+					case starpu_omp_place_numerical:
+						printf("<numerical>");
+						break;
+					default:
+						printf("<unknown>");
+						break;
+				}
+				if (places->abstract_length)
+				{
+					printf("(%d)", places->abstract_length);
+				}
+			}
+		}
+		printf("'\n");
+
+		if (verbosity_level > 1)
+		{
+			/* no vendor specific runtime variable */
+		}
+		printf("OPENMP DISPLAY ENVIRONMENT END\n");
+	}
+}
+
+void _starpu_omp_environment_init(void)
+{
+	read_omp_environment();
+	int display_env = 0;
+	read_display_env_var("OMP_DISPLAY_ENV", &display_env);
+	if (display_env > 0)
+	{
+		display_omp_environment(display_env);
+	}
+}
+
+void _starpu_omp_environment_exit(void)
+{
+	free_omp_environment();
+}
+#endif /* STARPU_OPENMP */

+ 279 - 0
src/util/openmp_runtime_support_omp_api.c

@@ -0,0 +1,279 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#ifdef STARPU_OPENMP
+#include <util/openmp_runtime_support.h>
+
+#define __not_implemented__ do { fprintf (stderr, "omp lib function %s not implemented\n", __func__); abort(); } while (0)
+
+void starpu_omp_set_num_threads(int threads)
+{
+	STARPU_ASSERT(threads > 0);
+	struct starpu_omp_task *task = _starpu_omp_get_task();
+	STARPU_ASSERT(task != NULL);
+	struct starpu_omp_region *region;
+	region = task->owner_region;
+	STARPU_ASSERT(region != NULL);
+	region->icvs.nthreads_var[0] = threads;
+}
+
+int starpu_omp_get_num_threads()
+{
+	struct starpu_omp_task *task = _starpu_omp_get_task();
+	struct starpu_omp_region *region;
+	if (task == NULL)
+		return 1;
+
+	region = task->owner_region;
+	return region->nb_threads;
+}
+
+static int _starpu_omp_get_region_thread_num(const struct starpu_omp_region * const region)
+{
+	struct starpu_omp_thread *thread = _starpu_omp_get_thread();
+	STARPU_ASSERT(thread != NULL);
+	if (thread == region->master_thread)
+		return 0;
+	struct starpu_omp_thread * region_thread;
+	int tid = 1;
+	for (region_thread  = starpu_omp_thread_list_begin(region->thread_list);
+			region_thread != starpu_omp_thread_list_end(region->thread_list);
+			region_thread  = starpu_omp_thread_list_next(region_thread))
+	{
+		if (thread == region_thread)
+		{
+			return tid;
+		}
+		tid++;
+	}
+	_STARPU_ERROR("unrecognized omp thread\n");
+}
+
+int starpu_omp_get_thread_num()
+{
+	struct starpu_omp_task *task = _starpu_omp_get_task();
+	if (task == NULL)
+		return 0;
+	return _starpu_omp_get_region_thread_num(task->owner_region);
+}
+
+int starpu_omp_get_max_threads()
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	int max_threads = parallel_region->icvs.nthreads_var[0];
+	/* TODO: for now, nested parallel sections are not supported, thus we
+	 * open an active parallel section only if the generating region is the
+	 * initial region */
+	if (parallel_region->level > 0)
+	{
+		max_threads = 1;
+	}
+
+	return max_threads;
+}
+
+int starpu_omp_get_num_procs (void)
+{
+	/* starpu_cpu_worker_get_count defined as topology.ncpus */
+	return starpu_cpu_worker_get_count();
+}
+
+int starpu_omp_in_parallel (void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	return parallel_region->icvs.active_levels_var > 0;
+}
+
+void starpu_omp_set_dynamic (int dynamic_threads)
+{
+	(void) dynamic_threads;
+	/* TODO: dynamic adjustment of the number of threads is not supported for now */
+}
+
+int starpu_omp_get_dynamic (void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	return parallel_region->icvs.dyn_var;
+}
+
+void starpu_omp_set_nested (int nested)
+{
+	(void) nested;
+	/* TODO: nested parallelism not supported for now */
+}
+
+int starpu_omp_get_nested (void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	return parallel_region->icvs.nest_var;
+}
+
+int starpu_omp_get_cancellation(void)
+{
+	return _starpu_omp_global_state->icvs.cancel_var;
+}
+
+void starpu_omp_set_schedule (enum starpu_omp_sched_value kind, int modifier)
+{
+	struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	STARPU_ASSERT(     kind == starpu_omp_sched_static
+			|| kind == starpu_omp_sched_dynamic
+			|| kind == starpu_omp_sched_guided
+			|| kind == starpu_omp_sched_auto);
+	STARPU_ASSERT(modifier >= 0);
+	parallel_region->icvs.run_sched_var = kind;
+	parallel_region->icvs.run_sched_chunk_var = (unsigned long long)modifier;
+}
+
+void starpu_omp_get_schedule (enum starpu_omp_sched_value *kind, int *modifier)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	*kind = parallel_region->icvs.run_sched_var;
+	*modifier = (int)parallel_region->icvs.run_sched_chunk_var;
+}
+
+int starpu_omp_get_thread_limit (void)
+{
+	return starpu_cpu_worker_get_count();
+}
+
+void starpu_omp_set_max_active_levels (int max_levels)
+{
+	struct starpu_omp_device * const device = _starpu_omp_get_task()->owner_region->owner_device;
+	if (max_levels > 1)
+	{
+		/* TODO: nested parallelism not supported for now */
+		max_levels = 1;
+	}
+	device->icvs.max_active_levels_var = max_levels;
+}
+
+int starpu_omp_get_max_active_levels (void)
+{
+	const struct starpu_omp_device * const device = _starpu_omp_get_task()->owner_region->owner_device;
+	return device->icvs.max_active_levels_var;
+}
+
+int starpu_omp_get_level (void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	return parallel_region->icvs.levels_var;
+}
+
+int starpu_omp_get_ancestor_thread_num (int level)
+{
+	if (level == 0)
+		return 0;
+	const struct starpu_omp_task *task = _starpu_omp_get_task();
+	if (task == NULL)
+		return -1;
+	const struct starpu_omp_region *parallel_region = task->owner_region;
+	if (level < 0 || level > parallel_region->icvs.levels_var)
+		return -1;
+	while (level < parallel_region->icvs.levels_var)
+	{
+		parallel_region = parallel_region->parent_region;
+	}
+	return _starpu_omp_get_region_thread_num(parallel_region);
+}
+
+int starpu_omp_get_team_size (int level)
+{
+	if (level == 0)
+		return 1;
+	const struct starpu_omp_task *task = _starpu_omp_get_task();
+	if (task == NULL)
+		return -1;
+	const struct starpu_omp_region *parallel_region = task->owner_region;
+	if (level < 0 || level > parallel_region->icvs.levels_var)
+		return -1;
+	while (level < parallel_region->icvs.levels_var)
+	{
+		parallel_region = parallel_region->parent_region;
+	}
+	return parallel_region->nb_threads;
+}
+
+int starpu_omp_get_active_level (void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	return parallel_region->icvs.active_levels_var;
+}
+
+int starpu_omp_in_final(void)
+{
+	const struct starpu_omp_task *task = _starpu_omp_get_task();
+	return task->is_final;
+}
+
+enum starpu_omp_proc_bind_value starpu_omp_get_proc_bind(void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	int proc_bind = parallel_region->icvs.bind_var[0];
+	return proc_bind;
+}
+
+void starpu_omp_set_default_device(int device_num)
+{
+	(void) device_num;
+	/* TODO: set_default_device not supported for now */
+}
+
+int starpu_omp_get_default_device(void)
+{
+	const struct starpu_omp_region * const parallel_region = _starpu_omp_get_task()->owner_region;
+	return parallel_region->icvs.default_device_var;
+}
+
+int starpu_omp_get_num_devices(void)
+{
+	/* TODO: get_num_devices not supported for now
+	 * assume 1 device */
+	return 1;
+}
+
+int starpu_omp_get_num_teams(void)
+{
+	/* TODO: num_teams not supported for now
+	 * assume 1 team */
+	return 1;
+}
+
+int starpu_omp_get_team_num(void)
+{
+	/* TODO: team_num not supported for now
+	 * assume team_num 0 */
+	return 0;
+}
+
+int starpu_omp_is_initial_device(void)
+{
+	const struct starpu_omp_device * const device = _starpu_omp_get_task()->owner_region->owner_device;
+	return device == _starpu_omp_global_state->initial_device;
+}
+
+double starpu_omp_get_wtime (void)
+{
+	return 1e-6 * (starpu_timing_now() - _starpu_omp_clock_ref);
+}
+
+double starpu_omp_get_wtick (void)
+{
+	/* arbitrary precision value */
+	return 1e-6;
+}
+#endif /* STARPU_OPENMP */

+ 124 - 0
tests/Makefile.am

@@ -223,6 +223,37 @@ noinst_PROGRAMS =				\
 	microbenchs/redundant_buffer		\
 	microbenchs/local_pingpong		\
 	microbenchs/matrix_as_vector		\
+	openmp/init_exit_01			\
+	openmp/init_exit_02			\
+	openmp/environment			\
+	openmp/api_01				\
+	openmp/parallel_01			\
+	openmp/parallel_02			\
+	openmp/parallel_03			\
+	openmp/parallel_barrier_01		\
+	openmp/parallel_master_01		\
+	openmp/parallel_master_inline_01	\
+	openmp/parallel_single_wait_01		\
+	openmp/parallel_single_nowait_01	\
+	openmp/parallel_single_inline_01	\
+	openmp/parallel_single_copyprivate_01	\
+	openmp/parallel_single_copyprivate_inline_01	\
+	openmp/parallel_critical_01		\
+	openmp/parallel_critical_inline_01	\
+	openmp/parallel_critical_named_01	\
+	openmp/parallel_critical_named_inline_01\
+	openmp/parallel_simple_lock_01		\
+	openmp/parallel_nested_lock_01		\
+	openmp/parallel_for_01			\
+	openmp/parallel_for_02			\
+	openmp/parallel_for_ordered_01		\
+	openmp/parallel_sections_01		\
+	openmp/parallel_sections_combined_01	\
+	openmp/task_01				\
+	openmp/task_02				\
+	openmp/taskwait_01			\
+	openmp/taskgroup_01			\
+	openmp/taskgroup_02			\
 	overlap/overlap				\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
@@ -445,6 +476,99 @@ main_subgraph_repeat_regenerate_tag_SOURCES +=		\
 	main/increment.cu
 endif
 
+openmp_init_exit_01_SOURCES = 	\
+	openmp/init_exit_01.c
+
+openmp_init_exit_02_SOURCES = 	\
+	openmp/init_exit_02.c
+
+openmp_environment_SOURCES = 	\
+	openmp/environment.c
+
+openmp_api_01_SOURCES = 	\
+	openmp/api_01.c
+
+openmp_parallel_01_SOURCES = 	\
+	openmp/parallel_01.c
+
+openmp_parallel_02_SOURCES = 	\
+	openmp/parallel_02.c
+
+openmp_parallel_03_SOURCES = 	\
+	openmp/parallel_03.c
+
+openmp_parallel_barrier_01_SOURCES = 	\
+	openmp/parallel_barrier_01.c
+
+openmp_parallel_master_01_SOURCES = 	\
+	openmp/parallel_master_01.c
+
+openmp_parallel_master_inline_01_SOURCES = 	\
+	openmp/parallel_master_inline_01.c
+
+openmp_parallel_single_wait_01_SOURCES = 	\
+	openmp/parallel_single_wait_01.c
+
+openmp_parallel_single_nowait_01_SOURCES = 	\
+	openmp/parallel_single_nowait_01.c
+
+openmp_parallel_single_inline_01_SOURCES = 	\
+	openmp/parallel_single_inline_01.c
+
+openmp_parallel_single_copyprivate_01_SOURCES = 	\
+	openmp/parallel_single_copyprivate_01.c
+
+openmp_parallel_single_copyprivate_inline_01_SOURCES = 	\
+	openmp/parallel_single_copyprivate_inline_01.c
+
+openmp_parallel_critical_01_SOURCES = 	\
+	openmp/parallel_critical_01.c
+
+openmp_parallel_critical_inline_01_SOURCES = 	\
+	openmp/parallel_critical_inline_01.c
+
+openmp_parallel_critical_named_01_SOURCES = 	\
+	openmp/parallel_critical_named_01.c
+
+openmp_parallel_critical_named_inline_01_SOURCES = 	\
+	openmp/parallel_critical_named_inline_01.c
+
+openmp_parallel_simple_lock_01_SOURCES = 	\
+	openmp/parallel_simple_lock_01.c
+
+openmp_parallel_nested_lock_01_SOURCES = 	\
+	openmp/parallel_nested_lock_01.c
+
+openmp_parallel_for_01_SOURCES = 	\
+	openmp/parallel_for_01.c
+
+openmp_parallel_for_02_SOURCES = 	\
+	openmp/parallel_for_02.c
+
+openmp_parallel_for_ordered_01_SOURCES = 	\
+	openmp/parallel_for_ordered_01.c
+
+openmp_parallel_sections_01_SOURCES = 	\
+	openmp/parallel_sections_01.c
+
+openmp_parallel_sections_combined_01_SOURCES = 	\
+	openmp/parallel_sections_combined_01.c
+
+openmp_task_01_SOURCES = 	\
+	openmp/task_01.c
+
+openmp_task_02_SOURCES = 	\
+	openmp/task_02.c
+
+openmp_taskwait_01_SOURCES = 	\
+	openmp/taskwait_01.c
+
+openmp_taskgroup_01_SOURCES = 	\
+	openmp/taskgroup_01.c
+
+openmp_taskgroup_02_SOURCES = 	\
+	openmp/taskgroup_02.c
+
 ###################
 # Block interface #
 ###################

+ 135 - 0
tests/openmp/api_01.c

@@ -0,0 +1,135 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret;
+	/* we clear the whole OMP environment for this test, to check the
+	 * default behaviour of API functions */
+	unsetenv("OMP_DYNAMIC");
+	unsetenv("OMP_NESTED");
+	unsetenv("OMP_SCHEDULE");
+	unsetenv("OMP_STACKSIZE");
+	unsetenv("OMP_WAIT_POLICY");
+	unsetenv("OMP_THREAD_LIMIT");
+	unsetenv("OMP_MAX_ACTIVE_LEVELS");
+	unsetenv("OMP_CANCELLATION");
+	unsetenv("OMP_DEFAULT_DEVICE");
+	unsetenv("OMP_PROC_BIND");
+	unsetenv("OMP_NUM_THREADS");
+	unsetenv("OMP_PLACES");
+	unsetenv("OMP_DISPLAY_ENV");
+	ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+#define check_omp_func(f,_tv)					\
+{								\
+	const int v = (f());					\
+	const int tv = (_tv);					\
+	printf(#f ": %d (should be %d)\n", v, tv);		\
+	STARPU_ASSERT(v == tv);					\
+}
+
+const char * get_sched_name(int sched_value)
+{
+	const char *sched_name = NULL;
+
+	switch (sched_value)
+	{
+		case starpu_omp_sched_undefined: sched_name = "<undefined>"; break;
+		case starpu_omp_sched_static:    sched_name = "static"; break;
+		case starpu_omp_sched_dynamic:   sched_name = "dynamic"; break;
+		case starpu_omp_sched_guided:    sched_name = "guided"; break;
+		case starpu_omp_sched_auto:      sched_name = "auto"; break;
+		case starpu_omp_sched_runtime:   sched_name = "runtime"; break;
+		default: _STARPU_ERROR("invalid omp schedule value");
+	}
+	return sched_name;
+}
+
+int
+main (int argc, char *argv[])
+{
+	const int nb_cpus = starpu_cpu_worker_get_count();
+
+	check_omp_func(starpu_omp_get_num_threads, 1);
+	check_omp_func(starpu_omp_get_thread_num, 0);
+	/* since OMP_NUM_THREADS is cleared, starpu_omp_get_max_threads() should return nb_cpus */
+	check_omp_func(starpu_omp_get_max_threads, nb_cpus);
+	check_omp_func(starpu_omp_get_num_procs, nb_cpus);
+	check_omp_func(starpu_omp_in_parallel, 0);
+	check_omp_func(starpu_omp_get_dynamic, 0);
+	check_omp_func(starpu_omp_get_nested, 0);
+	check_omp_func(starpu_omp_get_cancellation, 0);
+	{
+		const enum starpu_omp_sched_value target_kind = starpu_omp_sched_static;
+		const int target_modifier = 0;
+		enum starpu_omp_sched_value kind;
+		int modifier;
+		const char *sched_name;
+		const char *target_sched_name;
+		starpu_omp_get_schedule(&kind, &modifier);
+		sched_name = get_sched_name(kind);
+		target_sched_name = get_sched_name(target_kind);
+		printf("starpu_omp_get_schedule: %s,%d (should be %s,%d)\n", sched_name, modifier, target_sched_name, target_modifier);
+		STARPU_ASSERT(kind == target_kind && modifier == target_modifier);
+	}
+	check_omp_func(starpu_omp_get_thread_limit, nb_cpus);
+	check_omp_func(starpu_omp_get_max_active_levels, 1);
+	check_omp_func(starpu_omp_get_level, 0);
+	{
+		const int tv = 0;
+		const int v = starpu_omp_get_ancestor_thread_num(0);
+		printf("starpu_omp_get_ancestor_thread_num(0): %d (should be %d)\n", v, tv);
+		STARPU_ASSERT(v == tv);
+	}
+	{
+		const int tv = 1;
+		const int v = starpu_omp_get_team_size(0);
+		printf("starpu_omp_get_team_size(0): %d (should be %d)\n", v, tv);
+		STARPU_ASSERT(v == tv);
+	}
+	check_omp_func(starpu_omp_get_active_level, 0);
+	check_omp_func(starpu_omp_in_final, 0);
+	check_omp_func(starpu_omp_get_proc_bind, starpu_omp_proc_bind_undefined);
+	check_omp_func(starpu_omp_get_default_device, 0);
+	/* TODO: support more than one device */
+	check_omp_func(starpu_omp_get_num_devices, 1);
+	check_omp_func(starpu_omp_get_num_teams, 1);
+	check_omp_func(starpu_omp_get_team_num, 0);
+	check_omp_func(starpu_omp_is_initial_device, 1);
+	return 0;
+}
+#endif

+ 48 - 0
tests/openmp/environment.c

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+int
+main (int argc, char *argv[]) {
+	setenv("OMP_DYNAMIC","false", 1);
+	setenv("OMP_NESTED","false", 1);
+	setenv("OMP_SCHEDULE","auto", 1);
+	setenv("OMP_STACKSIZE","2M", 1);
+	setenv("OMP_WAIT_POLICY","passive", 1);
+	setenv("OMP_THREAD_LIMIT","0", 1);
+	setenv("OMP_MAX_ACTIVE_LEVELS","4", 1);
+	setenv("OMP_CANCELLATION","false", 1);
+	setenv("OMP_DEFAULT_DEVICE","0", 1);
+	setenv("OMP_PROC_BIND","spread, spread, close", 1);
+	setenv("OMP_NUM_THREADS","4, 16, 2", 1);
+	setenv("OMP_PLACES","{1,2,3,4},{5,6,7,8}", 1);
+	setenv("OMP_DISPLAY_ENV","verbose", 1);
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+	starpu_omp_shutdown();
+	return 0;
+}
+#endif

+ 34 - 0
tests/openmp/init_exit_01.c

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+int
+main (int argc, char *argv[]) {
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+	starpu_omp_shutdown();
+	return 0;
+}
+#endif

+ 44 - 0
tests/openmp/init_exit_02.c

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+int
+main (int argc, char *argv[]) {
+	return 0;
+}
+#endif

+ 62 - 0
tests/openmp/parallel_01.c

@@ -0,0 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 79 - 0
tests/openmp/parallel_02.c

@@ -0,0 +1,79 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_2_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] parallel region 2: task thread = %d\n", (void *)tid, worker_id);
+}
+
+void parallel_region_1_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	struct starpu_omp_parallel_region_attr attr;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] parallel region 1: task thread = %d\n", (void *)tid, worker_id);
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_2_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_1_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 63 - 0
tests/openmp/parallel_03.c

@@ -0,0 +1,63 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 71 - 0
tests/openmp/parallel_barrier_01.c

@@ -0,0 +1,71 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- barrier 1\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+	printf("[tid %p] task thread = %d -- barrier 2\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+	printf("[tid %p] task thread = %d -- barrier 3\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+	printf("[tid %p] task thread = %d -- barrier 4\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+}
+
+int
+main (int argc, char *argv[]) {
+	pthread_t tid;
+	struct starpu_omp_parallel_region_attr attr;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 83 - 0
tests/openmp/parallel_critical_01.c

@@ -0,0 +1,83 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void critical_g(void *arg)
+{
+	(void) arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- critical\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_critical(critical_g, NULL, NULL);
+	starpu_omp_critical(critical_g, NULL, NULL);
+	starpu_omp_critical(critical_g, NULL, NULL);
+	starpu_omp_critical(critical_g, NULL, NULL);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	printf("<main>\n");
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 86 - 0
tests/openmp/parallel_critical_inline_01.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+
+	starpu_omp_critical_inline_begin(NULL);
+	printf("[tid %p] task thread = %d -- critical\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end(NULL);
+
+	starpu_omp_critical_inline_begin(NULL);
+	printf("[tid %p] task thread = %d -- critical\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end(NULL);
+
+	starpu_omp_critical_inline_begin(NULL);
+	printf("[tid %p] task thread = %d -- critical\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end(NULL);
+
+	starpu_omp_critical_inline_begin(NULL);
+	printf("[tid %p] task thread = %d -- critical\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end(NULL);
+
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	printf("<main>\n");
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 93 - 0
tests/openmp/parallel_critical_named_01.c

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void critical_g(void *arg)
+{
+	(void) arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- critical \"g\"\n", (void *)tid, worker_id);
+}
+
+void critical_h(void *arg)
+{
+	(void) arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- critical \"h\"\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_critical(critical_g, NULL, "g");
+	starpu_omp_critical(critical_h, NULL, "h");
+	starpu_omp_critical(critical_g, NULL, "g");
+	starpu_omp_critical(critical_h, NULL, "h");
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	printf("<main>\n");
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 86 - 0
tests/openmp/parallel_critical_named_inline_01.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+
+	starpu_omp_critical_inline_begin("g");
+	printf("[tid %p] task thread = %d -- critical \"g\"\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end("g");
+
+	starpu_omp_critical_inline_begin("h");
+	printf("[tid %p] task thread = %d -- critical \"h\"\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end("h");
+
+	starpu_omp_critical_inline_begin("g");
+	printf("[tid %p] task thread = %d -- critical \"g\"\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end("g");
+
+	starpu_omp_critical_inline_begin("h");
+	printf("[tid %p] task thread = %d -- critical \"h\"\n", (void *)tid, worker_id);
+	starpu_omp_critical_inline_end("h");
+
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	printf("<main>\n");
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 189 - 0
tests/openmp/parallel_for_01.c

@@ -0,0 +1,189 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+#define NB_ITERS 256
+#define CHUNK 16
+unsigned long long array[NB_ITERS];
+
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void for_g(unsigned long long i, unsigned long long nb_i, void *arg)
+{
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d, for [%s] iterations first=%llu:nb=%llu\n", (void *)tid, worker_id, (const char *)arg, i, nb_i);
+	for (; nb_i > 0; i++, nb_i--)
+	{
+		array[i] = 1;
+	}
+}
+
+void parallel_region_1_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"static chunk", NB_ITERS, CHUNK, starpu_omp_sched_static, 0, 0);
+}
+
+void parallel_region_2_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"static nochunk", NB_ITERS, 0, starpu_omp_sched_static, 0, 0);
+}
+
+void parallel_region_3_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"dynamic chunk", NB_ITERS, CHUNK, starpu_omp_sched_dynamic, 0, 0);
+}
+
+void parallel_region_4_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"dynamic nochunk", NB_ITERS, 0, starpu_omp_sched_dynamic, 0, 0);
+}
+
+void parallel_region_5_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"guided nochunk", NB_ITERS, 0, starpu_omp_sched_guided, 0, 0);
+}
+
+void parallel_region_6_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"guided nochunk", NB_ITERS, 0, starpu_omp_sched_guided, 0, 0);
+}
+
+static void clear_array(void)
+{
+	memset(array, 0, NB_ITERS*sizeof(unsigned long long));
+}
+
+static void check_array(void)
+{
+	unsigned long long i;
+	unsigned long long s = 0;
+	for (i = 0; i < NB_ITERS; i++)
+	{
+		s += array[i];
+	}
+	if (s != NB_ITERS)
+	{
+		printf("missing iterations\n");
+		exit(1);
+	}
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_1_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_2_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_3_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_4_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_5_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_6_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+	return 0;
+}
+#endif

+ 90 - 0
tests/openmp/parallel_for_02.c

@@ -0,0 +1,90 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+#define NB_ITERS 4321
+#define CHUNK 42
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void for_g(unsigned long long i, unsigned long long nb_i, void *arg)
+{
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d, for [%s] iterations first=%llu:nb=%llu\n", (void *)tid, worker_id, (const char *)arg, i, nb_i);
+	for (; nb_i > 0; i++, nb_i--)
+	{
+		printf("[tid %p] task thread = %d, for [%s] iteration %llu\n", (void *)tid, worker_id, (const char *)arg, i);
+	}
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"static chunk", NB_ITERS, CHUNK, starpu_omp_sched_static, 0, 1);
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"static nochunk", NB_ITERS, 0, starpu_omp_sched_static, 0, 1);
+	
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"dynamic chunk", NB_ITERS, CHUNK, starpu_omp_sched_dynamic, 0, 1);
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"dynamic nochunk", NB_ITERS, 0, starpu_omp_sched_dynamic, 0, 1);
+
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"guided chunk", NB_ITERS, CHUNK, starpu_omp_sched_guided, 0, 1);
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"guided nochunk", NB_ITERS, 0, starpu_omp_sched_guided, 0, 1);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 207 - 0
tests/openmp/parallel_for_ordered_01.c

@@ -0,0 +1,207 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+#define NB_ITERS 256
+#define CHUNK 16
+unsigned long long array[NB_ITERS];
+
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+struct s_ordered_arg
+{
+	const char *msg;
+	unsigned long long i;
+};
+
+void ordered_f(void *_arg)
+{
+	struct s_ordered_arg *arg = _arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d, for [%s] iteration (ordered) %llu\n", (void *)tid, worker_id, arg->msg, arg->i);
+}
+
+void for_g(unsigned long long i, unsigned long long nb_i, void *arg)
+{
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d, for [%s] iterations first=%llu:nb=%llu\n", (void *)tid, worker_id, (const char *)arg, i, nb_i);
+	for (; nb_i > 0; i++, nb_i--)
+	{
+		struct s_ordered_arg ordered_arg = { arg, i };
+		array[i] = 1;
+		starpu_omp_ordered(ordered_f, &ordered_arg);
+	}
+}
+
+void parallel_region_1_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"static chunk", NB_ITERS, CHUNK, starpu_omp_sched_static, 1, 0);
+}
+
+void parallel_region_2_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"static nochunk", NB_ITERS, 0, starpu_omp_sched_static, 1, 0);
+}
+
+void parallel_region_3_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"dynamic chunk", NB_ITERS, CHUNK, starpu_omp_sched_dynamic, 1, 0);
+}
+
+void parallel_region_4_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"dynamic nochunk", NB_ITERS, 0, starpu_omp_sched_dynamic, 1, 0);
+}
+
+void parallel_region_5_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"guided nochunk", NB_ITERS, 0, starpu_omp_sched_guided, 1, 0);
+}
+
+void parallel_region_6_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+	starpu_omp_for(for_g, (void*)"guided nochunk", NB_ITERS, 0, starpu_omp_sched_guided, 1, 0);
+}
+
+static void clear_array(void)
+{
+	memset(array, 0, NB_ITERS*sizeof(unsigned long long));
+}
+
+static void check_array(void)
+{
+	unsigned long long i;
+	unsigned long long s = 0;
+	for (i = 0; i < NB_ITERS; i++)
+	{
+		s += array[i];
+	}
+	if (s != NB_ITERS)
+	{
+		printf("missing iterations\n");
+		exit(1);
+	}
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_1_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_2_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_3_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_4_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_5_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+
+	clear_array();
+	attr.cl.cpu_funcs[0] = parallel_region_6_f;
+	starpu_omp_parallel_region(&attr);
+	check_array();
+	return 0;
+}
+#endif

+ 83 - 0
tests/openmp/parallel_master_01.c

@@ -0,0 +1,83 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void master_g(void *arg)
+{
+	(void) arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_master(master_g, NULL);
+	starpu_omp_master(master_g, NULL);
+	starpu_omp_master(master_g, NULL);
+	starpu_omp_master(master_g, NULL);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 77 - 0
tests/openmp/parallel_master_inline_01.c

@@ -0,0 +1,77 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	if (starpu_omp_master_inline())
+		printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
+	if (starpu_omp_master_inline())
+		printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
+	if (starpu_omp_master_inline())
+		printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
+	if (starpu_omp_master_inline())
+		printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 117 - 0
tests/openmp/parallel_nested_lock_01.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+starpu_omp_nest_lock_t omp_nest_lock;
+
+void locked_func_n2(void)
+{
+	const int worker_id = starpu_worker_get_id();
+	const pthread_t tid = pthread_self();
+	printf("[tid %p] task thread = %d -- locked function n2\n", (void *)tid, worker_id);
+}
+void locked_func_n1(void)
+{
+	const int worker_id = starpu_worker_get_id();
+	const pthread_t tid = pthread_self();
+	printf("[tid %p] task thread = %d -- locked function n1 -->\n", (void *)tid, worker_id);
+	starpu_omp_set_nest_lock(&omp_nest_lock);
+	locked_func_n2();
+	starpu_omp_unset_nest_lock(&omp_nest_lock);
+	printf("[tid %p] task thread = %d -- locked function n1 <--\n", (void *)tid, worker_id);
+}
+
+void master_g1(void *arg)
+{
+	starpu_omp_init_nest_lock(&omp_nest_lock);
+}
+
+void master_g2(void *arg)
+{
+	starpu_omp_destroy_nest_lock(&omp_nest_lock);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	const int worker_id = starpu_worker_get_id();
+	const pthread_t tid = pthread_self();
+	(void) buffers;
+	(void) args;
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_master(master_g1, NULL);
+	starpu_omp_barrier();
+
+	starpu_omp_set_nest_lock(&omp_nest_lock);
+	locked_func_n1();
+	starpu_omp_unset_nest_lock(&omp_nest_lock);
+
+	starpu_omp_set_nest_lock(&omp_nest_lock);
+	locked_func_n1();
+	starpu_omp_unset_nest_lock(&omp_nest_lock);
+
+	starpu_omp_set_nest_lock(&omp_nest_lock);
+	locked_func_n1();
+	starpu_omp_unset_nest_lock(&omp_nest_lock);
+
+	starpu_omp_set_nest_lock(&omp_nest_lock);
+	locked_func_n1();
+	starpu_omp_unset_nest_lock(&omp_nest_lock);
+
+	starpu_omp_barrier();
+	starpu_omp_master(master_g2, NULL);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	printf("<main>\n");
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 106 - 0
tests/openmp/parallel_sections_01.c

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void f(void *arg)
+{
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d, section [%s]\n", (void *)tid, worker_id, (const char *)arg);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	void (*section_f[4])(void *);
+	void *section_args[4];
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+
+	section_f[0] = f;
+	section_f[1] = f;
+	section_f[2] = f;
+	section_f[3] = f;
+
+	section_args[0] = (void *)"A";
+	section_args[1] = (void *)"B";
+	section_args[2] = (void *)"C";
+	section_args[3] = (void *)"D";
+
+	starpu_omp_sections(4, section_f, section_args, 0);
+
+	section_args[0] = (void *)"E";
+	section_args[1] = (void *)"F";
+	section_args[2] = (void *)"G";
+	section_args[3] = (void *)"H";
+
+	starpu_omp_sections(4, section_f, section_args, 0);
+
+	section_args[0] = (void *)"I";
+	section_args[1] = (void *)"J";
+	section_args[2] = (void *)"K";
+	section_args[3] = (void *)"L";
+
+	starpu_omp_sections(4, section_f, section_args, 0);
+
+	section_args[0] = (void *)"M";
+	section_args[1] = (void *)"N";
+	section_args[2] = (void *)"O";
+	section_args[3] = (void *)"P";
+
+	starpu_omp_sections(4, section_f, section_args, 0);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 100 - 0
tests/openmp/parallel_sections_combined_01.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void f(unsigned long long section_num, void *arg)
+{
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d, section [%llu: %s]\n", (void *)tid, worker_id, section_num, (const char *)arg);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	void *section_args[4];
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d\n", (void *)tid, worker_id);
+
+	section_args[0] = (void *)"A";
+	section_args[1] = (void *)"B";
+	section_args[2] = (void *)"C";
+	section_args[3] = (void *)"D";
+
+	starpu_omp_sections_combined(4, f, section_args, 0);
+
+	section_args[0] = (void *)"E";
+	section_args[1] = (void *)"F";
+	section_args[2] = (void *)"G";
+	section_args[3] = (void *)"H";
+
+	starpu_omp_sections_combined(4, f, section_args, 0);
+
+	section_args[0] = (void *)"I";
+	section_args[1] = (void *)"J";
+	section_args[2] = (void *)"K";
+	section_args[3] = (void *)"L";
+
+	starpu_omp_sections_combined(4, f, section_args, 0);
+
+	section_args[0] = (void *)"M";
+	section_args[1] = (void *)"N";
+	section_args[2] = (void *)"O";
+	section_args[3] = (void *)"P";
+
+	starpu_omp_sections_combined(4, f, section_args, 0);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 107 - 0
tests/openmp/parallel_simple_lock_01.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+starpu_omp_lock_t omp_lock;
+
+void locked_func(void)
+{
+	const int worker_id = starpu_worker_get_id();
+	const pthread_t tid = pthread_self();
+	printf("[tid %p] task thread = %d -- locked function\n", (void *)tid, worker_id);
+}
+
+void master_g1(void *arg)
+{
+	starpu_omp_init_lock(&omp_lock);
+}
+
+void master_g2(void *arg)
+{
+	starpu_omp_destroy_lock(&omp_lock);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	const int worker_id = starpu_worker_get_id();
+	const pthread_t tid = pthread_self();
+	(void) buffers;
+	(void) args;
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_master(master_g1, NULL);
+	starpu_omp_barrier();
+
+	starpu_omp_set_lock(&omp_lock);
+	locked_func();
+	starpu_omp_unset_lock(&omp_lock);
+
+	starpu_omp_set_lock(&omp_lock);
+	locked_func();
+	starpu_omp_unset_lock(&omp_lock);
+
+	starpu_omp_set_lock(&omp_lock);
+	locked_func();
+	starpu_omp_unset_lock(&omp_lock);
+
+	starpu_omp_set_lock(&omp_lock);
+	locked_func();
+	starpu_omp_unset_lock(&omp_lock);
+
+	starpu_omp_barrier();
+	starpu_omp_master(master_g2, NULL);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	printf("<main>\n");
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 91 - 0
tests/openmp/parallel_single_copyprivate_01.c

@@ -0,0 +1,91 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void single_g(void *arg, void *_data, unsigned long long data_size)
+{
+	(void) arg;
+	int *data = _data;
+	STARPU_ASSERT(data_size >= sizeof(*data));
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	*data = worker_id;
+	printf("[tid %p] task thread = %d -- single\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	int single_worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_single_copyprivate(single_g, NULL, &single_worker_id, sizeof(single_worker_id));
+	printf("[tid %p] task thread = %d -- copyprivate: single_worker_id = %d\n", (void *)tid, worker_id, single_worker_id);
+	starpu_omp_single_copyprivate(single_g, NULL, &single_worker_id, sizeof(single_worker_id));
+	printf("[tid %p] task thread = %d -- copyprivate: single_worker_id = %d\n", (void *)tid, worker_id, single_worker_id);
+	starpu_omp_single_copyprivate(single_g, NULL, &single_worker_id, sizeof(single_worker_id));
+	printf("[tid %p] task thread = %d -- copyprivate: single_worker_id = %d\n", (void *)tid, worker_id, single_worker_id);
+	starpu_omp_single_copyprivate(single_g, NULL, &single_worker_id, sizeof(single_worker_id));
+	printf("[tid %p] task thread = %d -- copyprivate: single_worker_id = %d\n", (void *)tid, worker_id, single_worker_id);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 88 - 0
tests/openmp/parallel_single_copyprivate_inline_01.c

@@ -0,0 +1,88 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	int single_worker_id;
+	int *single_data;
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	int i;
+
+	for (i=0; i<4; i++)
+	{
+		if ((single_data = starpu_omp_single_copyprivate_inline_begin(&single_worker_id)) == NULL)
+		{
+			printf("[tid %p] task thread = %d -- single\n", (void *)tid, worker_id);
+			single_worker_id = worker_id;
+		}
+		else
+		{
+			memcpy(&single_worker_id, single_data, sizeof(single_worker_id));
+		}
+		starpu_omp_single_copyprivate_inline_end();
+		printf("[tid %p] task thread = %d -- single_worker_id = %d\n", (void *)tid, worker_id, single_worker_id);
+	}
+
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 94 - 0
tests/openmp/parallel_single_inline_01.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	
+	/* nowait = 0 */
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	starpu_omp_barrier();
+
+	/* nowait = 1 */
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+	if (starpu_omp_single_inline())
+		printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 83 - 0
tests/openmp/parallel_single_nowait_01.c

@@ -0,0 +1,83 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void single_g(void *arg)
+{
+	(void) arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- single nowait\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_single(single_g, NULL, 1);
+	starpu_omp_single(single_g, NULL, 1);
+	starpu_omp_single(single_g, NULL, 1);
+	starpu_omp_single(single_g, NULL, 1);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 83 - 0
tests/openmp/parallel_single_wait_01.c

@@ -0,0 +1,83 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void single_g(void *arg)
+{
+	(void) arg;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- single\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
+	starpu_omp_single(single_g, NULL, 0);
+	starpu_omp_single(single_g, NULL, 0);
+	starpu_omp_single(single_g, NULL, 0);
+	starpu_omp_single(single_g, NULL, 0);
+	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	pthread_t tid;
+	tid = pthread_self();
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	starpu_omp_parallel_region(&attr);
+	printf("<main>\n");
+	return 0;
+}
+#endif

+ 87 - 0
tests/openmp/task_01.c

@@ -0,0 +1,87 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: explicit task \"g\"\n", (void *)tid, worker_id);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	struct starpu_omp_task_region_attr attr;
+
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: implicit task \"f\"\n", (void *)tid, worker_id);
+	
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+	starpu_omp_task_region(&attr);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 196 - 0
tests/openmp/task_02.c

@@ -0,0 +1,196 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+#define	NX	64
+int global_vector[NX];
+
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void task_region_h(void *buffers[], void *args)
+{
+	struct starpu_vector_interface *_vector = buffers[0];
+	int nx = STARPU_VECTOR_GET_NX(_vector);
+	int *v = (int *)STARPU_VECTOR_GET_PTR(_vector);
+	int f = (int)(intptr_t)args;
+	int i;
+
+	printf("depth 2 task, entry: vector ptr = %p\n", v);
+
+	for (i = 0; i < nx; i++)
+	{
+                v[i] += f;
+	}
+
+	printf("depth 2 task ending\n");
+}
+
+void task_region_g(void *buffers[], void *args)
+{
+	struct starpu_vector_interface *_vector = buffers[0];
+
+	int nx = STARPU_VECTOR_GET_NX(_vector);
+	int *v = (int *)STARPU_VECTOR_GET_PTR(_vector);
+	int f = (int)(intptr_t)args;
+	
+	printf("depth 1 task, entry: vector ptr = %p\n", v);
+
+	{
+		starpu_data_handle_t task_vector_handle;
+		int i;
+
+		for (i = 0; i < nx; i++)
+		{
+			v[i] += f;
+		}
+
+		starpu_vector_data_register(&task_vector_handle, STARPU_MAIN_RAM, (uintptr_t)v, NX, sizeof(v[0]));
+		printf("depth 1 task, block 1: task_vector_handle = %p\n", task_vector_handle);
+	}
+
+	{
+		starpu_data_handle_t task_vector_handle;
+		struct starpu_omp_task_region_attr attr;
+		int i;
+
+		task_vector_handle = starpu_data_lookup(v);
+		printf("depth 1 task, block 2: task_vector_handle = %p\n", task_vector_handle);
+
+		memset(&attr, 0, sizeof(attr));
+		attr.cl.cpu_funcs[0]  = task_region_h;
+		attr.cl.where         = STARPU_CPU;
+		attr.cl.nbuffers      = 1;
+		attr.cl.modes[0]      = STARPU_RW;
+		attr.handles          = &task_vector_handle;
+		attr.cl_arg_size      = sizeof(void *);
+		attr.cl_arg_free      = 0;
+		attr.if_clause        = 1;
+		attr.final_clause     = 0;
+		attr.untied_clause    = 1;
+		attr.mergeable_clause = 0;
+
+		i = 0;
+
+		attr.cl_arg = (void *)(intptr_t)i++;
+		starpu_omp_task_region(&attr);
+		attr.cl_arg = (void *)(intptr_t)i++;
+		starpu_omp_task_region(&attr);
+	}
+
+	starpu_omp_taskwait();
+}
+
+void master_g1(void *arg)
+{
+	starpu_data_handle_t region_vector_handle;
+	int i;
+
+	printf("master_g1: vector ptr = %p\n", global_vector);
+	for (i = 0; i < NX; i++)
+	{
+		global_vector[i] = 1;
+	}
+
+	starpu_vector_data_register(&region_vector_handle, STARPU_MAIN_RAM, (uintptr_t)global_vector, NX, sizeof(global_vector[0]));
+	printf("master_g1: region_vector_handle = %p\n", region_vector_handle);
+}
+
+void master_g2(void *arg)
+{
+	starpu_data_handle_t region_vector_handle;
+	struct starpu_omp_task_region_attr attr;
+	int i;
+
+	region_vector_handle = starpu_data_lookup(global_vector);
+	printf("master_g2: region_vector_handle = %p\n", region_vector_handle);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.cl.nbuffers      = 1;
+	attr.cl.modes[0]      = STARPU_RW;
+	attr.handles          = &region_vector_handle;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	i = 0;
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	starpu_omp_master(master_g1, NULL);
+	starpu_omp_barrier();
+	{
+		starpu_data_handle_t region_vector_handle;
+		region_vector_handle = starpu_data_lookup(global_vector);
+		printf("parallel_region block 1: region_vector_handle = %p\n", region_vector_handle);
+	}
+	starpu_omp_barrier();
+	starpu_omp_master(master_g2, NULL);
+	starpu_omp_barrier();
+	{
+		starpu_data_handle_t region_vector_handle;
+		region_vector_handle = starpu_data_lookup(global_vector);
+		printf("parallel_region block 2: region_vector_handle = %p\n", region_vector_handle);
+	}
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 119 - 0
tests/openmp/taskgroup_01.c

@@ -0,0 +1,119 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	int i = (int)(intptr_t) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: explicit task \"g[%d]\"\n", (void *)tid, worker_id, i);
+}
+
+void taskgroup_f(void *arg)
+{
+	struct starpu_omp_task_region_attr attr;
+	int *p_i = (int *)arg;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	attr.cl_arg = (void *)(intptr_t)(*p_i)++;
+	starpu_omp_task_region(&attr);
+
+	attr.cl_arg = (void *)(intptr_t)(*p_i)++;
+	starpu_omp_task_region(&attr);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	struct starpu_omp_task_region_attr attr;
+	int i = 0;
+
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: implicit task \"f\"\n", (void *)tid, worker_id);
+	
+	starpu_omp_taskgroup(taskgroup_f, (void *)&i);
+	printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id);
+
+	starpu_omp_taskgroup(taskgroup_f, (void *)&i);
+	printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 123 - 0
tests/openmp/taskgroup_02.c

@@ -0,0 +1,123 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	int i = (int)(intptr_t) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: explicit task \"g[%d]\"\n", (void *)tid, worker_id, i);
+}
+
+void taskgroup_f(void *arg)
+{
+	struct starpu_omp_task_region_attr attr;
+	int *p_i = (int *)arg;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	attr.cl_arg = (void *)(intptr_t)(*p_i)++;
+	starpu_omp_task_region(&attr);
+
+	attr.cl_arg = (void *)(intptr_t)(*p_i)++;
+	starpu_omp_task_region(&attr);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	struct starpu_omp_task_region_attr attr;
+	int i = 0;
+
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: implicit task \"f\"\n", (void *)tid, worker_id);
+	
+	starpu_omp_taskgroup_inline_begin();
+	taskgroup_f((void *)&i);
+	starpu_omp_taskgroup_inline_end();
+	printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id);
+
+	starpu_omp_taskgroup_inline_begin();
+	taskgroup_f((void *)&i);
+	starpu_omp_taskgroup_inline_end();
+	printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif

+ 105 - 0
tests/openmp/taskwait_01.c

@@ -0,0 +1,105 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+	int ret = starpu_omp_init();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+	starpu_omp_shutdown();
+}
+
+void task_region_g(void *buffers[], void *args)
+{
+	(void) buffers;
+	int i = (int)(intptr_t) args;
+	int worker_id;
+	pthread_t tid;
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: explicit task \"g[%d]\"\n", (void *)tid, worker_id, i);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	int worker_id;
+	pthread_t tid;
+	struct starpu_omp_task_region_attr attr;
+	int i = 0;
+
+	tid = pthread_self();
+	worker_id = starpu_worker_get_id();
+	printf("[tid %p] task thread = %d: implicit task \"f\"\n", (void *)tid, worker_id);
+	
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CPU;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	starpu_omp_taskwait();
+	printf("[tid %p] task thread = %d: implicit task \"f\": taskwait\n", (void *)tid, worker_id);
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	starpu_omp_taskwait();
+	printf("[tid %p] task thread = %d: implicit task \"f\": taskwait\n", (void *)tid, worker_id);
+
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+	attr.cl_arg = (void *)(intptr_t)i++;
+	starpu_omp_task_region(&attr);
+}
+
+int
+main (int argc, char *argv[]) {
+	struct starpu_omp_parallel_region_attr attr;
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+	return 0;
+}
+#endif