Преглед изворни кода

Merge branch 'master' of https://scm.gforge.inria.fr/authscm/ajuven/git/starpu/starpu

Alexis Juven пре 6 година
родитељ
комит
29b4db4b7e

+ 16 - 0
include/starpu_openmp.h

@@ -69,6 +69,20 @@ struct starpu_omp_task_region_attr
 	int final_clause;
 	int untied_clause;
 	int mergeable_clause;
+
+   /*
+    * taskloop attribute
+    * */
+   int is_loop;
+   int nogroup_clause;
+
+   int collapse;
+   int num_tasks;
+   unsigned long long nb_iterations;
+   unsigned long long grainsize;
+   unsigned long long begin_i;
+   unsigned long long end_i;
+   unsigned long long chunk;
 };
 
 #ifdef __cplusplus
@@ -104,6 +118,8 @@ extern void starpu_omp_taskwait(void) __STARPU_OMP_NOTHROW;
 extern void starpu_omp_taskgroup(void (*f)(void *arg), void *arg) __STARPU_OMP_NOTHROW;
 extern void starpu_omp_taskgroup_inline_begin(void) __STARPU_OMP_NOTHROW;
 extern void starpu_omp_taskgroup_inline_end(void) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_taskloop_inline_begin(struct starpu_omp_task_region_attr *attr) __STARPU_OMP_NOTHROW;
+extern void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_attr *attr) __STARPU_OMP_NOTHROW;
 
 extern void starpu_omp_for(void (*f)(unsigned long long _first_i, unsigned long long _nb_i, void *arg), void *arg, unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, int nowait) __STARPU_OMP_NOTHROW;
 extern int starpu_omp_for_inline_first(unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_first_i, unsigned long long *_nb_i) __STARPU_OMP_NOTHROW;

+ 29 - 4
src/datawizard/malloc.c

@@ -129,6 +129,21 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 	return _starpu_malloc_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
 }
 
+/* Return whether we should pin the allocated data */
+static int _starpu_malloc_should_pin(int flags)
+{
+	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0)
+	{
+		if (_starpu_can_submit_cuda_task())
+		{
+			return 1;
+		}
+//		if (_starpu_can_submit_opencl_task())
+//			return 1;
+	}
+	return 0;
+}
+
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
 {
 	int ret=0;
@@ -159,7 +174,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 			starpu_memory_allocate(dst_node, dim, flags | STARPU_MEMORY_OVERFLOW);
 	}
 
-	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
+	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 		if (_starpu_can_submit_cuda_task())
 		{
@@ -424,7 +439,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags)
 {
-	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
+	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 		if (_starpu_can_submit_cuda_task())
 		{
@@ -944,11 +959,21 @@ static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node, int flags)
 	return chunk;
 }
 
+/* Return whether we should use our suballocator */
+static int _starpu_malloc_should_suballoc(unsigned dst_node, size_t size, int flags)
+{
+	return size <= CHUNK_ALLOC_MAX &&
+			(   starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM
+			|| (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM
+			    && _starpu_malloc_should_pin(flags))
+			);
+}
+
 uintptr_t
 starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
 {
 	/* Big allocation, allocate normally */
-	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
+	if (!_starpu_malloc_should_suballoc(dst_node, size, flags))
 		return _starpu_malloc_on_node(dst_node, size, flags);
 
 	/* Round up allocation to block size */
@@ -1046,7 +1071,7 @@ void
 starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
 	/* Big allocation, deallocate normally */
-	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
+	if (!_starpu_malloc_should_suballoc(dst_node, size, flags))
 	{
 		_starpu_free_on_node_flags(dst_node, addr, size, flags);
 		return;

+ 72 - 16
src/util/openmp_runtime_support.c

@@ -381,24 +381,29 @@ static void starpu_omp_explicit_task_entry(struct starpu_omp_task *task)
 {
 	STARPU_ASSERT(!(task->flags & STARPU_OMP_TASK_FLAGS_IMPLICIT));
 	struct _starpu_worker *starpu_worker = _starpu_get_local_worker_key();
-	if (starpu_worker->arch == STARPU_CPU_WORKER)
-	{
-		task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
-	}
+   /* XXX on work */
+   if (task->is_loop) {
+      starpu_omp_for_inline_first_alt(task->nb_iterations, task->chunk, starpu_omp_sched_static, 1, &task->begin_i, &task->end_i);
+   }
+   if (starpu_worker->arch == STARPU_CPU_WORKER)
+   {
+      task->cpu_f(task->starpu_buffers, task->starpu_cl_arg);
+   }
 #ifdef STARPU_USE_CUDA
-	else if (starpu_worker->arch == STARPU_CUDA_WORKER)
-	{
-		task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
-	}
+   else if (starpu_worker->arch == STARPU_CUDA_WORKER)
+   {
+      task->cuda_f(task->starpu_buffers, task->starpu_cl_arg);
+   }
 #endif
 #ifdef STARPU_USE_OPENCL
-	else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
-	{
-		task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
-	}
+   else if (starpu_worker->arch == STARPU_OPENCL_WORKER)
+   {
+      task->opencl_f(task->starpu_buffers, task->starpu_cl_arg);
+   }
 #endif
-	else
-		_STARPU_ERROR("invalid worker architecture");
+   else
+      _STARPU_ERROR("invalid worker architecture");
+   /**/
 	_starpu_omp_unregister_task_handles(task);
 	_starpu_spin_lock(&task->lock);
 	task->state = starpu_omp_task_state_terminated;
@@ -1624,8 +1629,20 @@ void starpu_omp_task_region(const struct starpu_omp_task_region_attr *attr)
 		{
 			generated_task->flags |= STARPU_OMP_TASK_FLAGS_UNDEFERRED;
 		}
-		generated_task->task_group = generating_task->task_group;
-		generated_task->rank = -1;
+      // XXX taskgroup exist
+      if (!attr->nogroup_clause)
+      {
+         generated_task->task_group = generating_task->task_group;
+      }
+      generated_task->rank = -1;
+
+      /* XXX taskloop attributes */
+      generated_task->is_loop = attr->is_loop;
+      generated_task->nb_iterations = attr->nb_iterations;
+      generated_task->grainsize = attr->grainsize;
+      generated_task->chunk = attr->chunk;
+      generated_task->begin_i = attr->begin_i;
+      generated_task->end_i = attr->end_i;
 
 		/*
 		 * save pointer to the regions user function from the task region codelet
@@ -1794,6 +1811,45 @@ void starpu_omp_taskgroup_inline_end(void)
 	free(p_task_group);
 }
 
+// XXX on work
+void starpu_omp_taskloop_inline_begin(struct starpu_omp_task_region_attr *attr)
+{
+   if (!attr->nogroup_clause)
+   {
+      starpu_omp_taskgroup_inline_begin();
+   }
+
+   int nb_subloop;
+   if (attr->num_tasks) {
+      nb_subloop = attr->num_tasks;
+   } else if (attr->grainsize) {
+      nb_subloop = attr->nb_iterations / attr->grainsize;
+   } else {
+      nb_subloop = 4;
+   }
+
+   attr->is_loop = 1;
+
+   int i;
+   int nb_iter_i = attr->nb_iterations / nb_subloop;
+   for (i = 0; i < nb_subloop; i++)
+   {
+      attr->begin_i = nb_iter_i * i;
+      attr->end_i = attr->begin_i + nb_iter_i;
+      attr->end_i += (i+1 != nb_subloop) ? 0 : (attr->nb_iterations % nb_subloop);
+      attr->chunk = attr->end_i - attr->begin_i;
+      starpu_omp_task_region(attr);
+   }
+}
+
+// XXX on work
+void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_attr *attr)
+{
+   if (!attr->nogroup_clause) {
+      starpu_omp_taskgroup_inline_end();
+   }
+}
+
 static inline void _starpu_omp_for_loop(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
 		struct starpu_omp_loop *loop, int first_call,
 		unsigned long long nb_iterations, unsigned long long chunk, int schedule, int ordered, unsigned long long *_first_i, unsigned long long *_nb_i)

+ 10 - 0
src/util/openmp_runtime_support.h

@@ -266,6 +266,16 @@ LIST_TYPE(starpu_omp_task,
 	int stack_vg_id;
 
 	size_t stacksize;
+
+   /*
+    * taskloop attribute
+    * */
+   int is_loop;
+   unsigned long long nb_iterations;
+   unsigned long long grainsize;
+   unsigned long long chunk;
+   unsigned long long begin_i;
+   unsigned long long end_i;
 )
 
 LIST_TYPE(starpu_omp_thread,

+ 4 - 0
tests/Makefile.am

@@ -201,6 +201,7 @@ myPROGRAMS +=					\
 	openmp/task_01				\
 	openmp/task_02				\
 	openmp/task_03				\
+	openmp/taskloop				\
 	openmp/taskwait_01			\
 	openmp/taskgroup_01			\
 	openmp/taskgroup_02			\
@@ -736,6 +737,9 @@ openmp_task_02_SOURCES = 	\
 openmp_task_03_SOURCES = 	\
 	openmp/task_03.c
 
+openmp_taskloop_SOURCES = 	\
+	openmp/taskloop.c
+
 openmp_taskwait_01_SOURCES = 	\
 	openmp/taskwait_01.c
 

+ 70 - 0
tests/openmp/taskloop.c

@@ -0,0 +1,70 @@
+#include <pthread.h>
+#include <starpu.h>
+#include <stdio.h>
+
+/*
+ * Check the OpenMP orphaned task support.
+ */
+
+#if !defined(STARPU_OPENMP)
+int main(void)
+{
+   return STARPU_TEST_SKIPPED;
+}
+#else
+__attribute__((constructor))
+static void omp_constructor(void)
+{
+   int ret = starpu_omp_init();
+   STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
+}
+
+__attribute__((destructor))
+static void omp_destructor(void)
+{
+   starpu_omp_shutdown();
+}
+
+void taskloop_callback(unsigned long long begin_i, unsigned long long end_i) {
+   int worker_id;
+   pthread_t tid;
+   tid = pthread_self();
+   worker_id = starpu_worker_get_id();
+   printf ("begin = %llu , end = %llu, %p\n", begin_i, end_i, (void *)starpu_task_get_current());
+}
+
+void taskloop_callback_wrapper(void *buffers[], void *_args)
+{
+   (void) buffers;
+   struct starpu_omp_task_region_attr * args = _args;
+   taskloop_callback(args->begin_i, args->end_i);
+}
+
+int
+main (void)
+{
+   struct starpu_omp_task_region_attr attr;
+   memset(&attr, 0, sizeof(attr));
+#ifdef STARPU_SIMGRID
+   attr.cl.model         = &starpu_perfmodel_nop;
+#endif
+   attr.cl.flags         = STARPU_CODELET_SIMGRID_EXECUTE;
+   attr.cl.cpu_funcs[0]  = taskloop_callback_wrapper;
+   attr.cl_arg           = &attr;
+   attr.cl.where         = STARPU_CPU;
+   attr.if_clause        = 1;
+   attr.final_clause     = 0;
+   attr.untied_clause    = 1;
+   attr.mergeable_clause = 0;
+   attr.nogroup_clause   = 0;
+   attr.is_loop          = 0;
+   attr.collapse         = 0;
+   attr.num_tasks        = 5;
+   attr.nb_iterations    = 400;
+   attr.grainsize        = 130;
+
+   starpu_omp_taskloop_inline_begin(&attr);
+   starpu_omp_taskloop_inline_end(&attr);
+   return 0;
+}
+#endif