瀏覽代碼

- modify the way to unregister OpenMP tasks' data
- make the check for blocking calls within tasks more permissive in the case of OpenMP tasks
- add an example of "CUDA" OpenMP task
- silence a few more cudaErrorCudartUnloading errors caused by starpu_omp_shutdown() being called from a destructor routine

We used to employ the asynchronous starpu_data_unregister_submit() to unregister data from OpenMP tasks.
However:
1) This function does not bring data back to main memory if it was on a remote memory node.
2) Asynchronism is actually not wanted in that case: The OpenMP tasking model naturally leads to nested data registration. Submitting an unregistration request asynchronously on nested data may lead a mother task to complete and release its own data dependencies, while the asynchronous unregistration request on nested data has not yet been performed, leading to consistency issues.
Therefore, we now perform a synchronous starpu_data_unregister on OpenMP data instead. This is normally forbidden within StarPU tasks. However this is fine in that case, because we perform OpenMP data unregistration upon OpenMP regions completion, when all nested tasks accessing the finishing region's registered data must themselves be finished already.

Olivier Aumage 10 年之前
父節點
當前提交
793eba8411

+ 10 - 1
src/core/errorcheck.c

@@ -53,6 +53,15 @@ enum _starpu_worker_status _starpu_get_local_worker_status(void)
 unsigned _starpu_worker_may_perform_blocking_calls(void)
 {
 	enum _starpu_worker_status st = _starpu_get_local_worker_status();
+#ifdef STARPU_OPENMP
+	/* When the current task is an OpenMP task, we may need to block,
+	 * especially when unregistering data used by child tasks. However,
+	 * we don't want to blindly disable the check for non OpenMP tasks. */
+	const struct starpu_task * const task = starpu_task_get_current();
+	const int blocking_call_check_override = task && task->omp_task;
+#else /* STARPU_OPENMP */
+	const int blocking_call_check_override = 0;
+#endif /* STARPU_OPENMP */
 
-	return ( !(st == STATUS_CALLBACK) && !(st == STATUS_EXECUTING));
+	return blocking_call_check_override || ( !(st == STATUS_CALLBACK) && !(st == STATUS_EXECUTING));
 }

+ 2 - 2
src/datawizard/interfaces/data_interface.c

@@ -103,7 +103,7 @@ void _starpu_omp_unregister_region_handles(struct starpu_omp_region *region)
 	{
 		entry->handle->removed_from_context_hash = 1;
 		HASH_DEL(region->registered_handles, entry);
-		starpu_data_unregister_submit(entry->handle);
+		starpu_data_unregister(entry->handle);
 		free(entry);
 	}
 	_starpu_spin_unlock(&region->registered_handles_lock);
@@ -116,7 +116,7 @@ void _starpu_omp_unregister_task_handles(struct starpu_omp_task *task)
 	{
 		entry->handle->removed_from_context_hash = 1;
 		HASH_DEL(task->registered_handles, entry);
-		starpu_data_unregister_submit(entry->handle);
+		starpu_data_unregister(entry->handle);
 		free(entry);
 	}
 }

+ 11 - 1
src/datawizard/malloc.c

@@ -522,7 +522,17 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
 #endif
 			err = cudaFree((void*)addr);
-			if (STARPU_UNLIKELY(err != cudaSuccess))
+			if (STARPU_UNLIKELY(err != cudaSuccess
+#ifdef STARPU_OPENMP
+		/* When StarPU is used as OpenMP runtime support,
+		 * starpu_omp_shutdown() will usually be called from a
+		 * destructor, in which case cudaThreadExit() reports a
+		 * cudaErrorCudartUnloading here. There should not
+		 * be any remaining tasks running at this point so
+		 * we can probably ignore it without much consequences. */
+		&& err != cudaErrorCudartUnloading
+#endif /* STARPU_OPENMP */
+						))
 				STARPU_CUDA_REPORT_ERROR(err);
 #endif
 			break;

+ 11 - 1
src/drivers/cuda/driver_cuda.c

@@ -218,7 +218,17 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 #if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 done:
 #endif
-	if (STARPU_UNLIKELY(cures))
+	if (STARPU_UNLIKELY(cures
+#ifdef STARPU_OPENMP
+		/* When StarPU is used as OpenMP runtime support,
+		 * starpu_omp_shutdown() will usually be called from a
+		 * destructor, in which case cudaThreadExit() reports a
+		 * cudaErrorCudartUnloading here. There should not
+		 * be any remaining tasks running at this point so
+		 * we can probably ignore it without much consequences. */
+		&& cures != cudaErrorCudartUnloading
+#endif /* STARPU_OPENMP */
+				))
 		STARPU_CUDA_REPORT_ERROR(cures);
 #endif
 }

+ 4 - 0
tests/Makefile.am

@@ -257,6 +257,7 @@ noinst_PROGRAMS =				\
 	openmp/taskgroup_01			\
 	openmp/taskgroup_02			\
 	openmp/array_slice_01			\
+	openmp/cuda_task_01			\
 	overlap/overlap				\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
@@ -575,6 +576,9 @@ openmp_taskgroup_02_SOURCES = 	\
 openmp_array_slice_01_SOURCES = 	\
 	openmp/array_slice_01.c
 
+openmp_cuda_task_01_SOURCES = 	\
+	openmp/cuda_task_01.c
+
 ###################
 # Block interface #
 ###################

+ 186 - 0
tests/openmp/cuda_task_01.c

@@ -0,0 +1,186 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP) || !defined(STARPU_USE_CUDA)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+#define	NX	64
+int global_vector_1[NX];
+int global_vector_2[NX];
+
__attribute__((constructor))
static void omp_constructor(void)
{
	/* Bring the StarPU OpenMP runtime up before main() starts, mimicking
	 * an OpenMP compiler-generated program prologue. */
	const int ret = starpu_omp_init();
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
}
+
__attribute__((destructor))
static void omp_destructor(void)
{
	/* Tear the StarPU OpenMP runtime down after main() returns; this is
	 * the destructor-time shutdown path exercised by this test. */
	starpu_omp_shutdown();
}
+
+void task_region_g(void *buffers[], void *args)
+{
+	struct starpu_vector_interface *_vector_1 = buffers[0];
+	int nx1 = STARPU_VECTOR_GET_NX(_vector_1);
+	int *v1 = (int *)STARPU_VECTOR_GET_PTR(_vector_1);
+
+	struct starpu_vector_interface *_vector_2 = buffers[1];
+	int nx2 = STARPU_VECTOR_GET_NX(_vector_2);
+	int *v2 = (int *)STARPU_VECTOR_GET_PTR(_vector_2);
+
+	int f = (int)(intptr_t)args;
+	
+	STARPU_ASSERT(nx1 == nx2);
+
+	printf("depth 1 task, entry: vector_1 ptr = %p\n", v1);
+	printf("depth 1 task, entry: vector_2 ptr = %p\n", v2);
+	printf("depth 1 task, entry: f = %d\n", f);
+
+	fprintf(stderr, "cudaMemcpy: -->\n");
+	cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice);
+	fprintf(stderr, "cudaMemcpy: <--\n");
+}
+
+void master_g1(void *arg)
+{
+	(void)arg;
+	{
+		starpu_data_handle_t region_vector_handle;
+		int i;
+
+		printf("master_g1: vector ptr = %p\n", global_vector_1);
+		for (i = 0; i < NX; i++)
+		{
+			global_vector_1[i] = 1;
+		}
+
+		starpu_vector_data_register(&region_vector_handle, STARPU_MAIN_RAM, (uintptr_t)global_vector_1, NX, sizeof(global_vector_1[0]));
+		printf("master_g1: region_vector_handle = %p\n", region_vector_handle);
+	}
+	{
+		starpu_data_handle_t region_vector_handle;
+		int i;
+
+		printf("master_g1: vector ptr = %p\n", global_vector_2);
+		for (i = 0; i < NX; i++)
+		{
+			global_vector_2[i] = 0;
+		}
+
+		starpu_vector_data_register(&region_vector_handle, STARPU_MAIN_RAM, (uintptr_t)global_vector_2, NX, sizeof(global_vector_2[0]));
+		printf("master_g1: region_vector_handle = %p\n", region_vector_handle);
+	}
+}
+
+void master_g2(void *arg)
+{
+	(void)arg;
+	starpu_data_handle_t region_vector_handles[2];
+	struct starpu_omp_task_region_attr attr;
+	int i;
+
+	region_vector_handles[0] = starpu_data_lookup(global_vector_1);
+	printf("master_g2: region_vector_handles[0] = %p\n", region_vector_handles[0]);
+	region_vector_handles[1] = starpu_data_lookup(global_vector_2);
+	printf("master_g2: region_vector_handles[1] = %p\n", region_vector_handles[1]);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = NULL;
+	attr.cl.cuda_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CUDA;
+	attr.cl.nbuffers      = 2;
+	attr.cl.modes[0]      = STARPU_R;
+	attr.cl.modes[1]      = STARPU_W;
+	attr.handles          = region_vector_handles;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	i = 0;
+
+	attr.cl_arg = (void *)(intptr_t)i;
+	starpu_omp_task_region(&attr);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void)buffers;
+	(void)args;
+	starpu_omp_master(master_g1, NULL);
+	starpu_omp_barrier();
+	{
+		starpu_data_handle_t region_vector_handle_1;
+		region_vector_handle_1 = starpu_data_lookup(global_vector_1);
+		printf("parallel_region block 1: region_vector_handle_1 = %p\n", region_vector_handle_1);
+	}
+	{
+		starpu_data_handle_t region_vector_handle_2;
+		region_vector_handle_2 = starpu_data_lookup(global_vector_2);
+		printf("parallel_region block 1: region_vector_handle_2 = %p\n", region_vector_handle_2);
+	}
+	starpu_omp_barrier();
+	starpu_omp_master(master_g2, NULL);
+	starpu_omp_barrier();
+	{
+		starpu_data_handle_t region_vector_handle_1;
+		region_vector_handle_1 = starpu_data_lookup(global_vector_1);
+		printf("parallel_region block 2: region_vector_handle_1 = %p\n", region_vector_handle_1);
+	}
+	{
+		starpu_data_handle_t region_vector_handle_2;
+		region_vector_handle_2 = starpu_data_lookup(global_vector_2);
+		printf("parallel_region block 2: region_vector_handle_2 = %p\n", region_vector_handle_2);
+	}
+}
+
+int
+main (int argc, char *argv[]) {
+	(void)argc;
+	(void)argv;
+	struct starpu_omp_parallel_region_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+
+	int i;
+	for (i = 0; i < NX; i++)
+	{
+		if (global_vector_1[i] != global_vector_2[i])
+		{
+			fprintf(stderr, "check failed: global_vector_1[%d] = %d, global_vector_2[%d] = %d\n", i, global_vector_1[i], i, global_vector_2[i]);
+			return EXIT_FAILURE;
+		}
+	}
+	return 0;
+}
+#endif