瀏覽代碼

- modify the way to unregister OpenMP tasks' data
- make the check for blocking calls within tasks more permissive in the case of OpenMP tasks
- add an example of "CUDA" OpenMP task
- silence a few more cudaErrorCudartUnloading errors caused by starpu_omp_shutdown() being called from a destructor routine

We used to employ the asynchronous starpu_data_unregister_submit() to unregister data from OpenMP tasks.
However:
1) This function does not bring data back to main memory if it was on a remote memory node.
2) Asynchronism is actually not wanted in that case: The OpenMP tasking model naturally leads to nested data registration. Submitting an unregistration request asynchronously on nested data may lead a mother task to complete and release its own data dependencies, while the asynchronous unregistration request on nested data has not yet been performed, leading to consistency issues.
Therefore, we now perform a synchronous starpu_data_unregister on OpenMP data instead. This is normally forbidden within StarPU tasks. However this is fine in that case, because we perform OpenMP data unregistration upon OpenMP regions completion, when all nested tasks accessing the finishing region's registered data must themselves be finished already.

Olivier Aumage 10 年之前
父節點
當前提交
793eba8411

+ 10 - 1
src/core/errorcheck.c

@@ -53,6 +53,15 @@ enum _starpu_worker_status _starpu_get_local_worker_status(void)
 unsigned _starpu_worker_may_perform_blocking_calls(void)
 {
 	enum _starpu_worker_status st = _starpu_get_local_worker_status();
+#ifdef STARPU_OPENMP
+	/* When the current task is an OpenMP task, we may need to block,
+	 * especially when unregistering data used by child tasks. However,
+	 * we don't want to blindly disable the check for non OpenMP tasks. */
+	const struct starpu_task * const task = starpu_task_get_current();
+	const int blocking_call_check_override = task && task->omp_task;
+#else /* STARPU_OPENMP */
+	const int blocking_call_check_override = 0;
+#endif /* STARPU_OPENMP */
 
-	return ( !(st == STATUS_CALLBACK) && !(st == STATUS_EXECUTING));
+	return blocking_call_check_override || ( !(st == STATUS_CALLBACK) && !(st == STATUS_EXECUTING));
 }

+ 2 - 2
src/datawizard/interfaces/data_interface.c

@@ -103,7 +103,7 @@ void _starpu_omp_unregister_region_handles(struct starpu_omp_region *region)
 	{
 		entry->handle->removed_from_context_hash = 1;
 		HASH_DEL(region->registered_handles, entry);
-		starpu_data_unregister_submit(entry->handle);
+		starpu_data_unregister(entry->handle);
 		free(entry);
 	}
 	_starpu_spin_unlock(&region->registered_handles_lock);
@@ -116,7 +116,7 @@ void _starpu_omp_unregister_task_handles(struct starpu_omp_task *task)
 	{
 		entry->handle->removed_from_context_hash = 1;
 		HASH_DEL(task->registered_handles, entry);
-		starpu_data_unregister_submit(entry->handle);
+		starpu_data_unregister(entry->handle);
 		free(entry);
 	}
 }

+ 11 - 1
src/datawizard/malloc.c

@@ -522,7 +522,17 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
 #endif
 			err = cudaFree((void*)addr);
-			if (STARPU_UNLIKELY(err != cudaSuccess))
+			if (STARPU_UNLIKELY(err != cudaSuccess
+#ifdef STARPU_OPENMP
+		/* When StarPU is used as OpenMP runtime support,
+		 * starpu_omp_shutdown() will usually be called from a
+		 * destructor, in which case cudaThreadExit() reports a
+		 * cudaErrorCudartUnloading here. There should not
+		 * be any remaining tasks running at this point so
+		 * we can probably ignore it without much consequences. */
+		&& err != cudaErrorCudartUnloading
+#endif /* STARPU_OPENMP */
+						))
 				STARPU_CUDA_REPORT_ERROR(err);
 #endif
 			break;

+ 11 - 1
src/drivers/cuda/driver_cuda.c

@@ -218,7 +218,17 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 #if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 done:
 #endif
-	if (STARPU_UNLIKELY(cures))
+	if (STARPU_UNLIKELY(cures
+#ifdef STARPU_OPENMP
+		/* When StarPU is used as OpenMP runtime support,
+		 * starpu_omp_shutdown() will usually be called from a
+		 * destructor, in which case cudaThreadExit() reports a
+		 * cudaErrorCudartUnloading here. There should not
+		 * be any remaining tasks running at this point so
+		 * we can probably ignore it without much consequences. */
+		&& cures != cudaErrorCudartUnloading
+#endif /* STARPU_OPENMP */
+				))
 		STARPU_CUDA_REPORT_ERROR(cures);
 #endif
 }

+ 4 - 0
tests/Makefile.am

@@ -257,6 +257,7 @@ noinst_PROGRAMS =				\
 	openmp/taskgroup_01			\
 	openmp/taskgroup_02			\
 	openmp/array_slice_01			\
+	openmp/cuda_task_01			\
 	overlap/overlap				\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
@@ -575,6 +576,9 @@ openmp_taskgroup_02_SOURCES = 	\
 openmp_array_slice_01_SOURCES = 	\
 	openmp/array_slice_01.c
 
+openmp_cuda_task_01_SOURCES = 	\
+	openmp/cuda_task_01.c
+
 ###################
 # Block interface #
 ###################

+ 186 - 0
tests/openmp/cuda_task_01.c

@@ -0,0 +1,186 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+#include <stdio.h>
+
+#if !defined(STARPU_OPENMP) || !defined(STARPU_USE_CUDA)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+#define	NX	64
+int global_vector_1[NX];
+int global_vector_2[NX];
+
__attribute__((constructor))
static void omp_constructor(void)
{
	/* Bring the StarPU OpenMP runtime up before main() starts, mimicking
	 * an OpenMP compiler-generated program prologue. */
	const int ret = starpu_omp_init();
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_omp_init");
}
+
__attribute__((destructor))
static void omp_destructor(void)
{
	/* Tear the StarPU OpenMP runtime down after main() returns; this is
	 * the destructor-time shutdown path exercised by this test. */
	starpu_omp_shutdown();
}
+
+void task_region_g(void *buffers[], void *args)
+{
+	struct starpu_vector_interface *_vector_1 = buffers[0];
+	int nx1 = STARPU_VECTOR_GET_NX(_vector_1);
+	int *v1 = (int *)STARPU_VECTOR_GET_PTR(_vector_1);
+
+	struct starpu_vector_interface *_vector_2 = buffers[1];
+	int nx2 = STARPU_VECTOR_GET_NX(_vector_2);
+	int *v2 = (int *)STARPU_VECTOR_GET_PTR(_vector_2);
+
+	int f = (int)(intptr_t)args;
+	
+	STARPU_ASSERT(nx1 == nx2);
+
+	printf("depth 1 task, entry: vector_1 ptr = %p\n", v1);
+	printf("depth 1 task, entry: vector_2 ptr = %p\n", v2);
+	printf("depth 1 task, entry: f = %d\n", f);
+
+	fprintf(stderr, "cudaMemcpy: -->\n");
+	cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice);
+	fprintf(stderr, "cudaMemcpy: <--\n");
+}
+
+void master_g1(void *arg)
+{
+	(void)arg;
+	{
+		starpu_data_handle_t region_vector_handle;
+		int i;
+
+		printf("master_g1: vector ptr = %p\n", global_vector_1);
+		for (i = 0; i < NX; i++)
+		{
+			global_vector_1[i] = 1;
+		}
+
+		starpu_vector_data_register(&region_vector_handle, STARPU_MAIN_RAM, (uintptr_t)global_vector_1, NX, sizeof(global_vector_1[0]));
+		printf("master_g1: region_vector_handle = %p\n", region_vector_handle);
+	}
+	{
+		starpu_data_handle_t region_vector_handle;
+		int i;
+
+		printf("master_g1: vector ptr = %p\n", global_vector_2);
+		for (i = 0; i < NX; i++)
+		{
+			global_vector_2[i] = 0;
+		}
+
+		starpu_vector_data_register(&region_vector_handle, STARPU_MAIN_RAM, (uintptr_t)global_vector_2, NX, sizeof(global_vector_2[0]));
+		printf("master_g1: region_vector_handle = %p\n", region_vector_handle);
+	}
+}
+
+void master_g2(void *arg)
+{
+	(void)arg;
+	starpu_data_handle_t region_vector_handles[2];
+	struct starpu_omp_task_region_attr attr;
+	int i;
+
+	region_vector_handles[0] = starpu_data_lookup(global_vector_1);
+	printf("master_g2: region_vector_handles[0] = %p\n", region_vector_handles[0]);
+	region_vector_handles[1] = starpu_data_lookup(global_vector_2);
+	printf("master_g2: region_vector_handles[1] = %p\n", region_vector_handles[1]);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0]  = NULL;
+	attr.cl.cuda_funcs[0]  = task_region_g;
+	attr.cl.where         = STARPU_CUDA;
+	attr.cl.nbuffers      = 2;
+	attr.cl.modes[0]      = STARPU_R;
+	attr.cl.modes[1]      = STARPU_W;
+	attr.handles          = region_vector_handles;
+	attr.cl_arg_size      = sizeof(void *);
+	attr.cl_arg_free      = 0;
+	attr.if_clause        = 1;
+	attr.final_clause     = 0;
+	attr.untied_clause    = 1;
+	attr.mergeable_clause = 0;
+
+	i = 0;
+
+	attr.cl_arg = (void *)(intptr_t)i;
+	starpu_omp_task_region(&attr);
+}
+
+void parallel_region_f(void *buffers[], void *args)
+{
+	(void)buffers;
+	(void)args;
+	starpu_omp_master(master_g1, NULL);
+	starpu_omp_barrier();
+	{
+		starpu_data_handle_t region_vector_handle_1;
+		region_vector_handle_1 = starpu_data_lookup(global_vector_1);
+		printf("parallel_region block 1: region_vector_handle_1 = %p\n", region_vector_handle_1);
+	}
+	{
+		starpu_data_handle_t region_vector_handle_2;
+		region_vector_handle_2 = starpu_data_lookup(global_vector_2);
+		printf("parallel_region block 1: region_vector_handle_2 = %p\n", region_vector_handle_2);
+	}
+	starpu_omp_barrier();
+	starpu_omp_master(master_g2, NULL);
+	starpu_omp_barrier();
+	{
+		starpu_data_handle_t region_vector_handle_1;
+		region_vector_handle_1 = starpu_data_lookup(global_vector_1);
+		printf("parallel_region block 2: region_vector_handle_1 = %p\n", region_vector_handle_1);
+	}
+	{
+		starpu_data_handle_t region_vector_handle_2;
+		region_vector_handle_2 = starpu_data_lookup(global_vector_2);
+		printf("parallel_region block 2: region_vector_handle_2 = %p\n", region_vector_handle_2);
+	}
+}
+
+int
+main (int argc, char *argv[]) {
+	(void)argc;
+	(void)argv;
+	struct starpu_omp_parallel_region_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.cl.cpu_funcs[0] = parallel_region_f;
+	attr.cl.where        = STARPU_CPU;
+	attr.if_clause       = 1;
+	starpu_omp_parallel_region(&attr);
+
+	int i;
+	for (i = 0; i < NX; i++)
+	{
+		if (global_vector_1[i] != global_vector_2[i])
+		{
+			fprintf(stderr, "check failed: global_vector_1[%d] = %d, global_vector_2[%d] = %d\n", i, global_vector_1[i], i, global_vector_2[i]);
+			return EXIT_FAILURE;
+		}
+	}
+	return 0;
+}
+#endif