11 years ago · a276de8730
--- a/ChangeLog
+++ b/ChangeLog
@@ -43,7 +43,7 @@ New features:
 
				   * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
			
 
				   * New functions starpu_pause() and starpu_resume()
			
 
				   * New codelet specific_nodes field to specify explicit target nodes for data.
			
 
				-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
			
 
				+  * Use streams for all CUDA transfers, even initiated by CPUs.
			
 
				   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
			
 
				     CUDA and OpenCL kernel execution.
			
 
				   * Add paje traces statistics tools.
			
--- a/configure.ac
+++ b/configure.ac
@@ -1369,6 +1369,7 @@ AC_MSG_RESULT($enable_debug)
 
				 AC_ARG_ENABLE(spinlock_check, [AS_HELP_STRING([--enable-spinlock-check], [enable spinlock check])], enable_spinlock_check=$enableval, enable_spinlock_check=no)
			
 
				 
			
 
				 if test x$enable_debug = xyes; then
			
 
				+	AC_DEFINE(STARPU_DEBUG, [1], [enable debugging statements])
			
 
				 	CFLAGS="$CFLAGS -O0"
			
 
				 	enable_spinlock_check=yes
			
 
				 else
			
--- a/include/starpu_tree.h
+++ b/include/starpu_tree.h
@@ -41,7 +41,7 @@ struct starpu_tree *starpu_tree_get(struct starpu_tree *tree, int id);
 
				 
			
 
				 struct starpu_tree *starpu_tree_get_neighbour(struct starpu_tree *tree, struct starpu_tree *node, int *visited, int *present);
			
 
				 
			
 
				-int starpu_tree_free(struct starpu_tree *tree);
			
 
				+void starpu_tree_free(struct starpu_tree *tree);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/src/core/tree.c
+++ b/src/core/tree.c
@@ -138,19 +138,16 @@ struct starpu_tree* starpu_tree_get_neighbour(struct starpu_tree *tree, struct s
 
				 	return starpu_tree_get_neighbour(tree, father, visited, present);
			
 
				 }
			
 
				 
			
 
				-int starpu_tree_free(struct starpu_tree *tree)
			
 
				+void starpu_tree_free(struct starpu_tree *tree)
			
 
				 {
			
 
				-	if(tree->arity == 0)
			
 
				-		return 1;
			
 
				 	int i;
			
 
				 	for(i = 0; i < tree->arity; i++)
			
 
				 	{
			
 
				-		if(starpu_tree_free(tree->nodes[i]))
			
 
				-		{
			
 
				-			free(tree->nodes);
			
 
				-			tree->arity = 0;
			
 
				-			return 1;
			
 
				-		}
			
 
				+		starpu_tree_free(tree->nodes[i]);
			
 
				+		free(tree->nodes[i]);
			
 
				+		tree->nodes[i] = NULL;
			
 
				 	}
			
 
				-	return 0;
			
 
				+	free(tree->nodes);
			
 
				+	tree->nodes = NULL;
			
 
				+	tree->arity = 0;
			
 
				 }
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -900,11 +900,7 @@ static void _starpu_build_tree(void)
 
				 	struct starpu_tree* tree = (struct starpu_tree*)malloc(sizeof(struct starpu_tree));
			
 
				 	config.topology.tree = tree;
			
 
				 
			
 
				-	hwloc_topology_t topology;
			
 
				-	hwloc_topology_init(&topology);
			
 
				-	hwloc_topology_load(topology);
			
 
				-
			
 
				-	hwloc_obj_t root = hwloc_get_root_obj(topology);
			
 
				+	hwloc_obj_t root = hwloc_get_root_obj(config.topology.hwtopology);
			
 
				 
			
 
				 /* 	char string[128]; */
			
 
				 /* 	hwloc_obj_snprintf(string, sizeof(string), topology, root, "#", 0); */
			
@@ -912,7 +908,7 @@ static void _starpu_build_tree(void)
 
				 
			
 
				 	/* level, is_pu, is in the tree (it will be true only after add*/
			
 
				 	starpu_tree_insert(tree, root->logical_index, 0,root->type == HWLOC_OBJ_PU, root->arity, NULL);
			
 
				-	_fill_tree(tree, root, 1, topology);
			
 
				+	_fill_tree(tree, root, 1, config.topology.hwtopology);
			
 
				 #endif
			
 
				 }
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -194,13 +194,7 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
				 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
			
 
				 			/* GPUs not always allow direct remote access: if CUDA4
			
 
				 			 * is enabled, we allow two CUDA devices to communicate. */
			
 
				-			return
			
 
				-#if 0
			
 
				-				/* CUDA does not seem very safe with concurrent
			
 
				-				 * transfer queueing, avoid queueing from CPUs */
			
 
				-				kind == STARPU_CPU_RAM ||
			
 
				-#endif
			
 
				-				kind == STARPU_CUDA_RAM;
			
 
				+			return kind == STARPU_CPU_RAM || kind == STARPU_CUDA_RAM;
			
 
				 		}
			
 
				 #else
			
 
				 			/* Direct GPU-GPU transfers are not allowed in general */
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -163,7 +163,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-			stream = starpu_cuda_get_local_out_transfer_stream();
			
 
				+			stream = starpu_cuda_get_out_transfer_stream(src_node);
			
 
				 			if (copy_methods->cuda_to_ram_async)
			
 
				 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 			else
			
@@ -199,7 +199,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess))
			
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-			stream = starpu_cuda_get_local_in_transfer_stream();
			
 
				+			stream = starpu_cuda_get_in_transfer_stream(dst_node);
			
 
				 			if (copy_methods->ram_to_cuda_async)
			
 
				 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 			else
			
@@ -531,7 +531,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 
				 				(void*) src + src_offset, src_node,
			
 
				 				(void*) dst + dst_offset, dst_node,
			
 
				 				size,
			
 
				-				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
			
 
				+				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
			
 
				 				cudaMemcpyDeviceToHost);
			
 
				 
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
			
@@ -539,7 +539,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 
				 				(void*) src + src_offset, src_node,
			
 
				 				(void*) dst + dst_offset, dst_node,
			
 
				 				size,
			
 
				-				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
			
 
				+				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
			
 
				 				cudaMemcpyHostToDevice);
			
 
				 
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -42,8 +42,8 @@ static unsigned ncudagpus;
 
				 static size_t global_mem[STARPU_MAXCUDADEVS];
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static cudaStream_t streams[STARPU_NMAXWORKERS];
			
 
				-static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
			
 
				-static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
			
 
				+static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				+static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
			
 
				 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
			
 
				 static cudaEvent_t task_events[STARPU_NMAXWORKERS];
			
@@ -116,18 +116,18 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
			
 
				+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
			
 
				 {
			
 
				-	int worker = starpu_worker_get_id();
			
 
				+	int devid = _starpu_memory_node_get_devid(node);
			
 
				 
			
 
				-	return in_transfer_streams[worker];
			
 
				+	return in_transfer_streams[devid];
			
 
				 }
			
 
				 
			
 
				-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
			
 
				+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
			
 
				 {
			
 
				-	int worker = starpu_worker_get_id();
			
 
				+	int devid = _starpu_memory_node_get_devid(node);
			
 
				 
			
 
				-	return out_transfer_streams[worker];
			
 
				+	return out_transfer_streams[devid];
			
 
				 }
			
 
				 
			
 
				 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
			
@@ -263,11 +263,11 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-		cures = cudaStreamCreate(&in_transfer_streams[workerid]);
			
 
				+		cures = cudaStreamCreate(&in_transfer_streams[devid]);
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-		cures = cudaStreamCreate(&out_transfer_streams[workerid]);
			
 
				+		cures = cudaStreamCreate(&out_transfer_streams[devid]);
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	}
			
@@ -293,8 +293,8 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 
				 
			
 
				 		cudaEventDestroy(task_events[workerid]);
			
 
				 		cudaStreamDestroy(streams[workerid]);
			
 
				-		cudaStreamDestroy(in_transfer_streams[workerid]);
			
 
				-		cudaStreamDestroy(out_transfer_streams[workerid]);
			
 
				+		cudaStreamDestroy(in_transfer_streams[devid]);
			
 
				+		cudaStreamDestroy(out_transfer_streams[devid]);
			
 
				 	}
			
 
				 
			
 
				 	for (i = 0; i < ncudagpus; i++)
			
@@ -588,6 +588,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 #endif
			
 
				 		/* Synchronous execution */
			
 
				 		{
			
 
				+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
			
 
				+			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
			
 
				+#endif
			
 
				 			finish_job_on_cuda(j, args);
			
 
				 		}
			
 
				 		_STARPU_TRACE_START_PROGRESS(memnode);
			
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -48,8 +48,8 @@ void *_starpu_cuda_worker(void *);
 
				 #  define _starpu_cuda_discover_devices(config) ((void) config)
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
			
 
				-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
			
 
				+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
			
 
				+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
			
 
				 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
			
 
				 
			
 
				 struct _starpu_worker_set;