瀏覽代碼

- merge trunk

Olivier Aumage 11 年之前
父節點
當前提交
a276de8730

+ 1 - 1
ChangeLog

@@ -43,7 +43,7 @@ New features:
   * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
   * New functions starpu_pause() and starpu_resume()
   * New codelet specific_nodes field to specify explicit target nodes for data.
-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+  * Use streams for all CUDA transfers, even initiated by CPUs.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
   * Add paje traces statistics tools.

+ 1 - 0
configure.ac

@@ -1369,6 +1369,7 @@ AC_MSG_RESULT($enable_debug)
 AC_ARG_ENABLE(spinlock_check, [AS_HELP_STRING([--enable-spinlock-check], [enable spinlock check])], enable_spinlock_check=$enableval, enable_spinlock_check=no)
 
 if test x$enable_debug = xyes; then
+	AC_DEFINE(STARPU_DEBUG, [1], [enable debugging statements])
 	CFLAGS="$CFLAGS -O0"
 	enable_spinlock_check=yes
 else

+ 1 - 1
include/starpu_tree.h

@@ -41,7 +41,7 @@ struct starpu_tree *starpu_tree_get(struct starpu_tree *tree, int id);
 
 struct starpu_tree *starpu_tree_get_neighbour(struct starpu_tree *tree, struct starpu_tree *node, int *visited, int *present);
 
-int starpu_tree_free(struct starpu_tree *tree);
+void starpu_tree_free(struct starpu_tree *tree);
 
 #ifdef __cplusplus
 }

+ 7 - 10
src/core/tree.c

@@ -138,19 +138,16 @@ struct starpu_tree* starpu_tree_get_neighbour(struct starpu_tree *tree, struct s
 	return starpu_tree_get_neighbour(tree, father, visited, present);
 }
 
-int starpu_tree_free(struct starpu_tree *tree)
+void starpu_tree_free(struct starpu_tree *tree)
 {
-	if(tree->arity == 0)
-		return 1;
 	int i;
 	for(i = 0; i < tree->arity; i++)
 	{
-		if(starpu_tree_free(tree->nodes[i]))
-		{
-			free(tree->nodes);
-			tree->arity = 0;
-			return 1;
-		}
+		starpu_tree_free(tree->nodes[i]);
+		free(tree->nodes[i]);
+		tree->nodes[i] = NULL;
 	}
-	return 0;
+	free(tree->nodes);
+	tree->nodes = NULL;
+	tree->arity = 0;
 }

+ 2 - 6
src/core/workers.c

@@ -900,11 +900,7 @@ static void _starpu_build_tree(void)
 	struct starpu_tree* tree = (struct starpu_tree*)malloc(sizeof(struct starpu_tree));
 	config.topology.tree = tree;
 
-	hwloc_topology_t topology;
-	hwloc_topology_init(&topology);
-	hwloc_topology_load(topology);
-
-	hwloc_obj_t root = hwloc_get_root_obj(topology);
+	hwloc_obj_t root = hwloc_get_root_obj(config.topology.hwtopology);
 
 /* 	char string[128]; */
 /* 	hwloc_obj_snprintf(string, sizeof(string), topology, root, "#", 0); */
@@ -912,7 +908,7 @@ static void _starpu_build_tree(void)
 
 	/* level, is_pu, is in the tree (it will be true only after add*/
 	starpu_tree_insert(tree, root->logical_index, 0,root->type == HWLOC_OBJ_PU, root->arity, NULL);
-	_fill_tree(tree, root, 1, topology);
+	_fill_tree(tree, root, 1, config.topology.hwtopology);
 #endif
 }
 

+ 1 - 7
src/datawizard/coherency.c

@@ -194,13 +194,7 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 			/* GPUs not always allow direct remote access: if CUDA4
 			 * is enabled, we allow two CUDA devices to communicate. */
-			return
-#if 0
-				/* CUDA does not seem very safe with concurrent
-				 * transfer queueing, avoid queueing from CPUs */
-				kind == STARPU_CPU_RAM ||
-#endif
-				kind == STARPU_CUDA_RAM;
+			return kind == STARPU_CPU_RAM || kind == STARPU_CUDA_RAM;
 		}
 #else
 			/* Direct GPU-GPU transfers are not allowed in general */

+ 4 - 4
src/datawizard/copy_driver.c

@@ -163,7 +163,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_out_transfer_stream();
+			stream = starpu_cuda_get_out_transfer_stream(src_node);
 			if (copy_methods->cuda_to_ram_async)
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -199,7 +199,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_in_transfer_stream();
+			stream = starpu_cuda_get_in_transfer_stream(dst_node);
 			if (copy_methods->ram_to_cuda_async)
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -531,7 +531,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
 				cudaMemcpyDeviceToHost);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -539,7 +539,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
 				cudaMemcpyHostToDevice);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):

+ 15 - 12
src/drivers/cuda/driver_cuda.c

@@ -42,8 +42,8 @@ static unsigned ncudagpus;
 static size_t global_mem[STARPU_MAXCUDADEVS];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
-static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
-static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
+static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 static cudaEvent_t task_events[STARPU_NMAXWORKERS];
@@ -116,18 +116,18 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 }
 
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
 {
-	int worker = starpu_worker_get_id();
+	int devid = _starpu_memory_node_get_devid(node);
 
-	return in_transfer_streams[worker];
+	return in_transfer_streams[devid];
 }
 
-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
 {
-	int worker = starpu_worker_get_id();
+	int devid = _starpu_memory_node_get_devid(node);
 
-	return out_transfer_streams[worker];
+	return out_transfer_streams[devid];
 }
 
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
@@ -263,11 +263,11 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
-		cures = cudaStreamCreate(&in_transfer_streams[workerid]);
+		cures = cudaStreamCreate(&in_transfer_streams[devid]);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
-		cures = cudaStreamCreate(&out_transfer_streams[workerid]);
+		cures = cudaStreamCreate(&out_transfer_streams[devid]);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
@@ -293,8 +293,8 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 
 		cudaEventDestroy(task_events[workerid]);
 		cudaStreamDestroy(streams[workerid]);
-		cudaStreamDestroy(in_transfer_streams[workerid]);
-		cudaStreamDestroy(out_transfer_streams[workerid]);
+		cudaStreamDestroy(in_transfer_streams[devid]);
+		cudaStreamDestroy(out_transfer_streams[devid]);
 	}
 
 	for (i = 0; i < ncudagpus; i++)
@@ -588,6 +588,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 #endif
 		/* Synchronous execution */
 		{
+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
+			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
+#endif
 			finish_job_on_cuda(j, args);
 		}
 		_STARPU_TRACE_START_PROGRESS(memnode);

+ 2 - 2
src/drivers/cuda/driver_cuda.h

@@ -48,8 +48,8 @@ void *_starpu_cuda_worker(void *);
 #  define _starpu_cuda_discover_devices(config) ((void) config)
 #endif
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
 struct _starpu_worker_set;