Przeglądaj źródła

Use different streams for gpu-gpu transfers

Samuel Thibault 11 lat temu
rodzic
commit
b4976f3d00

+ 2 - 8
src/core/simgrid.c

@@ -188,14 +188,8 @@ static int transfers_are_sequential(struct transfer *new_transfer, struct transf
 			&& new_transfer->dst_node == old_transfer->dst_node)
 		return 1;
 
-	/* These constraints come from StarPU */
-
-	/* StarPU uses one stream per direction */
-	/* RAM->GPU and GPU->RAM are already handled by "same source/destination" */
-
-	/* StarPU uses one stream per running GPU for GPU-GPU transfers */
-	if (new_is_gpu_gpu && old_is_gpu_gpu && new_transfer->run_node == old_transfer->run_node)
-		return 1;
+	/* StarPU's constraint on CUDA transfers is using one stream per
+	 * source/destination pair, which is already handled above */
 
 	return 0;
 }

+ 7 - 7
src/datawizard/copy_driver.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -163,7 +163,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_out_transfer_stream();
+			stream = starpu_cuda_get_out_transfer_stream(src_node);
 			if (copy_methods->cuda_to_ram_async)
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -199,7 +199,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_in_transfer_stream();
+			stream = starpu_cuda_get_in_transfer_stream(dst_node);
 			if (copy_methods->ram_to_cuda_async)
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -231,7 +231,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_peer_transfer_stream();
+			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
 			if (copy_methods->cuda_to_cuda_async)
 				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -531,7 +531,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
 				cudaMemcpyDeviceToHost);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -539,7 +539,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
 				cudaMemcpyHostToDevice);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
@@ -547,7 +547,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
 				cudaMemcpyDeviceToDevice);
 
 #endif

+ 28 - 20
src/drivers/cuda/driver_cuda.c

@@ -42,9 +42,9 @@ static int ncudagpus;
 static size_t global_mem[STARPU_NMAXWORKERS];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
-static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
-static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
-static cudaStream_t peer_transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
+static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
+static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 #endif /* STARPU_USE_CUDA */
 
@@ -113,25 +113,26 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 }
 
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
 {
-	int worker = starpu_worker_get_id();
+	int devid = _starpu_memory_node_get_devid(node);
 
-	return in_transfer_streams[worker];
+	return in_transfer_streams[devid];
 }
 
-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
 {
-	int worker = starpu_worker_get_id();
+	int devid = _starpu_memory_node_get_devid(node);
 
-	return out_transfer_streams[worker];
+	return out_transfer_streams[devid];
 }
 
-cudaStream_t starpu_cuda_get_local_peer_transfer_stream(void)
+cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
 {
-	int worker = starpu_worker_get_id();
+	int src_devid = _starpu_memory_node_get_devid(src_node);
+	int dst_devid = _starpu_memory_node_get_devid(dst_node);
 
-	return peer_transfer_streams[worker];
+	return peer_transfer_streams[src_devid][dst_devid];
 }
 
 cudaStream_t starpu_cuda_get_local_stream(void)
@@ -196,6 +197,7 @@ static void init_context(unsigned devid)
 {
 	cudaError_t cures;
 	int workerid;
+	int i;
 
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 
@@ -252,27 +254,33 @@ static void init_context(unsigned devid)
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&in_transfer_streams[workerid]);
+	cures = cudaStreamCreate(&in_transfer_streams[devid]);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&out_transfer_streams[workerid]);
+	cures = cudaStreamCreate(&out_transfer_streams[devid]);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&peer_transfer_streams[workerid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	for (i = 0; i < ncudagpus; i++)
+	{
+		cures = cudaStreamCreate(&peer_transfer_streams[i][devid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
 }
 
 static void deinit_context(int workerid)
 {
 	cudaError_t cures;
+	int devid = starpu_worker_get_devid(workerid);
+	int i;
 
 	cudaStreamDestroy(streams[workerid]);
-	cudaStreamDestroy(in_transfer_streams[workerid]);
-	cudaStreamDestroy(out_transfer_streams[workerid]);
-	cudaStreamDestroy(peer_transfer_streams[workerid]);
+	cudaStreamDestroy(in_transfer_streams[devid]);
+	cudaStreamDestroy(out_transfer_streams[devid]);
+	for (i = 0; i < ncudagpus; i++)
+		cudaStreamDestroy(peer_transfer_streams[i][devid]);
 
 	/* cleanup the runtime API internal stuffs (which CUBLAS is using) */
 	cures = cudaThreadExit();

+ 4 - 4
src/drivers/cuda/driver_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -48,9 +48,9 @@ void *_starpu_cuda_worker(void *);
 #  define _starpu_cuda_discover_devices(config) ((void) config)
 #endif
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
-cudaStream_t starpu_cuda_get_local_peer_transfer_stream(void);
+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
+cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
 int _starpu_run_cuda(struct starpu_driver *);
 int _starpu_cuda_driver_init(struct starpu_driver *);