Explorar el Código

port r12759 from 1.1: Make sure we use the local stream, and not the stream of another device. Define separate streams for GPU-GPU transfers, depending on which device emits it

Samuel Thibault hace 11 años
padre
commit
7240b27585
Se han modificado 3 ficheros con 43 adiciones y 16 borrados
  1. 4 4
      src/datawizard/copy_driver.c
  2. 37 10
      src/drivers/cuda/driver_cuda.c
  3. 2 2
      src/drivers/cuda/driver_cuda.h

+ 4 - 4
src/datawizard/copy_driver.c

@@ -163,7 +163,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_out_transfer_stream(src_node);
+			stream = starpu_cuda_get_local_out_transfer_stream();
 			if (copy_methods->cuda_to_ram_async)
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -199,7 +199,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_in_transfer_stream(dst_node);
+			stream = starpu_cuda_get_local_in_transfer_stream();
 			if (copy_methods->ram_to_cuda_async)
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -533,7 +533,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
 				cudaMemcpyDeviceToHost);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -541,7 +541,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
 				cudaMemcpyHostToDevice);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):

+ 37 - 10
src/drivers/cuda/driver_cuda.c

@@ -44,7 +44,10 @@ static size_t global_mem[STARPU_MAXCUDADEVS];
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
-static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
+/* Note: streams are not thread-safe, so we define them for each CUDA worker
+ * emitting a GPU-GPU transfer */
+static cudaStream_t in_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
+static cudaStream_t out_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 static cudaEvent_t task_events[STARPU_NMAXWORKERS];
 #endif /* STARPU_USE_CUDA */
@@ -116,26 +119,44 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 }
 
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
+cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 {
-	int devid = _starpu_memory_node_get_devid(node);
+	int worker = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(worker);
+	cudaStream_t stream;
 
-	return in_transfer_streams[devid];
+	stream = in_transfer_streams[devid];
+	STARPU_ASSERT(stream);
+	return stream;
 }
 
-cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
+cudaStream_t starpu_cuda_get_local_out_transfer_stream()
 {
-	int devid = _starpu_memory_node_get_devid(node);
+	int worker = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(worker);
+	cudaStream_t stream;
 
-	return out_transfer_streams[devid];
+	stream = out_transfer_streams[devid];
+	STARPU_ASSERT(stream);
+	return stream;
 }
 
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
 {
+	int worker = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(worker);
 	int src_devid = _starpu_memory_node_get_devid(src_node);
 	int dst_devid = _starpu_memory_node_get_devid(dst_node);
+	cudaStream_t stream;
 
-	return peer_transfer_streams[src_devid][dst_devid];
+	STARPU_ASSERT(devid == src_devid || devid == dst_devid);
+
+	if (devid == dst_devid)
+		stream = in_peer_transfer_streams[src_devid][dst_devid];
+	else
+		stream = out_peer_transfer_streams[src_devid][dst_devid];
+	STARPU_ASSERT(stream);
+	return stream;
 }
 
 cudaStream_t starpu_cuda_get_local_stream(void)
@@ -274,7 +295,10 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 
 	for (i = 0; i < ncudagpus; i++)
 	{
-		cures = cudaStreamCreate(&peer_transfer_streams[i][devid]);
+		cures = cudaStreamCreate(&in_peer_transfer_streams[i][devid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaStreamCreate(&out_peer_transfer_streams[devid][i]);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
@@ -300,7 +324,10 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 	cudaStreamDestroy(out_transfer_streams[devid]);
 
 	for (i = 0; i < ncudagpus; i++)
-		cudaStreamDestroy(peer_transfer_streams[i][devid]);
+	{
+		cudaStreamDestroy(in_peer_transfer_streams[i][devid]);
+		cudaStreamDestroy(out_peer_transfer_streams[devid][i]);
+	}
 
 	/* cleanup the runtime API internal stuffs (which CUBLAS is using) */
 	cures = cudaThreadExit();

+ 2 - 2
src/drivers/cuda/driver_cuda.h

@@ -48,8 +48,8 @@ void *_starpu_cuda_worker(void *);
 #  define _starpu_cuda_discover_devices(config) ((void) config)
 #endif
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
-cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
 struct _starpu_worker_set;