소스 검색

port r12533 from 1.1: Do not let non-CUDA workers use non-0 streams, CUDA seems not very threadsafe with that. Make the coherency engine avoid selecting non-CUDA workers to issue transfers, to avoid letting them use the 0 stream.

Samuel Thibault 11 년 전
부모
커밋
058e113825
4개의 변경된 파일25개의 추가작업 그리고 19개의 파일을 삭제
  1. 7 1
      src/datawizard/coherency.c
  2. 4 4
      src/datawizard/copy_driver.c
  3. 12 12
      src/drivers/cuda/driver_cuda.c
  4. 2 2
      src/drivers/cuda/driver_cuda.h

+ 7 - 1
src/datawizard/coherency.c

@@ -194,7 +194,13 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 			/* GPUs not always allow direct remote access: if CUDA4
 			/* GPUs not always allow direct remote access: if CUDA4
 			 * is enabled, we allow two CUDA devices to communicate. */
 			 * is enabled, we allow two CUDA devices to communicate. */
-			return kind == STARPU_CPU_RAM || kind == STARPU_CUDA_RAM;
+			return
+#if 0
+				/* CUDA does not seem very safe with concurrent
+				 * transfer queueing, avoid queueing from CPUs */
+				kind == STARPU_CPU_RAM ||
+#endif
+				kind == STARPU_CUDA_RAM;
 		}
 		}
 #else
 #else
 			/* Direct GPU-GPU transfers are not allowed in general */
 			/* Direct GPU-GPU transfers are not allowed in general */

+ 4 - 4
src/datawizard/copy_driver.c

@@ -163,7 +163,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
-			stream = starpu_cuda_get_out_transfer_stream(src_node);
+			stream = starpu_cuda_get_local_out_transfer_stream();
 			if (copy_methods->cuda_to_ram_async)
 			if (copy_methods->cuda_to_ram_async)
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
 			else
@@ -199,7 +199,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 
-			stream = starpu_cuda_get_in_transfer_stream(dst_node);
+			stream = starpu_cuda_get_local_in_transfer_stream();
 			if (copy_methods->ram_to_cuda_async)
 			if (copy_methods->ram_to_cuda_async)
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
 			else
@@ -531,7 +531,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
 				size,
-				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
 				cudaMemcpyDeviceToHost);
 				cudaMemcpyDeviceToHost);
 
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -539,7 +539,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
 				size,
-				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
 				cudaMemcpyHostToDevice);
 				cudaMemcpyHostToDevice);
 
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):

+ 12 - 12
src/drivers/cuda/driver_cuda.c

@@ -42,8 +42,8 @@ static int ncudagpus;
 static size_t global_mem[STARPU_MAXCUDADEVS];
 static size_t global_mem[STARPU_MAXCUDADEVS];
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t streams[STARPU_NMAXWORKERS];
-static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
-static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
+static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 static cudaEvent_t task_events[STARPU_MAXCUDADEVS];
 static cudaEvent_t task_events[STARPU_MAXCUDADEVS];
@@ -116,18 +116,18 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
 {
 {
-	int devid = _starpu_memory_node_get_devid(node);
+	int worker = starpu_worker_get_id();
 
 
-	return in_transfer_streams[devid];
+	return in_transfer_streams[worker];
 }
 }
 
 
-cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
 {
 {
-	int devid = _starpu_memory_node_get_devid(node);
+	int worker = starpu_worker_get_id();
 
 
-	return out_transfer_streams[devid];
+	return out_transfer_streams[worker];
 }
 }
 
 
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
@@ -261,11 +261,11 @@ static void init_context(unsigned devid)
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 
-	cures = cudaStreamCreate(&in_transfer_streams[devid]);
+	cures = cudaStreamCreate(&in_transfer_streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 
-	cures = cudaStreamCreate(&out_transfer_streams[devid]);
+	cures = cudaStreamCreate(&out_transfer_streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 
@@ -285,8 +285,8 @@ static void deinit_context(int workerid)
 
 
 	cudaEventDestroy(task_events[workerid]);
 	cudaEventDestroy(task_events[workerid]);
 	cudaStreamDestroy(streams[workerid]);
 	cudaStreamDestroy(streams[workerid]);
-	cudaStreamDestroy(in_transfer_streams[devid]);
-	cudaStreamDestroy(out_transfer_streams[devid]);
+	cudaStreamDestroy(in_transfer_streams[workerid]);
+	cudaStreamDestroy(out_transfer_streams[workerid]);
 	for (i = 0; i < ncudagpus; i++)
 	for (i = 0; i < ncudagpus; i++)
 		cudaStreamDestroy(peer_transfer_streams[i][devid]);
 		cudaStreamDestroy(peer_transfer_streams[i][devid]);
 
 

+ 2 - 2
src/drivers/cuda/driver_cuda.h

@@ -48,8 +48,8 @@ void *_starpu_cuda_worker(void *);
 #  define _starpu_cuda_discover_devices(config) ((void) config)
 #  define _starpu_cuda_discover_devices(config) ((void) config)
 #endif
 #endif
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
-cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
 
 struct _starpu_worker;
 struct _starpu_worker;