소스 검색

directly look up the destination node instead of relying on the node of the current worker

Samuel Thibault 8 년 전
부모
커밋
43a3c904ed
3개의 변경된 파일, 29개의 추가 및 15개의 삭제
  1. 6 7
      src/datawizard/copy_driver.c
  2. 21 8
      src/drivers/cuda/driver_cuda.c
  3. 2 0
      src/drivers/cuda/driver_cuda.h

+ 6 - 7
src/datawizard/copy_driver.c

@@ -168,9 +168,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		unsigned devid;
 		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
 		{
-			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
-			int worker = starpu_worker_get_id_check();
-			devid = starpu_worker_get_devid(worker);
+			/* GPU-GPU transfer, issue it from the destination */
+			devid = _starpu_memory_node_get_devid(dst_node);
 		}
 		else
 		{
@@ -212,7 +211,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_out_transfer_stream();
+			stream = starpu_cuda_get_out_transfer_stream(src_node);
 			if (copy_methods->cuda_to_ram_async)
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -248,7 +247,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_local_in_transfer_stream();
+			stream = starpu_cuda_get_in_transfer_stream(dst_node);
 			if (copy_methods->ram_to_cuda_async)
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -665,7 +664,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) (src + src_offset), src_node,
 				(void*) (dst + dst_offset), dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
 				cudaMemcpyDeviceToHost);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -673,7 +672,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) (src + src_offset), src_node,
 				(void*) (dst + dst_offset), dst_node,
 				size,
-				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
+				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
 				cudaMemcpyHostToDevice);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):

+ 21 - 8
src/drivers/cuda/driver_cuda.c

@@ -167,6 +167,16 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 	return stream;
 }
 
+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned dst_node)
+{
+	int dst_devid = _starpu_memory_node_get_devid(dst_node);
+	cudaStream_t stream;
+
+	stream = in_transfer_streams[dst_devid];
+	STARPU_ASSERT(stream);
+	return stream;
+}
+
 cudaStream_t starpu_cuda_get_local_out_transfer_stream()
 {
 	int worker = starpu_worker_get_id_check();
@@ -178,20 +188,23 @@ cudaStream_t starpu_cuda_get_local_out_transfer_stream()
 	return stream;
 }
 
+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned src_node)
+{
+	int src_devid = _starpu_memory_node_get_devid(src_node);
+	cudaStream_t stream;
+
+	stream = out_transfer_streams[src_devid];
+	STARPU_ASSERT(stream);
+	return stream;
+}
+
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
 {
-	int worker = starpu_worker_get_id_check();
-	int devid = starpu_worker_get_devid(worker);
 	int src_devid = _starpu_memory_node_get_devid(src_node);
 	int dst_devid = _starpu_memory_node_get_devid(dst_node);
 	cudaStream_t stream;
 
-	STARPU_ASSERT(devid == src_devid || devid == dst_devid);
-
-	if (devid == dst_devid)
-		stream = in_peer_transfer_streams[src_devid][dst_devid];
-	else
-		stream = out_peer_transfer_streams[src_devid][dst_devid];
+	stream = in_peer_transfer_streams[src_devid][dst_devid];
 	STARPU_ASSERT(stream);
 	return stream;
 }

+ 2 - 0
src/drivers/cuda/driver_cuda.h

@@ -47,7 +47,9 @@ void *_starpu_cuda_worker(void *);
 #endif
 #ifdef STARPU_USE_CUDA
 cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned dst_node);
 cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned src_node);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
 struct _starpu_worker_set;