|
@@ -331,21 +331,15 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
|
|
|
}
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
|
|
|
+static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
|
|
|
{
|
|
|
struct starpu_vector_interface *src_vector = src_interface;
|
|
|
struct starpu_vector_interface *dst_vector = dst_interface;
|
|
|
+ int ret;
|
|
|
|
|
|
- cudaError_t cures;
|
|
|
-
|
|
|
- cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
-
|
|
|
+ ret = starpu_cuda_copy_async_sync((void *)src_vector->ptr, src_node, (void *)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize, stream, kind);
|
|
|
_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
|
|
|
-
|
|
|
- return 0;
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
@@ -385,24 +379,21 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
|
|
|
{
|
|
|
- return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
|
|
|
+ return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
|
|
|
}
|
|
|
|
|
|
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
|
|
|
{
|
|
|
- return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
|
|
|
+ return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
|
|
|
}
|
|
|
|
|
|
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
|
|
|
{
|
|
|
if (src_node == dst_node)
|
|
|
{
|
|
|
- return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
|
|
|
+ return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
@@ -416,38 +407,11 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- cudaStream_t stream, enum cudaMemcpyKind kind)
|
|
|
-{
|
|
|
- struct starpu_vector_interface *src_vector = src_interface;
|
|
|
- struct starpu_vector_interface *dst_vector = dst_interface;
|
|
|
-
|
|
|
- cudaError_t cures;
|
|
|
-
|
|
|
- _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
|
|
|
- cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
|
|
|
- _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
|
|
|
- if (cures)
|
|
|
- {
|
|
|
- /* do it in a synchronous fashion */
|
|
|
- cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
-
|
|
|
- return 0;
|
|
|
- }
|
|
|
-
|
|
|
- _STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
|
|
|
-
|
|
|
- return -EAGAIN;
|
|
|
-}
|
|
|
-
|
|
|
static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
|
|
|
{
|
|
|
if (src_node == dst_node)
|
|
|
{
|
|
|
- return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
|
|
|
+ return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
@@ -461,16 +425,14 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
|
|
|
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
|
|
|
{
|
|
|
- return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
|
|
|
+ return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
|
|
|
}
|
|
|
|
|
|
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
|
|
|
- void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
|
|
|
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
|
|
|
{
|
|
|
- return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
|
|
|
+ return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
|
|
|
}
|
|
|
|
|
|
#endif // STARPU_USE_CUDA
|