@@ -32,6 +32,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -48,6 +49,7 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -55,7 +57,6 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 	.ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -380,7 +381,7 @@ static void free_block_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -398,7 +399,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+				nx*ny*nz*elemsize, kind);
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
 		}
@@ -406,7 +407,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+				nx*ny*elemsize, nz, kind);
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
 		}
@@ -421,7 +422,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 
 			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
 				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyDeviceToHost);
+				nx*elemsize, ny, kind);
 
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
@@ -435,7 +436,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -457,11 +458,11 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyDeviceToHost, *stream);
+				nx*ny*nz*elemsize, kind, *stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+					nx*ny*nz*elemsize, kind);
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 				cudaThreadSynchronize();
@@ -477,12 +478,12 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyDeviceToHost, *stream);
+				nx*ny*elemsize, nz, kind, *stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+					nx*ny*elemsize, nz, kind);
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 				cudaThreadSynchronize();
@@ -504,7 +505,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
 				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);
+				nx*elemsize, ny, kind, *stream);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -533,7 +534,7 @@ no_async_default:
 
 		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
 			(char *)src_ptr, src_block->ldy*elemsize,
-			nx*elemsize, ny, cudaMemcpyDeviceToHost);
+			nx*elemsize, ny, kind);
 
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -545,170 +546,29 @@ no_async_default:
 	}
 }
 
-
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-	int ret;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
-		{
-			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyHostToDevice, *stream);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-				cudaThreadSynchronize();
-
-				ret = 0;
-			}
-			else {
-				ret = -EAGAIN;
-			}
-
-		}
-		else {
-			/* Are all plans contiguous */
-			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
-				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyHostToDevice, *stream);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-				cudaThreadSynchronize();
-
-				ret = 0;
-			}
-			else {
-				ret = -EAGAIN;
-			}
-		}
-	}
-	else {
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyHostToDevice, *stream);
-
-			if (STARPU_UNLIKELY(cures))
-			{
-				/* I don't know how to do that "better" */
-				goto no_async_default;
-			}
-
-		}
-
-		ret = -EAGAIN;
-
-	}
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-
-	return ret;
-
-no_async_default:
-
-	{
-	unsigned layer;
-	for (layer = 0; layer < src_block->nz; layer++)
-	{
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-			(char *)src_ptr, src_block->ldy*elemsize,
-			nx*elemsize, ny, cudaMemcpyHostToDevice);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-	cudaThreadSynchronize();
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-	return 0;
-	}
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
-		{
-			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
-		}
-		else {
-			/* Are all plans contiguous */
-			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
-		}
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-	else {
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyHostToDevice);
-
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		}
-	}
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	cudaThreadSynchronize();
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
 
-	return 0;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 #endif // STARPU_USE_CUDA
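
For readers skimming the patch, the shape of the refactoring is easier to see in isolation: the direction-specific copy routines differed only in the cudaMemcpyKind constant they passed, so one helper takes the kind as a parameter and each direction collapses to a one-line wrapper, which is what makes the new cuda_to_cuda method a three-line addition. Below is a minimal, self-contained sketch of that pattern against the plain CUDA runtime API; the toy_* names are hypothetical illustrations, not StarPU functions, and error checks on allocation are omitted for brevity.

	#include <stdio.h>
	#include <cuda_runtime.h>

	/* One copy routine, parameterized by direction, mirroring
	 * copy_cuda_common() in the patch above. */
	static int toy_copy_common(void *dst, const void *src, size_t size,
	                           enum cudaMemcpyKind kind)
	{
		cudaError_t cures = cudaMemcpy(dst, src, size, kind);
		if (cures != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(cures));
			return -1;
		}
		return 0;
	}

	/* Each direction becomes a thin wrapper, just as the patch does for
	 * copy_ram_to_cuda, copy_cuda_to_ram and the new copy_cuda_to_cuda. */
	static int toy_ram_to_cuda(void *d, const void *s, size_t n)
	{ return toy_copy_common(d, s, n, cudaMemcpyHostToDevice); }

	static int toy_cuda_to_ram(void *d, const void *s, size_t n)
	{ return toy_copy_common(d, s, n, cudaMemcpyDeviceToHost); }

	static int toy_cuda_to_cuda(void *d, const void *s, size_t n)
	{ return toy_copy_common(d, s, n, cudaMemcpyDeviceToDevice); }

	int main(void)
	{
		float host_in[4] = {1, 2, 3, 4}, host_out[4];
		float *dev_a, *dev_b;
		cudaMalloc((void **)&dev_a, sizeof(host_in));
		cudaMalloc((void **)&dev_b, sizeof(host_in));

		toy_ram_to_cuda(dev_a, host_in, sizeof(host_in));
		toy_cuda_to_cuda(dev_b, dev_a, sizeof(host_in)); /* the new path */
		toy_cuda_to_ram(host_out, dev_b, sizeof(host_in));

		printf("round trip: %g %g %g %g\n",
		       host_out[0], host_out[1], host_out[2], host_out[3]);
		cudaFree(dev_a);
		cudaFree(dev_b);
		return 0;
	}

The design point mirrored here is that cudaMemcpy, cudaMemcpy2D and their async variants already take the transfer direction as an argument, so the block-layout logic (the ldy/ldz contiguity tests and the per-plane fallback loop) only needs to exist once; threading kind through it is what lets the @@ -545,170 +546,29 @@ hunk delete roughly 140 duplicated lines.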