|
@@ -18,40 +18,10 @@
|
|
|
|
|
|
#include <starpu.h>
|
|
|
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
|
|
|
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
|
|
|
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
-#endif
|
|
|
-#ifdef STARPU_USE_OPENCL
|
|
|
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
|
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
|
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
|
-#endif
|
|
|
static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
|
|
|
|
|
|
static const struct starpu_data_copy_methods block_copy_data_methods_s =
|
|
|
{
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
- .ram_to_cuda = copy_ram_to_cuda,
|
|
|
- .cuda_to_ram = copy_cuda_to_ram,
|
|
|
- .ram_to_cuda_async = copy_ram_to_cuda_async,
|
|
|
- .cuda_to_ram_async = copy_cuda_to_ram_async,
|
|
|
- .cuda_to_cuda = copy_cuda_to_cuda,
|
|
|
-#endif
|
|
|
-#ifdef STARPU_USE_OPENCL
|
|
|
- .ram_to_opencl = copy_ram_to_opencl,
|
|
|
- .opencl_to_ram = copy_opencl_to_ram,
|
|
|
- .opencl_to_opencl = copy_opencl_to_opencl,
|
|
|
- .ram_to_opencl_async = copy_ram_to_opencl_async,
|
|
|
- .opencl_to_ram_async = copy_opencl_to_ram_async,
|
|
|
- .opencl_to_opencl_async = copy_opencl_to_opencl_async,
|
|
|
-#endif
|
|
|
.any_to_any = copy_any_to_any,
|
|
|
};
|
|
|
|
|
@@ -494,279 +464,6 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
|
|
|
starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
|
|
|
}
|
|
|
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
|
|
|
-{
|
|
|
- struct starpu_block_interface *src_block = src_interface;
|
|
|
- struct starpu_block_interface *dst_block = dst_interface;
|
|
|
-
|
|
|
- uint32_t nx = src_block->nx;
|
|
|
- uint32_t ny = src_block->ny;
|
|
|
- uint32_t nz = src_block->nz;
|
|
|
- size_t elemsize = src_block->elemsize;
|
|
|
-
|
|
|
- cudaError_t cures;
|
|
|
-
|
|
|
- if (IS_CONTIGUOUS_MATRIX(nx, ny, src_block->ldy) && (src_block->ldy == dst_block->ldy))
|
|
|
- {
|
|
|
-
|
|
|
- if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
|
|
|
- IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
|
|
|
- {
|
|
|
- starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, NULL, kind);
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
-
|
|
|
- cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
|
|
|
- (char *)src_block->ptr, src_block->ldz*elemsize,
|
|
|
- nx*ny*elemsize, nz, kind);
|
|
|
- if (!cures)
|
|
|
- cures = cudaDeviceSynchronize();
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
- }
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
-
|
|
|
-
|
|
|
- unsigned layer;
|
|
|
- for (layer = 0; layer < src_block->nz; layer++)
|
|
|
- {
|
|
|
- uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
|
|
|
- uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
|
|
|
-
|
|
|
- cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
|
|
|
- (char *)src_ptr, src_block->ldy*elemsize,
|
|
|
- nx*elemsize, ny, kind);
|
|
|
-
|
|
|
- if (!cures)
|
|
|
- cures = cudaDeviceSynchronize();
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
-
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
|
|
|
-{
|
|
|
- struct starpu_block_interface *src_block = src_interface;
|
|
|
- struct starpu_block_interface *dst_block = dst_interface;
|
|
|
-
|
|
|
- uint32_t nx = src_block->nx;
|
|
|
- uint32_t ny = src_block->ny;
|
|
|
- uint32_t nz = src_block->nz;
|
|
|
- size_t elemsize = src_block->elemsize;
|
|
|
-
|
|
|
- cudaError_t cures;
|
|
|
-
|
|
|
- int ret;
|
|
|
-
|
|
|
-
|
|
|
- * plans within the block, we can avoid many small transfers that way */
|
|
|
- if (IS_CONTIGUOUS_MATRIX(nx, ny, src_block->ldy) && (src_block->ldy == dst_block->ldy))
|
|
|
- {
|
|
|
-
|
|
|
- if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
|
|
|
- IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
|
|
|
- {
|
|
|
- ret = starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, stream, kind);
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- double start;
|
|
|
-
|
|
|
- starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
|
|
|
- cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
|
|
|
- (char *)src_block->ptr, src_block->ldz*elemsize,
|
|
|
- nx*ny*elemsize, nz, kind, stream);
|
|
|
- starpu_interface_end_driver_copy_async(src_node, dst_node, start);
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- {
|
|
|
- cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
|
|
|
- (char *)src_block->ptr, src_block->ldz*elemsize,
|
|
|
- nx*ny*elemsize, nz, kind);
|
|
|
- if (!cures)
|
|
|
- cures = cudaDeviceSynchronize();
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
-
|
|
|
- ret = 0;
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- ret = -EAGAIN;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
-
|
|
|
-
|
|
|
- unsigned layer;
|
|
|
- for (layer = 0; layer < src_block->nz; layer++)
|
|
|
- {
|
|
|
- uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
|
|
|
- uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
|
|
|
- double start;
|
|
|
-
|
|
|
- starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
|
|
|
- cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
|
|
|
- (char *)src_ptr, src_block->ldy*elemsize,
|
|
|
- nx*elemsize, ny, kind, stream);
|
|
|
- starpu_interface_end_driver_copy_async(src_node, dst_node, start);
|
|
|
-
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- {
|
|
|
-
|
|
|
- goto no_async_default;
|
|
|
- }
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
- ret = -EAGAIN;
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
- starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
-
|
|
|
- return ret;
|
|
|
-
|
|
|
-no_async_default:
|
|
|
-
|
|
|
- {
|
|
|
- unsigned layer;
|
|
|
- for (layer = 0; layer < src_block->nz; layer++)
|
|
|
- {
|
|
|
- uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
|
|
|
- uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
|
|
|
-
|
|
|
- cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
|
|
|
- (char *)src_ptr, src_block->ldy*elemsize,
|
|
|
- nx*elemsize, ny, kind);
|
|
|
-
|
|
|
- if (!cures)
|
|
|
- cures = cudaDeviceSynchronize();
|
|
|
- if (STARPU_UNLIKELY(cures))
|
|
|
- STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
- }
|
|
|
-
|
|
|
- starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
- return 0;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
|
|
|
-{
|
|
|
- return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
-{
|
|
|
- return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
-{
|
|
|
- return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
|
|
|
-{
|
|
|
- return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
|
|
|
-{
|
|
|
- return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef STARPU_USE_OPENCL
|
|
|
-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
|
-{
|
|
|
- struct starpu_block_interface *src_block = src_interface;
|
|
|
- struct starpu_block_interface *dst_block = dst_interface;
|
|
|
- int ret = 0;
|
|
|
-
|
|
|
- uint32_t nx = src_block->nx;
|
|
|
- uint32_t ny = src_block->ny;
|
|
|
-
|
|
|
-
|
|
|
- * plans within the block, we can avoid many small transfers that way */
|
|
|
- if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
|
|
|
- IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
|
|
|
-
|
|
|
- {
|
|
|
- ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
|
|
|
- dst_block->dev_handle, dst_block->offset, dst_node,
|
|
|
- src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
|
- event);
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
-
|
|
|
-
|
|
|
- unsigned layer;
|
|
|
- for (layer = 0; layer < src_block->nz; layer++)
|
|
|
- {
|
|
|
- unsigned j;
|
|
|
- for(j=0 ; j<src_block->ny ; j++)
|
|
|
- {
|
|
|
- ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
|
|
|
- src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
|
|
|
- src_node,
|
|
|
- dst_block->dev_handle,
|
|
|
- dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
|
|
|
- dst_node,
|
|
|
- src_block->nx*src_block->elemsize,
|
|
|
- event);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
-
|
|
|
- return ret;
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
|
-{
|
|
|
- return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
|
-{
|
|
|
- return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
|
-{
|
|
|
- return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
-{
|
|
|
- return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
-{
|
|
|
- return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
|
|
|
-}
|
|
|
-
|
|
|
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
-{
|
|
|
- return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
|
|
|
-}
|
|
|
-
|
|
|
-#endif
|
|
|
-
|
|
|
static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
|
|
|
{
|
|
|
struct starpu_block_interface *src_block = (struct starpu_block_interface *) src_interface;
|