|
@@ -720,14 +720,61 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
|
|
starpu_block_interface_t *dst_block = dst_interface;
|
|
starpu_block_interface_t *dst_block = dst_interface;
|
|
int err,ret;
|
|
int err,ret;
|
|
|
|
|
|
-
|
|
+ uint32_t nx = src_block->nx;
|
|
- STARPU_ASSERT((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy));
|
|
+ uint32_t ny = src_block->ny;
|
|
|
|
|
|
- err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
|
|
+
|
|
- src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
+ * planes within the block, we can avoid many small transfers that way */
|
|
- dst_block->offset, (cl_event*)_event, &ret);
|
|
+ if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
|
|
- if (STARPU_UNLIKELY(err))
|
|
+ {
|
|
- STARPU_OPENCL_REPORT_ERROR(err);
|
|
+
|
|
|
|
+ if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
|
|
|
|
+ {
|
|
|
|
+ err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
|
|
|
|
+ src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
|
|
+ dst_block->offset, (cl_event*)_event, &ret);
|
|
|
|
+ if (STARPU_UNLIKELY(err))
|
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
+ }
|
|
|
|
+ else {
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ STARPU_ASSERT(0);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ else {
|
|
|
|
+
|
|
|
|
+ unsigned layer;
|
|
|
|
+ for (layer = 0; layer < src_block->nz; layer++)
|
|
|
|
+ {
|
|
|
|
+ unsigned j;
|
|
|
|
+ for(j=0 ; j<src_block->ny ; j++) {
|
|
|
|
+ void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
|
|
|
|
+ err = _starpu_opencl_copy_ram_to_opencl(ptr, (cl_mem)dst_block->dev_handle,
|
|
|
|
+ src_block->nx*src_block->elemsize,
|
|
|
|
+ layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
|
|
|
|
+ + dst_block->offset, NULL);
|
|
|
|
+ if (STARPU_UNLIKELY(err))
|
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
|
|
STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
|
|
|
@@ -740,14 +787,53 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
|
|
starpu_block_interface_t *dst_block = dst_interface;
|
|
starpu_block_interface_t *dst_block = dst_interface;
|
|
int err, ret;
|
|
int err, ret;
|
|
|
|
|
|
-
|
|
+
|
|
- STARPU_ASSERT((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy));
|
|
+ * planes within the block, we can avoid many small transfers that way */
|
|
-
|
|
+ if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
|
|
- err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
|
|
+ {
|
|
- src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
+
|
|
- src_block->offset, (cl_event*)_event, &ret);
|
|
+ if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
|
|
- if (STARPU_UNLIKELY(err))
|
|
+ {
|
|
- STARPU_OPENCL_REPORT_ERROR(err);
|
|
+ err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
|
|
|
|
+ src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
|
|
+ src_block->offset, (cl_event*)_event, &ret);
|
|
|
|
+ if (STARPU_UNLIKELY(err))
|
|
|
|
+ STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
+ }
|
|
|
|
+ else {
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ STARPU_ASSERT(0);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ else {
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ unsigned layer;
|
|
|
|
+ for (layer = 0; layer < src_block->nz; layer++)
|
|
|
|
+ {
|
|
|
|
+ unsigned j;
|
|
|
|
+ for(j=0 ; j<src_block->ny ; j++) {
|
|
|
|
+ void *ptr = (void *)dst_block->ptr+(layer*dst_block->ldz*dst_block->elemsize)+(j*dst_block->ldy*dst_block->elemsize);
|
|
|
|
+ err = _starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, ptr,
|
|
|
|
+ src_block->nx*src_block->elemsize,
|
|
|
|
+ layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
|
|
|
|
+ src_block->offset, NULL);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
|
|
STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
|
|