|
@@ -1,6 +1,6 @@
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
*
|
|
*
|
|
- * Copyright (C) 2009-2012 Université de Bordeaux 1
|
|
|
|
|
|
+ * Copyright (C) 2009-2013 Université de Bordeaux 1
|
|
* Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
|
|
* Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
|
|
*
|
|
*
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
@@ -38,8 +38,10 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
|
|
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
|
|
static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
|
|
+static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
|
|
#endif
|
|
#endif
|
|
|
|
|
|
static struct starpu_data_copy_methods block_copy_data_methods_s =
|
|
static struct starpu_data_copy_methods block_copy_data_methods_s =
|
|
@@ -55,8 +57,10 @@ static struct starpu_data_copy_methods block_copy_data_methods_s =
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
.ram_to_opencl = copy_ram_to_opencl,
|
|
.ram_to_opencl = copy_ram_to_opencl,
|
|
.opencl_to_ram = copy_opencl_to_ram,
|
|
.opencl_to_ram = copy_opencl_to_ram,
|
|
|
|
+ .opencl_to_opencl = copy_opencl_to_opencl,
|
|
.ram_to_opencl_async = copy_ram_to_opencl_async,
|
|
.ram_to_opencl_async = copy_ram_to_opencl_async,
|
|
.opencl_to_ram_async = copy_opencl_to_ram_async,
|
|
.opencl_to_ram_async = copy_opencl_to_ram_async,
|
|
|
|
+ .opencl_to_opencl_async = copy_opencl_to_opencl_async,
|
|
#endif
|
|
#endif
|
|
};
|
|
};
|
|
|
|
|
|
@@ -494,11 +498,11 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
|
|
#endif // STARPU_USE_CUDA
|
|
#endif // STARPU_USE_CUDA
|
|
|
|
|
|
#ifdef STARPU_USE_OPENCL
|
|
#ifdef STARPU_USE_OPENCL
|
|
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event)
|
|
|
|
|
|
+static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
{
|
|
{
|
|
struct starpu_block_interface *src_block = src_interface;
|
|
struct starpu_block_interface *src_block = src_interface;
|
|
struct starpu_block_interface *dst_block = dst_interface;
|
|
struct starpu_block_interface *dst_block = dst_interface;
|
|
- int err, ret = 0;
|
|
|
|
|
|
+ int ret = 0;
|
|
|
|
|
|
uint32_t nx = src_block->nx;
|
|
uint32_t nx = src_block->nx;
|
|
uint32_t ny = src_block->ny;
|
|
uint32_t ny = src_block->ny;
|
|
@@ -510,11 +514,10 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
|
|
/* Is that a single contiguous buffer ? */
|
|
/* Is that a single contiguous buffer ? */
|
|
if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
|
|
if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
|
|
{
|
|
{
|
|
- err = starpu_opencl_copy_ram_to_opencl((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
|
|
|
|
|
|
+ ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node, src_block->offset,
|
|
|
|
+ dst_block->dev_handle, dst_node, dst_block->offset,
|
|
src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
- dst_block->offset, event, &ret);
|
|
|
|
- if (STARPU_UNLIKELY(err))
|
|
|
|
- STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
|
|
+ event);
|
|
}
|
|
}
|
|
else
|
|
else
|
|
{
|
|
{
|
|
@@ -525,36 +528,20 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
|
|
else
|
|
else
|
|
{
|
|
{
|
|
/* Default case: we transfer all lines one by one: ny*nz transfers */
|
|
/* Default case: we transfer all lines one by one: ny*nz transfers */
|
|
|
|
+ /* TODO: rect support */
|
|
unsigned layer;
|
|
unsigned layer;
|
|
for (layer = 0; layer < src_block->nz; layer++)
|
|
for (layer = 0; layer < src_block->nz; layer++)
|
|
{
|
|
{
|
|
unsigned j;
|
|
unsigned j;
|
|
for(j=0 ; j<src_block->ny ; j++)
|
|
for(j=0 ; j<src_block->ny ; j++)
|
|
{
|
|
{
|
|
- void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
|
|
|
|
- err = starpu_opencl_copy_ram_to_opencl(ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
|
|
|
|
|
|
+ ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node,
|
|
|
|
+ src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
|
|
|
|
+ dst_block->dev_handle, dst_node,
|
|
|
|
+ dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
|
|
src_block->nx*src_block->elemsize,
|
|
src_block->nx*src_block->elemsize,
|
|
- layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
|
|
|
|
- + dst_block->offset, NULL, NULL);
|
|
|
|
- if (STARPU_UNLIKELY(err))
|
|
|
|
- STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
|
|
+ NULL);
|
|
}
|
|
}
|
|
-
|
|
|
|
- // int *foo = (int *)(src_block->ptr+(layer*src_block->ldz*src_block->elemsize));
|
|
|
|
- // fprintf(stderr, "layer %d --> value %d\n", layer, foo[1]);
|
|
|
|
- // const size_t buffer_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
|
|
|
|
- // //const size_t buffer_origin[3] = {0, 0, 0};
|
|
|
|
- // const size_t host_origin[3] = {layer*dst_block->ldz*dst_block->elemsize+dst_block->offset, 0, 0};
|
|
|
|
- // size_t region[3] = {src_block->nx*src_block->elemsize,src_block->ny, 1};
|
|
|
|
- // size_t buffer_row_pitch=region[0];
|
|
|
|
- // size_t buffer_slice_pitch=region[1] * buffer_row_pitch;
|
|
|
|
- // size_t host_row_pitch=region[0];
|
|
|
|
- // size_t host_slice_pitch=region[1] * host_row_pitch;
|
|
|
|
- //
|
|
|
|
- // _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
|
|
|
|
- // buffer_origin, host_origin, region,
|
|
|
|
- // buffer_row_pitch, buffer_slice_pitch,
|
|
|
|
- // host_row_pitch, host_slice_pitch, NULL);
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
@@ -563,67 +550,19 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
|
|
return ret;
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
|
|
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event)
|
|
|
|
|
|
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
{
|
|
{
|
|
- struct starpu_block_interface *src_block = src_interface;
|
|
|
|
- struct starpu_block_interface *dst_block = dst_interface;
|
|
|
|
- int err, ret = 0;
|
|
|
|
-
|
|
|
|
- /* We may have a contiguous buffer for the entire block, or contiguous
|
|
|
|
- * plans within the block, we can avoid many small transfers that way */
|
|
|
|
- if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
|
|
|
|
- {
|
|
|
|
- /* Is that a single contiguous buffer ? */
|
|
|
|
- if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
|
|
|
|
- {
|
|
|
|
- err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
|
|
|
|
- src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
|
|
|
|
- src_block->offset, event, &ret);
|
|
|
|
- if (STARPU_UNLIKELY(err))
|
|
|
|
- STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
- }
|
|
|
|
- else
|
|
|
|
- {
|
|
|
|
- /* Are all plans contiguous */
|
|
|
|
- STARPU_ASSERT_MSG(0, "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- else
|
|
|
|
- {
|
|
|
|
- /* Default case: we transfer all lines one by one: ny*nz transfers */
|
|
|
|
- /* XXX non contiguous buffers are not properly supported yet. (TODO) */
|
|
|
|
- unsigned layer;
|
|
|
|
- for (layer = 0; layer < src_block->nz; layer++)
|
|
|
|
- {
|
|
|
|
- unsigned j;
|
|
|
|
- for(j=0 ; j<src_block->ny ; j++)
|
|
|
|
- {
|
|
|
|
- void *ptr = (void *)dst_block->ptr+(layer*dst_block->ldz*dst_block->elemsize)+(j*dst_block->ldy*dst_block->elemsize);
|
|
|
|
- err = starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, src_node, ptr, dst_node,
|
|
|
|
- src_block->nx*src_block->elemsize,
|
|
|
|
- layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
|
|
|
|
- src_block->offset, NULL, NULL);
|
|
|
|
- if (STARPU_UNLIKELY(err))
|
|
|
|
- STARPU_OPENCL_REPORT_ERROR(err);
|
|
|
|
- }
|
|
|
|
- // const size_t buffer_origin[3] = {src_block->offset, 0, 0};
|
|
|
|
- // const size_t host_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
|
|
|
|
- // size_t region[3] = {src_block->nx*src_block->elemsize,src_block->ny, 1};
|
|
|
|
- // size_t buffer_row_pitch=region[0];
|
|
|
|
- // size_t buffer_slice_pitch=region[1] * buffer_row_pitch;
|
|
|
|
- // size_t host_row_pitch=region[0];
|
|
|
|
- // size_t host_slice_pitch=region[1] * host_row_pitch;
|
|
|
|
- //
|
|
|
|
- // _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void *)dst_block->ptr, dst_node,
|
|
|
|
- // buffer_origin, host_origin, region,
|
|
|
|
- // buffer_row_pitch, buffer_slice_pitch,
|
|
|
|
- // host_row_pitch, host_slice_pitch, NULL);
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
|
|
|
|
+}
|
|
|
|
|
|
- _STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
|
|
|
|
|
|
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
|
|
+{
|
|
|
|
+ return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
|
|
|
|
+}
|
|
|
|
|
|
- return ret;
|
|
|
|
|
|
+static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
|
|
|
|
+{
|
|
|
|
+ return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
|
|
}
|
|
}
|
|
|
|
|
|
static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
@@ -636,6 +575,11 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
|
|
return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
|
|
return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
|
|
|
|
+{
|
|
|
|
+ return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
|
|
|
|
+}
|
|
|
|
+
|
|
#endif
|
|
#endif
|
|
|
|
|
|
/* as not all platform easily have a BLAS lib installed ... */
|
|
/* as not all platform easily have a BLAS lib installed ... */
|