ソースを参照

Fix in-place partitioning for OpenCL. OpenCL codelets are not supposed to use the ptr pointer, since in OpenCL we only have a cl_mem

Samuel Thibault 13 年 前
コミット
08f4ce1f07
共有32 個のファイルを変更した413 個の追加85 個の削除を含む
  1. 36 0
      doc/chapters/advanced-examples.texi
  2. 24 0
      doc/chapters/basic-api.texi
  3. 5 3
      doc/chapters/basic-examples.texi
  4. 1 1
      doc/chapters/vector_scal_opencl.texi
  5. 1 1
      doc/tutorial/vector_scal_opencl.c
  6. 1 1
      examples/basic_examples/block_opencl.c
  7. 1 1
      examples/basic_examples/vector_scal_opencl.c
  8. 9 7
      examples/filters/fblock_opencl.c
  9. 2 1
      examples/filters/fblock_opencl_kernel.cl
  10. 1 1
      examples/incrementer/incrementer_kernels_opencl.c
  11. 1 1
      examples/mandelbrot/mandelbrot.c
  12. 3 3
      examples/matvecmult/matvecmult.c
  13. 2 2
      examples/spmv/spmv_kernels.c
  14. 2 2
      examples/spmv/spmv_opencl.cl
  15. 7 2
      include/starpu_data_interfaces.h
  16. 8 7
      src/datawizard/interfaces/block_interface.c
  17. 7 5
      src/datawizard/interfaces/matrix_filters.c
  18. 8 7
      src/datawizard/interfaces/matrix_interface.c
  19. 2 12
      src/datawizard/interfaces/multiformat_interface.c
  20. 13 9
      src/datawizard/interfaces/vector_filters.c
  21. 8 7
      src/datawizard/interfaces/vector_interface.c
  22. 4 4
      src/drivers/opencl/driver_opencl.c
  23. 1 1
      src/drivers/opencl/driver_opencl.h
  24. 12 0
      tests/Makefile.am
  25. 182 0
      tests/datawizard/in_place_partition.c
  26. 1 1
      tests/datawizard/interfaces/block/block_opencl.c
  27. 1 1
      tests/datawizard/interfaces/matrix/matrix_opencl.c
  28. 1 1
      tests/datawizard/interfaces/vector/test_vector_opencl.c
  29. 39 0
      tests/datawizard/scal_cuda.cu
  30. 26 0
      tests/datawizard/scal_opencl.cl
  31. 2 2
      tests/datawizard/sync_and_notify_data_opencl.c
  32. 2 2
      tests/datawizard/write_only_tmp_buffer.c

+ 36 - 0
doc/chapters/advanced-examples.texi

@@ -232,6 +232,9 @@ starpu_data_partition(handle, &f);
 @end smallexample
 @end cartouche
 
+The task submission then uses @code{starpu_data_get_sub_data} to retrive the
+sub-handles to be passed as tasks parameters.
+
 @cartouche
 @smallexample
 /* Submit a task on each sub-vector */
@@ -255,6 +258,39 @@ for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
 Partitioning can be applied several times, see
 @code{examples/basic_examples/mult.c} and @code{examples/filters/}.
 
+Wherever the whole piece of data is already available, the partitioning will
+be done in-place, i.e. without allocating new buffers but just using pointers
+inside the existing copy. This is particularly important to be aware of when
+using OpenCL, where the kernel parameters are not pointers, but handles. The
+kernel thus needs to be also passed the offset within the OpenCL buffer:
+
+@cartouche
+@smallexample
+void opencl_func(void *buffers[], void *cl_arg)
+@{
+    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+    unsigned offset = STARPU_BLOCK_GET_OFFSET(buffers[0]);
+
+    ...
+    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
+    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
+    ...
+@}
+@end smallexample
+@end cartouche
+
+And the kernel has to shift from the pointer passed by the OpenCL driver:
+
+@cartouche
+@smallexample
+__kernel void opencl_kernel(__global int *vector, unsigned offset)
+@{
+    block = (__global void *)block + offset;
+    ...
+@}
+@end smallexample
+@end cartouche
+
 @node Performance model example
 @section Performance model example
 

+ 24 - 0
doc/chapters/basic-api.texi

@@ -608,6 +608,14 @@ todo
 todo
 @end deftypefun
 
+@deftypefun uintptr_t STARPU_VECTOR_GET_DEV_HANDLE ({void *}@var{interface})
+todo
+@end deftypefun
+
+@deftypefun uintptr_t STARPU_VECTOR_GET_OFFSET ({void *}@var{interface})
+todo
+@end deftypefun
+
 @deftypefun uint32_t STARPU_VECTOR_GET_NX ({void *}@var{interface})
 todo
 @end deftypefun
@@ -643,6 +651,14 @@ todo
 todo
 @end deftypefun
 
+@deftypefun uintptr_t STARPU_MATRIX_GET_DEV_HANDLE ({void *}@var{interface})
+todo
+@end deftypefun
+
+@deftypefun uintptr_t STARPU_MATRIX_GET_OFFSET ({void *}@var{interface})
+todo
+@end deftypefun
+
 @deftypefun uint32_t STARPU_MATRIX_GET_NX ({void *}@var{interface})
 todo
 @end deftypefun
@@ -694,6 +710,14 @@ todo
 todo
 @end deftypefun
 
+@deftypefun uintptr_t STARPU_BLOCK_GET_DEV_HANDLE ({void *}@var{interface})
+todo
+@end deftypefun
+
+@deftypefun uintptr_t STARPU_BLOCK_GET_OFFSET ({void *}@var{interface})
+todo
+@end deftypefun
+
 @deftypefun uint32_t STARPU_BLOCK_GET_NX ({void *}@var{interface})
 todo
 @end deftypefun

+ 5 - 3
doc/chapters/basic-examples.texi

@@ -421,8 +421,10 @@ __kernel void vector_mult_opencl(__global float* val, int nx, float factor)
 @end smallexample
 @end cartouche
 
-Similarly to CUDA, the pointer returned by @code{STARPU_VECTOR_GET_PTR} is here
-a device pointer, so that it is passed as such to the OpenCL kernel.
+Contrary to CUDA and CPU, @code{STARPU_VECTOR_GET_DEV_HANDLE} has to be used,
+which returns a @code{cl_mem} (which is not a device pointer, but an OpenCL
+handle), which can be passed as such to the OpenCL kernel. The difference is
+important when using partitioning, see @ref{Partitioning Data}.
 
 @cartouche
 @smallexample
@@ -442,7 +444,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
 @i{    id = starpu_worker_get_id();}
 @i{    devid = starpu_worker_get_devid(id);}

+ 1 - 1
doc/chapters/vector_scal_opencl.texi

@@ -22,7 +22,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);

+ 1 - 1
doc/tutorial/vector_scal_opencl.c

@@ -44,7 +44,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);

+ 1 - 1
examples/basic_examples/block_opencl.c

@@ -35,7 +35,7 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_command_queue queue;
 	cl_event event;
 	int id, devid, err;
-	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(descr[0]);
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);

+ 1 - 1
examples/basic_examples/vector_scal_opencl.c

@@ -37,7 +37,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
 	/* OpenCL copy of the vector pointer */
-	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);

+ 9 - 7
examples/filters/fblock_opencl.c

@@ -37,7 +37,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 	cl_event event;
 
         int *factor = cl_arg;
-	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(buffers[0]);
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(buffers[0]);
+	unsigned offset = STARPU_BLOCK_GET_OFFSET(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(buffers[0]);
@@ -51,12 +52,13 @@ void opencl_func(void *buffers[], void *cl_arg)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 	CHECK_CL_SET_KERNEL_ARG(kernel, 0, sizeof(block), &block);
-	CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(nx), &nx);
-	CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(ny), &ny);
-	CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(nz), &nz);
-	CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(ldy), &ldy);
-	CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldz), &ldz);
-	CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*factor), factor);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(offset), &offset);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(nx), &nx);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(ny), &ny);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(nz), &nz);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldy), &ldy);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(ldz), &ldz);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 
 	{
 		size_t global=nx*ny*nz;

+ 2 - 1
examples/filters/fblock_opencl_kernel.cl

@@ -14,9 +14,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void fblock_opencl(__global int* block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
+__kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
         int i, j, k;
+        block = (__global void *)block + offset;
         for(k=0; k<nz ; k++)
 	{
                 for(j=0; j<ny ; j++)

+ 1 - 1
examples/incrementer/incrementer_kernels_opencl.c

@@ -21,7 +21,7 @@
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)
 {
-	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_event event;

+ 1 - 1
examples/mandelbrot/mandelbrot.c

@@ -240,7 +240,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 	int *pcnt; /* unused for CUDA tasks */
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 
-	cl_mem data = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem data = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
 
 	cl_kernel kernel;
 	cl_command_queue queue;

+ 3 - 3
examples/matvecmult/matvecmult.c

@@ -29,9 +29,9 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 	cl_kernel kernel;
 	cl_command_queue queue;
 	int id, devid, err, n;
-	cl_mem matrix = (cl_mem)STARPU_MATRIX_GET_PTR(descr[0]);
-	cl_mem vector = (cl_mem)STARPU_VECTOR_GET_PTR(descr[1]);
-	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_PTR(descr[2]);
+	cl_mem matrix = (cl_mem)STARPU_MATRIX_GET_DEV_HANDLE(descr[0]);
+	cl_mem vector = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[1]);
+	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[2]);
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 	cl_event event;

+ 2 - 2
examples/spmv/spmv_kernels.c

@@ -35,10 +35,10 @@ void spmv_kernel_opencl(void *descr[], void *args)
 	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
 	uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
 
-	cl_mem vecin = (cl_mem)STARPU_VECTOR_GET_PTR(descr[1]);
+	cl_mem vecin = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[1]);
 	uint32_t nx_in = STARPU_VECTOR_GET_NX(descr[1]);
 
-	cl_mem vecout = (cl_mem)STARPU_VECTOR_GET_PTR(descr[2]);
+	cl_mem vecout = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[2]);
 	uint32_t nx_out = STARPU_VECTOR_GET_NX(descr[2]);
 
         id = starpu_worker_get_id();

+ 2 - 2
examples/spmv/spmv_opencl.cl

@@ -17,8 +17,8 @@
 __kernel void spmv(unsigned nnz, unsigned nrow,
                    __global float* nzval, __global unsigned* colind,
                    __global unsigned* rowptr, unsigned firstentry,
-                   __global float *vecin, unsigned nx_in,
-                   __global float *vecout, unsigned nx_out)
+                   __global float *vecin, unsigned vecin_offset, unsigned nx_in,
+                   __global float *vecout, unsigned vecout_offset, unsigned nx_out)
 {
 	unsigned row;
 	// for (row = 0; row < nrow; row++)

+ 7 - 2
include/starpu_data_interfaces.h

@@ -181,6 +181,8 @@ size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle);
 
 /* helper methods */
 #define STARPU_MATRIX_GET_PTR(interface)	(((struct starpu_matrix_interface *)(interface))->ptr)
+#define STARPU_MATRIX_GET_DEV_HANDLE(interface)	(((struct starpu_matrix_interface *)(interface))->dev_handle)
+#define STARPU_MATRIX_GET_OFFSET(interface)	(((struct starpu_matrix_interface *)(interface))->offset)
 #define STARPU_MATRIX_GET_NX(interface)	(((struct starpu_matrix_interface *)(interface))->nx)
 #define STARPU_MATRIX_GET_NY(interface)	(((struct starpu_matrix_interface *)(interface))->ny)
 #define STARPU_MATRIX_GET_LD(interface)	(((struct starpu_matrix_interface *)(interface))->ld)
@@ -215,6 +217,8 @@ size_t starpu_block_get_elemsize(starpu_data_handle_t handle);
 
 /* helper methods */
 #define STARPU_BLOCK_GET_PTR(interface)	(((struct starpu_block_interface *)(interface))->ptr)
+#define STARPU_BLOCK_GET_DEV_HANDLE(interface)	(((struct starpu_block_interface *)(interface))->dev_handle)
+#define STARPU_BLOCK_GET_OFFSET(interface)	(((struct starpu_block_interface *)(interface))->offset)
 #define STARPU_BLOCK_GET_NX(interface)	(((struct starpu_block_interface *)(interface))->nx)
 #define STARPU_BLOCK_GET_NY(interface)	(((struct starpu_block_interface *)(interface))->ny)
 #define STARPU_BLOCK_GET_NZ(interface)	(((struct starpu_block_interface *)(interface))->nz)
@@ -240,6 +244,8 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 
 /* helper methods */
 #define STARPU_VECTOR_GET_PTR(interface)	(((struct starpu_vector_interface *)(interface))->ptr)
+#define STARPU_VECTOR_GET_DEV_HANDLE(interface)	(((struct starpu_vector_interface *)(interface))->dev_handle)
+#define STARPU_VECTOR_GET_OFFSET(interface)	(((struct starpu_vector_interface *)(interface))->offset)
 #define STARPU_VECTOR_GET_NX(interface)	(((struct starpu_vector_interface *)(interface))->nx)
 #define STARPU_VECTOR_GET_ELEMSIZE(interface)	(((struct starpu_vector_interface *)(interface))->elemsize)
 
@@ -248,6 +254,7 @@ struct starpu_variable_interface
 {
 	uintptr_t ptr;
 	size_t elemsize;
+	/* No dev_handle, since it can not be filtered, offset will always be zero */
 };
 
 void starpu_variable_data_register(starpu_data_handle_t *handle, uint32_t home_node,
@@ -368,8 +375,6 @@ struct starpu_multiformat_interface
 #ifdef STARPU_USE_OPENCL
 	void *opencl_ptr;
 #endif
-	uintptr_t dev_handle;
-	size_t offset;
 	uint32_t nx;
 	struct starpu_multiformat_data_interface_ops *ops;
 };

+ 8 - 7
src/datawizard/interfaces/block_interface.c

@@ -293,7 +293,7 @@ size_t starpu_block_get_elemsize(starpu_data_handle_t handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	unsigned fail = 0;
 	ssize_t allocated_memory;
 
@@ -312,7 +312,7 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 	switch(kind)
 	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
+			handle = addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
 			if (!addr)
 				fail = 1;
 
@@ -330,6 +330,7 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 
 				fail = 1;
 			}
+			handle = addr;
 
 			break;
 #endif
@@ -337,9 +338,9 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
+				cl_mem mem;
+                                ret = _starpu_opencl_allocate_memory(&mem, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
 				if (ret)
 				{
 					fail = 1;
@@ -358,7 +359,7 @@ static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst
 
 		/* update the data properly in consequence */
 		dst_block->ptr = addr;
-                dst_block->dev_handle = addr;
+		dst_block->dev_handle = handle;
                 dst_block->offset = 0;
 		dst_block->ldy = nx;
 		dst_block->ldz = nx*ny;
@@ -396,7 +397,7 @@ static void free_block_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)block_interface->ptr);
+			clReleaseMemObject((void *)block_interface->dev_handle);
                         break;
 #endif
 		default:

+ 7 - 5
src/datawizard/interfaces/matrix_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -46,9 +46,10 @@ void starpu_block_filter_func(void *father_interface, void *child_interface, STA
 	matrix_child->elemsize = elemsize;
 
 	/* is the information on this node valid ? */
-	if (matrix_father->ptr)
+	if (matrix_father->dev_handle)
 	{
-		matrix_child->ptr = matrix_father->ptr + offset;
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
 		matrix_child->ld = matrix_father->ld;
 		matrix_child->dev_handle = matrix_father->dev_handle;
 		matrix_child->offset = matrix_father->offset + offset;
@@ -75,10 +76,11 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 	matrix_child->elemsize = elemsize;
 
 	/* is the information on this node valid ? */
-	if (matrix_father->ptr)
+	if (matrix_father->dev_handle)
 	{
 		size_t offset = (size_t)id*chunk_size*matrix_father->ld*elemsize;
-		matrix_child->ptr = matrix_father->ptr + offset;
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
 		matrix_child->ld = matrix_father->ld;
 		matrix_child->dev_handle = matrix_father->dev_handle;
 		matrix_child->offset = matrix_father->offset + offset;

+ 8 - 7
src/datawizard/interfaces/matrix_interface.c

@@ -272,7 +272,7 @@ size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle)
 /* returns the size of the allocated area */
 static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	unsigned fail = 0;
 	ssize_t allocated_memory;
 
@@ -292,7 +292,7 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 	switch(kind)
 	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc((size_t)nx*ny*elemsize);
+			handle = addr = (uintptr_t)malloc((size_t)nx*ny*elemsize);
 			if (!addr)
 				fail = 1;
 
@@ -307,6 +307,7 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 
 				fail = 1;
 			}
+			handle = addr;
 
 			ld = nx;
 
@@ -316,9 +317,9 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
+				cl_mem mem;
+                                ret = _starpu_opencl_allocate_memory(&mem, nx*ny*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
 				if (ret)
 				{
 					fail = 1;
@@ -337,7 +338,7 @@ static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t ds
 
 		/* update the data properly in consequence */
 		matrix_interface->ptr = addr;
-                matrix_interface->dev_handle = addr;
+		matrix_interface->dev_handle = handle;
                 matrix_interface->offset = 0;
 		matrix_interface->ld = ld;
 	}
@@ -374,7 +375,7 @@ static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)matrix_interface->ptr);
+			clReleaseMemObject((void *)matrix_interface->dev_handle);
                         break;
 #endif
 		default:

+ 2 - 12
src/datawizard/interfaces/multiformat_interface.c

@@ -141,8 +141,6 @@ static void register_multiformat_handle(starpu_data_handle_t handle, uint32_t ho
 #ifdef STARPU_USE_OPENCL
 			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
 #endif
-			local_interface->dev_handle = multiformat_interface->dev_handle;
-			local_interface->offset     = multiformat_interface->offset;
 		}
 		else
 		{
@@ -153,8 +151,6 @@ static void register_multiformat_handle(starpu_data_handle_t handle, uint32_t ho
 #ifdef STARPU_USE_OPENCL
 			local_interface->opencl_ptr = NULL;
 #endif
-			local_interface->dev_handle = 0;
-			local_interface->offset     = 0;
 		}
 		local_interface->nx = multiformat_interface->nx;
 		local_interface->ops = multiformat_interface->ops;
@@ -186,8 +182,6 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
 		.opencl_ptr = NULL,
 #endif
 		.nx         = nobjects,
-		.dev_handle = (uintptr_t) ptr,
-		.offset     = 0,
 		.ops        = format_ops
 	};
 	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
@@ -298,7 +292,6 @@ static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32
 			else
 			{
 				multiformat_interface->cpu_ptr = (void *) addr;
-				multiformat_interface->dev_handle = addr;
 			}
 
 #ifdef STARPU_USE_CUDA
@@ -322,7 +315,6 @@ static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32
 				else
 				{
 					multiformat_interface->cuda_ptr = (void *)addr;
-					multiformat_interface->dev_handle = addr;
 				}
 
 				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
@@ -347,7 +339,6 @@ static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32
 				else
 				{
 					multiformat_interface->opencl_ptr = (void *)addr;
-					multiformat_interface->dev_handle = addr;
 
 				}
 
@@ -365,7 +356,6 @@ static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32
 	if (fail)
 		return -ENOMEM;
 
-	multiformat_interface->offset = 0;
 	return allocated_memory;
 }
 
@@ -639,7 +629,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 							   (cl_mem) dst_multiformat->cpu_ptr,
 							   dst_node,
 							   size,
-							   dst_multiformat->offset,
+							   0,
 							   (cl_event *) _event,
 							   &ret);
         if (STARPU_UNLIKELY(err))
@@ -677,7 +667,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 							   dst_multiformat->opencl_ptr,
 							   dst_node,
 							   size,
-                                                           src_multiformat->offset,
+                                                           0,
 							   (cl_event *)_event,
 							   &ret);
         if (STARPU_UNLIKELY(err))

+ 13 - 9
src/datawizard/interfaces/vector_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -39,9 +39,10 @@ void starpu_block_filter_func_vector(void *father_interface, void *child_interfa
 	vector_child->nx = child_nx;
 	vector_child->elemsize = elemsize;
 
-	if (vector_father->ptr)
+	if (vector_father->dev_handle)
 	{
-		vector_child->ptr = vector_father->ptr + offset;
+		if (vector_father->ptr)
+			vector_child->ptr = vector_father->ptr + offset;
 		vector_child->dev_handle = vector_father->dev_handle;
 		vector_child->offset = vector_father->offset + offset;
 	}
@@ -69,9 +70,10 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 		vector_child->nx = length_first;
 		vector_child->elemsize = elemsize;
 
-		if (vector_father->ptr)
+		if (vector_father->dev_handle)
 		{
-			vector_child->ptr = vector_father->ptr;
+			if (vector_father->ptr)
+				vector_child->ptr = vector_father->ptr;
 			vector_child->offset = vector_father->offset;
 			vector_child->dev_handle = vector_father->dev_handle;
 		}
@@ -81,9 +83,10 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 		vector_child->nx = nx - length_first;
 		vector_child->elemsize = elemsize;
 
-		if (vector_father->ptr)
+		if (vector_father->dev_handle)
 		{
-			vector_child->ptr = vector_father->ptr + length_first*elemsize;
+			if (vector_father->ptr)
+				vector_child->ptr = vector_father->ptr + length_first*elemsize;
 			vector_child->offset = vector_father->offset + length_first*elemsize;
 			vector_child->dev_handle = vector_father->dev_handle;
 		}
@@ -107,14 +110,15 @@ void starpu_vector_list_filter_func(void *father_interface, void *child_interfac
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
 
-	if (vector_father->ptr)
+	if (vector_father->dev_handle)
 	{
 		/* compute the current position */
 		unsigned i;
 		for (i = 0; i < id; i++)
 			current_pos += length_tab[i];
 
-		vector_child->ptr = vector_father->ptr + current_pos*elemsize;
+		if (vector_father->ptr)
+			vector_child->ptr = vector_father->ptr + current_pos*elemsize;
 		vector_child->offset = vector_father->offset + current_pos*elemsize;
 		vector_child->dev_handle = vector_father->dev_handle;
 	}

+ 8 - 7
src/datawizard/interfaces/vector_interface.c

@@ -236,7 +236,7 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *) data_interface_;
 
 	unsigned fail = 0;
-	uintptr_t addr = 0;
+	uintptr_t addr = 0, handle = 0;
 	ssize_t allocated_memory;
 
 	uint32_t nx = vector_interface->nx;
@@ -251,7 +251,7 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 	switch(kind)
 	{
 		case STARPU_CPU_RAM:
-			addr = (uintptr_t)malloc(nx*elemsize);
+			addr = handle = (uintptr_t)malloc(nx*elemsize);
 			if (!addr)
 				fail = 1;
 			break;
@@ -265,15 +265,16 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 
 				fail = 1;
 			}
+			handle = addr;
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
 	        case STARPU_OPENCL_RAM:
 			{
                                 int ret;
-                                void *ptr;
-                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
-                                addr = (uintptr_t)ptr;
+				cl_mem mem;
+                                ret = _starpu_opencl_allocate_memory(&mem, nx*elemsize, CL_MEM_READ_WRITE);
+				handle = (uintptr_t)mem;
 				if (ret)
 				{
 					fail = 1;
@@ -293,7 +294,7 @@ static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t ds
 
 	/* update the data properly in consequence */
 	vector_interface->ptr = addr;
-        vector_interface->dev_handle = addr;
+	vector_interface->dev_handle = handle;
         vector_interface->offset = 0;
 
 	return allocated_memory;
@@ -321,7 +322,7 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void *)vector_interface->ptr);
+			clReleaseMemObject((cl_mem)vector_interface->dev_handle);
                         break;
 #endif
 		default:

+ 4 - 4
src/drivers/opencl/driver_opencl.c

@@ -175,17 +175,17 @@ cl_int _starpu_opencl_deinit_context(int devid)
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
+cl_int _starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flags)
 {
 	cl_int err;
-        cl_mem address;
+        cl_mem memory;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
-	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	memory = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
 	if (err == CL_OUT_OF_HOST_MEMORY) return err;
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-        *addr = address;
+        *mem = memory;
         return CL_SUCCESS;
 }
 

+ 1 - 1
src/drivers/opencl/driver_opencl.h

@@ -40,7 +40,7 @@ extern
 unsigned _starpu_opencl_get_device_count(void);
 
 extern
-cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
+cl_int _starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
 
 extern
 cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event);

+ 12 - 0
tests/Makefile.am

@@ -175,6 +175,7 @@ noinst_PROGRAMS =				\
 	datawizard/interfaces/multiformat/advanced/same_handle \
 	datawizard/interfaces/variable/variable_interface    \
 	datawizard/interfaces/vector/test_vector_interface   \
+	datawizard/in_place_partition   \
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/invalid_tasks		\
@@ -265,6 +266,17 @@ datawizard_sync_and_notify_data_implicit_SOURCES +=	\
 	datawizard/sync_and_notify_data_opencl.c
 endif
 
+datawizard_in_place_partition_SOURCES =	\
+	datawizard/in_place_partition.c
+if STARPU_USE_CUDA
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
 if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c

+ 182 - 0
tests/datawizard/in_place_partition.c

@@ -0,0 +1,182 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../helper.h"
+
+void scal_func_cpu(void *buffers[], void *cl_arg)
+{
+	unsigned i;
+
+	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
+	unsigned *val = (unsigned *) STARPU_VECTOR_GET_PTR(vector);
+	unsigned n = STARPU_VECTOR_GET_NX(vector);
+
+	/* scale the vector */
+	for (i = 0; i < n; i++)
+		val[i] *= 2;
+}
+
+#ifdef STARPU_USE_CUDA
+extern void scal_func_cuda(void *buffers[], void *cl_arg);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static struct starpu_opencl_program opencl_program;
+
+void scal_func_opencl(void *buffers[], void *_args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
+	err |= clSetKernelArg(kernel, 1, sizeof(offset), &offset);
+	err |= clSetKernelArg(kernel, 2, sizeof(n), &n);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global) local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
+static struct starpu_codelet codelet =
+{
+        .where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		| STARPU_CUDA
+#endif
+#ifdef STARPU_USE_OPENCL
+		| STARPU_OPENCL
+#endif
+		,
+	.cpu_funcs = { scal_func_cpu, NULL },
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = { scal_func_opencl, NULL },
+#endif
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { scal_func_cuda, NULL },
+#endif
+	.modes = { STARPU_RW },
+        .model = NULL,
+        .nbuffers = 1
+};
+
+
+int main(int argc, char **argv)
+{
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int ret;
+	int n, i, size;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+#endif
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	foo = calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	starpu_vector_data_register(&handle, 0, (uintptr_t)foo, size, sizeof(*foo));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = n > 1 ? n : 2,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < n; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			fprintf(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			return EXIT_FAILURE;
+		}
+	}
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 1 - 1
tests/datawizard/interfaces/block/block_opencl.c

@@ -39,7 +39,7 @@ test_block_opencl_func(void *buffers[], void *args)
 	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
         unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
-	cl_mem block = (cl_mem) STARPU_BLOCK_GET_PTR(buffers[0]);
+	cl_mem block = (cl_mem) STARPU_BLOCK_GET_DEV_HANDLE(buffers[0]);
 
 	cl_context context;
 	id = starpu_worker_get_id();

+ 1 - 1
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -40,7 +40,7 @@ void test_matrix_opencl_func(void *buffers[], void *args)
 	factor = *(int *)args;
 	n = STARPU_MATRIX_GET_NX(buffers[0]);
 	n*= STARPU_MATRIX_GET_NY(buffers[0]);
-	val = (cl_mem)STARPU_MATRIX_GET_PTR(buffers[0]);
+	val = (cl_mem)STARPU_MATRIX_GET_DEV_HANDLE(buffers[0]);
 
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);

+ 1 - 1
tests/datawizard/interfaces/vector/test_vector_opencl.c

@@ -35,7 +35,7 @@ test_vector_opencl_func(void *buffers[], void *args)
 	starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
 
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
 	cl_context context;
 	id = starpu_worker_get_id();

+ 39 - 0
tests/datawizard/scal_cuda.cu

@@ -0,0 +1,39 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+static __global__ void vector_mult_cuda(unsigned *val, unsigned n)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i < n)
+               val[i] *= 2;
+}
+
+extern "C" void scal_func_cuda(void *buffers[], void *_args)
+{
+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+
+        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 26 - 0
tests/datawizard/scal_opencl.cl

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Université Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void vector_mult_opencl(__global float* val, unsigned offset, unsigned nx)
+{
+        const int i = get_global_id(0);
+	val = (__global void*) val + offset;
+        if (i < nx)
+	{
+                val[i] *= 2;
+        }
+}

+ 2 - 2
tests/datawizard/sync_and_notify_data_opencl.c

@@ -22,7 +22,7 @@ extern struct starpu_opencl_program opencl_code;
 
 void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_event event;
@@ -52,7 +52,7 @@ void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
 
 void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_event event;

+ 2 - 2
tests/datawizard/write_only_tmp_buffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,7 +31,7 @@ starpu_data_handle_t v_handle;
 
 static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	cl_mem buf = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem buf = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
         char ptr = 42;
         cl_command_queue queue;
         int id = starpu_worker_get_id();