13 年之前 · fdd27aab48
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -50,7 +50,9 @@ EXTRA_DIST = 					\
 
				 	basic_examples/block_opencl_kernel.cl			\
			
 
				 	openmp/vector_scal.c			\
			
 
				 	filters/fblock_opencl_kernel.cl		\
			
 
				-	filters/multiformat/opencl.cl
			
 
				+	filters/multiformat/opencl.cl           \
			
 
				+	filters/custom_mf/conversion_opencl.cl  \
			
 
				+	filters/custom_mf/custom_opencl.cl
			
 
				 
			
 
				 CLEANFILES = 					\
			
 
				 	gordon/null_kernel_gordon.spuelf
			
@@ -409,6 +411,15 @@ filters_custom_mf_custom_mf_filter_SOURCES+=\
 
				 	filters/custom_mf/cuda.cu
			
 
				 endif
			
 
				 
			
 
				+if STARPU_USE_OPENCL
			
 
				+filters_custom_mf_custom_mf_filter_SOURCES+=\
			
 
				+	filters/custom_mf/conversion_opencl.c \
			
 
				+	filters/custom_mf/custom_opencl.c
			
 
				+nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				+	filters/custom_mf/conversion_opencl.cl \
			
 
				+	filters/custom_mf/custom_opencl.cl
			
 
				+endif
			
 
				+
			
 
				 filters_multiformat_multiformat_filter_SOURCES=                \
			
 
				 	filters/multiformat/multiformat_filter.c               \
			
 
				 	filters/multiformat/multiformat_ops.c                  \
			
--- a/examples/filters/custom_mf/conversion_opencl.c
+++ b/examples/filters/custom_mf/conversion_opencl.c
@@ -0,0 +1,102 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 INRIA
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include "custom_types.h"
			
 
				+#include "custom_interface.h"
			
 
				+
			
 
				+extern struct starpu_opencl_program opencl_conversion_program;
			
 
				+
			
 
				+void cpu_to_opencl_opencl_func(void *buffers[], void *args)
			
 
				+{
			
 
				+	(void) args;
			
 
				+	int id, devid;
			
 
				+        cl_int err;
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	cl_event event;
			
 
				+
			
 
				+	unsigned n = CUSTOM_GET_NX(buffers[0]);
			
 
				+	n*=2;
			
 
				+	struct point *aop;
			
 
				+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
			
 
				+
			
 
				+	id = starpu_worker_get_id();
			
 
				+	devid = starpu_worker_get_devid(id);
			
 
				+
			
 
				+	err = starpu_opencl_load_kernel(&kernel,
			
 
				+					&queue,
			
 
				+					&opencl_conversion_program,
			
 
				+					"custom_opencl_conversion",
			
 
				+					devid);
			
 
				+	if (err != CL_SUCCESS)
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+
			
 
				+	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
			
 
				+	if (starpu_opencl_set_kernel_args(&err, &kernel,
			
 
				+					  sizeof(aop), &aop,
			
 
				+					  sizeof(x), &x,
			
 
				+					  sizeof(n), &n,
			
 
				+					  0) != 3)
			
 
				+	{
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+		assert(0);
			
 
				+	}
			
 
				+	
			
 
				+
			
 
				+	{
			
 
				+		size_t global=n;
			
 
				+		size_t local;
			
 
				+                size_t s;
			
 
				+                cl_device_id device;
			
 
				+
			
 
				+                starpu_opencl_get_device(devid, &device);
			
 
				+
			
 
				+                err = clGetKernelWorkGroupInfo (kernel,
			
 
				+						device,
			
 
				+						CL_KERNEL_WORK_GROUP_SIZE,
			
 
				+						sizeof(local),
			
 
				+						&local,
			
 
				+						&s);
			
 
				+                if (err != CL_SUCCESS)
			
 
				+			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+                if (local > global)
			
 
				+			local = global;
			
 
				+
			
 
				+		err = clEnqueueNDRangeKernel(
			
 
				+				queue,
			
 
				+				kernel,
			
 
				+				1,       /* work_dim */
			
 
				+				NULL,    /* global_work_offset */
			
 
				+				&global, /* global_work_size */
			
 
				+				&local,  /* local_work_size */
			
 
				+				0,       /* num_events_in_wait_list */
			
 
				+				NULL,    /* event_wait_list */
			
 
				+				&event);
			
 
				+
			
 
				+		if (err != CL_SUCCESS)
			
 
				+			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+
			
 
				+	clFinish(queue);
			
 
				+	starpu_opencl_collect_stats(event);
			
 
				+	clReleaseEvent(event);
			
 
				+
			
 
				+	starpu_opencl_release_kernel(kernel);
			
 
				+}
			
--- a/examples/filters/custom_mf/conversion_opencl.cl
+++ b/examples/filters/custom_mf/conversion_opencl.cl
@@ -0,0 +1,32 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 INRIA
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "custom_types.h"
			
 
				+
			
 
				+/*
			
 
				+ * The first n/2 values of x are actual xs. The last N/2 values are ys.
			
 
				+ */
			
 
				+__kernel void custom_opencl_conversion(__global struct point *aop,
			
 
				+				       __global float *x,
			
 
				+				       int nx)
			
 
				+{
			
 
				+        const int i = get_global_id(0);
			
 
				+	if (i < nx/2)
			
 
				+		x[i] = aop[i].x;
			
 
				+	else if (i < nx)
			
 
				+		x[i] = aop[i-nx/2].y;
			
 
				+
			
 
				+}
			
--- a/examples/filters/custom_mf/custom_conversion_codelets.c
+++ b/examples/filters/custom_mf/custom_conversion_codelets.c
@@ -55,3 +55,41 @@ struct starpu_codelet cuda_to_cpu_cl =
 
				 	.name = "codelet_cuda_to_cpu"
			
 
				 };
			
 
				 #endif
			
 
				+
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
			
 
				+{
			
 
				+	unsigned int n = CUSTOM_GET_NX(buffers[0]);
			
 
				+	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
			
 
				+	struct point *aop;
			
 
				+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
			
 
				+
			
 
				+	int i;
			
 
				+	for (i = 0; i < n; i++)
			
 
				+	{
			
 
				+		aop[i].x = x[i];
			
 
				+		aop[i].y = x[i+n];
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+extern void cpu_to_opencl_opencl_func(void *buffers[], void *arg);
			
 
				+
			
 
				+struct starpu_codelet cpu_to_opencl_cl =
			
 
				+{
			
 
				+	.where = STARPU_OPENCL,
			
 
				+	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
			
 
				+	.modes = { STARPU_RW },
			
 
				+	.nbuffers = 1,
			
 
				+	.name = "codelet_cpu_to_opencl"
			
 
				+};
			
 
				+
			
 
				+struct starpu_codelet opencl_to_cpu_cl =
			
 
				+{
			
 
				+	.where = STARPU_CPU,
			
 
				+	.cpu_funcs = { opencl_to_cpu_cpu_func, NULL },
			
 
				+	.modes = { STARPU_RW },
			
 
				+	.nbuffers = 1,
			
 
				+	.name = "codelet_opencl_to_cpu"
			
 
				+};
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_hash.h>
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#endif
			
 
				 #include "custom_interface.h"
			
 
				 #include "custom_types.h"
			
 
				 
			
@@ -36,7 +39,22 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node,
 
				 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
			
 
				 				   void *dst_interface, unsigned dst_node,
			
 
				 				   cudaStream_t stream);
			
 
				-#endif
			
 
				+#endif /* !STARPU_USE_CUDA */
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
			
 
				+			      void *dst_interface, unsigned dst_node);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
			
 
				+			      void *dst_interface, unsigned dst_node);
			
 
				+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
			
 
				+				 void *dst_interface, unsigned dst_node);
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				+				    void *dst_interface, unsigned dst_node,
			
 
				+				    void *event);
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
			
 
				+				    void *dst_interface, unsigned dst_node,
			
 
				+				    void *event);
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				 static const struct starpu_data_copy_methods custom_copy_data_methods_s =
			
 
				 {
			
@@ -51,11 +69,11 @@ static const struct starpu_data_copy_methods custom_copy_data_methods_s =
 
				 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl       = NULL,
			
 
				-	.opencl_to_ram       = NULL,
			
 
				-	.opencl_to_opencl    = NULL,
			
 
				-        .ram_to_opencl_async = NULL,
			
 
				-	.opencl_to_ram_async = NULL,
			
 
				+	.ram_to_opencl       = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram       = copy_opencl_to_ram,
			
 
				+	.opencl_to_opencl    = copy_opencl_to_opencl,
			
 
				+        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				+	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				 #endif
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram  = NULL,
			
@@ -126,6 +144,9 @@ register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *da
 
				 #ifdef STARPU_USE_CUDA
			
 
				 			local_interface->cuda_ptr   = custom_interface->cuda_ptr;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+			local_interface->opencl_ptr = custom_interface->opencl_ptr;
			
 
				+#endif
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -133,6 +154,9 @@ register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *da
 
				 #ifdef STARPU_USE_CUDA
			
 
				 			local_interface->cuda_ptr   = NULL;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+			local_interface->opencl_ptr = NULL;
			
 
				+#endif
			
 
				 		}
			
 
				 		local_interface->nx = custom_interface->nx;
			
 
				 		local_interface->ops = custom_interface->ops;
			
@@ -160,7 +184,19 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t nod
 
				 			custom_interface->cpu_ptr = NULL;
			
 
				 			return -ENOMEM;
			
 
				 		}
			
 
				-#endif
			
 
				+#endif /* !STARPU_USE_CUDA */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		custom_interface->opencl_ptr = malloc(size);
			
 
				+		if (custom_interface->cuda_ptr == NULL)
			
 
				+		{
			
 
				+			free(custom_interface->cpu_ptr);
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+			free(custom_interface->cuda_ptr);
			
 
				+#endif /* !STARPU_USE_CUDA */
			
 
				+			return -ENOMEM;
			
 
				+		}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				+			
			
 
				 		break;
			
 
				 #if STARPU_USE_CUDA
			
 
				 	case STARPU_CUDA_RAM:
			
@@ -180,6 +216,31 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t nod
 
				 		break;
			
 
				 	}
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	case STARPU_OPENCL_RAM:
			
 
				+	{
			
 
				+		/* XXX : StarPU shoulf probably provide starpu_opencl_allocate_memory(). */
			
 
				+		cl_context context;
			
 
				+		cl_command_queue queue;
			
 
				+		int id = starpu_worker_get_id();
			
 
				+		int devid = starpu_worker_get_devid(id);
			
 
				+		starpu_opencl_get_queue(devid, &queue);
			
 
				+		starpu_opencl_get_context(devid, &context);
			
 
				+
			
 
				+		cl_int err;
			
 
				+		cl_mem memory;
			
 
				+
			
 
				+		/* */
			
 
				+		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
			
 
				+		memory = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
			
 
				+        	if (err != CL_SUCCESS)
			
 
				+			return -ENOMEM; // There might be other errors.
			
 
				+
			
 
				+		custom_interface->opencl_ptr = memory;
			
 
				+
			
 
				+		break;
			
 
				+	}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 	default:
			
 
				 		assert(0);
			
 
				 	}
			
@@ -208,6 +269,13 @@ static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
 
				 			custom_interface->cuda_ptr = NULL;
			
 
				 		}
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		if (custom_interface->opencl_ptr != NULL)
			
 
				+		{
			
 
				+			free(custom_interface->opencl_ptr);
			
 
				+			custom_interface->opencl_ptr = NULL;
			
 
				+		}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 		break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	case STARPU_CUDA_RAM:
			
@@ -247,6 +315,10 @@ custom_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 
				 		case STARPU_CUDA_RAM:
			
 
				 			return data_interface->cuda_ptr;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			return data_interface->opencl_ptr;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -303,6 +375,9 @@ void custom_data_register(starpu_data_handle_t *handle,
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		.cuda_ptr = NULL,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		.opencl_ptr = NULL,
			
 
				+#endif
			
 
				 		.nx  = nx,
			
 
				 		.ops = format_ops
			
 
				 	};
			
@@ -412,3 +487,173 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
 
				 	assert(0);
			
 
				 }
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
			
 
				+			      void *dst_interface, unsigned dst_node)
			
 
				+{
			
 
				+	(void) src_interface;
			
 
				+	(void) src_node;
			
 
				+	(void) dst_interface;
			
 
				+	(void) dst_node;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
			
 
				+			      void *dst_interface, unsigned dst_node)
			
 
				+{
			
 
				+	(void) src_interface;
			
 
				+	(void) src_node;
			
 
				+	(void) dst_interface;
			
 
				+	(void) dst_node;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
			
 
				+				 void *dst_interface, unsigned dst_node)
			
 
				+{
			
 
				+	(void) src_interface;
			
 
				+	(void) src_node;
			
 
				+	(void) dst_interface;
			
 
				+	(void) dst_node;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/* StarPU will give us these in a near future */
			
 
				+static cl_int
			
 
				+_opencl_malloc(cl_context context, cl_mem *mem, size_t size, cl_mem_flags flags)
			
 
				+{
			
 
				+	cl_int err;
			
 
				+        cl_mem memory;
			
 
				+
			
 
				+	memory = clCreateBuffer(context, flags, size, NULL, &err);
			
 
				+	if (err != CL_SUCCESS)
			
 
				+		return err;
			
 
				+
			
 
				+        *mem = memory;
			
 
				+        return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static cl_int
			
 
				+_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node,
			
 
				+				      cl_mem buffer, unsigned dst_node,
			
 
				+				      size_t size, size_t offset,
			
 
				+				      cl_event *event, int *ret,
			
 
				+				      cl_command_queue queue)
			
 
				+{
			
 
				+        cl_int err;
			
 
				+        cl_bool blocking;
			
 
				+
			
 
				+        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				+
			
 
				+        err = clEnqueueWriteBuffer(queue, buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+
			
 
				+        if (err == CL_SUCCESS)
			
 
				+                *ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				+
			
 
				+	return err;
			
 
				+}
			
 
				+
			
 
				+static cl_int
			
 
				+_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node,
			
 
				+			   void *ptr, unsigned dst_node,
			
 
				+			   size_t size, size_t offset, cl_event *event,
			
 
				+			   cl_command_queue queue)
			
 
				+
			
 
				+{
			
 
				+        cl_int err;
			
 
				+        cl_bool blocking;
			
 
				+
			
 
				+        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				+        err = clEnqueueReadBuffer(queue, buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+
			
 
				+        return err;
			
 
				+}
			
 
				+
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				+				    void *dst_interface, unsigned dst_node,
			
 
				+				    void *event)
			
 
				+{
			
 
				+	ssize_t size;
			
 
				+	struct custom_data_interface *src_custom, *dst_custom;
			
 
				+
			
 
				+	src_custom = (struct custom_data_interface *) src_interface;
			
 
				+	dst_custom = (struct custom_data_interface *) dst_interface;
			
 
				+
			
 
				+	/*
			
 
				+	 * Opencl stuff.
			
 
				+	 */
			
 
				+	cl_context context;
			
 
				+	cl_command_queue queue;
			
 
				+	int id = starpu_worker_get_id();
			
 
				+	int devid = starpu_worker_get_devid(id);
			
 
				+	starpu_opencl_get_queue(devid, &queue);
			
 
				+	starpu_opencl_get_context(devid, &context);
			
 
				+
			
 
				+	/* Real stuff */
			
 
				+	int err;
			
 
				+	cl_int ret;
			
 
				+
			
 
				+	size = src_custom->nx * 2 * sizeof(float);
			
 
				+	if (dst_custom->cpu_ptr == NULL)
			
 
				+	{
			
 
				+		ret = _opencl_malloc(context, (cl_mem*)&dst_custom->cpu_ptr, 
			
 
				+				size, CL_MEM_READ_WRITE);
			
 
				+		assert(ret == CL_SUCCESS);
			
 
				+	}
			
 
				+	err = _opencl_copy_ram_to_opencl_async_sync(src_custom->cpu_ptr,
			
 
				+						src_node,
			
 
				+						dst_custom->cpu_ptr,
			
 
				+						dst_node,
			
 
				+						size,
			
 
				+						0,
			
 
				+						NULL,
			
 
				+						&ret,
			
 
				+						queue);
			
 
				+	assert(err == 0);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
			
 
				+				    void *dst_interface, unsigned dst_node,
			
 
				+				    void *event)
			
 
				+{
			
 
				+	ssize_t size;
			
 
				+	struct custom_data_interface *src_custom, *dst_custom;
			
 
				+
			
 
				+	src_custom = (struct custom_data_interface *) src_interface;
			
 
				+	dst_custom = (struct custom_data_interface *) dst_interface;
			
 
				+
			
 
				+	/*
			
 
				+	 * Opencl stuff.
			
 
				+	 */
			
 
				+	cl_context context;
			
 
				+	cl_command_queue queue;
			
 
				+	int id = starpu_worker_get_id();
			
 
				+	int devid = starpu_worker_get_devid(id);
			
 
				+	starpu_opencl_get_queue(devid, &queue);
			
 
				+	starpu_opencl_get_context(devid, &context);
			
 
				+
			
 
				+	/* real stuff */
			
 
				+	int err;
			
 
				+	cl_int ret;
			
 
				+	size = src_custom->nx * 2 * sizeof(float);
			
 
				+	if (!dst_custom->opencl_ptr)
			
 
				+	{
			
 
				+		dst_custom->opencl_ptr = malloc(size);
			
 
				+		assert(dst_custom->opencl_ptr != NULL);
			
 
				+	}
			
 
				+
			
 
				+	err = _opencl_copy_opencl_to_ram(
			
 
				+			src_custom->opencl_ptr,
			
 
				+			src_node,
			
 
				+			dst_custom->opencl_ptr,
			
 
				+			dst_node,
			
 
				+			size,
			
 
				+			0,
			
 
				+			NULL,
			
 
				+			queue);
			
 
				+	assert(err == 0);
			
 
				+	return 0;
			
 
				+}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
--- a/examples/filters/custom_mf/custom_interface.h
+++ b/examples/filters/custom_mf/custom_interface.h
@@ -20,6 +20,7 @@ struct custom_data_interface
 
				 {
			
 
				 	void *cpu_ptr;
			
 
				 	void *cuda_ptr;
			
 
				+	void *opencl_ptr;
			
 
				 	struct starpu_multiformat_data_interface_ops *ops;
			
 
				 	uint32_t nx;
			
 
				 };
			
@@ -40,4 +41,8 @@ void custom_data_register(starpu_data_handle_t *handle,
 
				 	CUSTOM_GET_NX((interface))
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#define CUSTOM_GET_OPENCL_X_PTR(interface) (((struct custom_data_interface *)(interface))->opencl_ptr)
			
 
				+#endif
			
 
				+
			
 
				 #endif /* ! __CUSTOM_INTERFACE_H__ */
			
--- a/examples/filters/custom_mf/custom_mf_filter.c
+++ b/examples/filters/custom_mf/custom_mf_filter.c
@@ -16,20 +16,36 @@
 
				 #include <starpu.h>
			
 
				 #include "custom_interface.h"
			
 
				 #include "custom_types.h"
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				-#define N 20
			
 
				+#define N 12
			
 
				+
			
 
				+#define DEBUG 1
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+static unsigned int ncuda;
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static unsigned int nopencl;
			
 
				+#endif
			
 
				 
			
 
				-#define DEBUG 0
			
 
				 
			
 
				 static struct point array_of_structs[N];
			
 
				 static starpu_data_handle_t handle;
			
 
				-static unsigned int nchunks = 4;
			
 
				+static unsigned int nchunks = 6;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 extern struct starpu_codelet cpu_to_cuda_cl;
			
 
				 extern struct starpu_codelet cuda_to_cpu_cl;
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+extern struct starpu_codelet cpu_to_opencl_cl;
			
 
				+extern struct starpu_codelet opencl_to_cpu_cl;
			
 
				+#endif
			
 
				+
			
 
				 static struct starpu_multiformat_data_interface_ops format_ops =
			
 
				 {
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -37,6 +53,11 @@ static struct starpu_multiformat_data_interface_ops format_ops =
 
				 	.cpu_to_cuda_cl = &cpu_to_cuda_cl,
			
 
				 	.cuda_to_cpu_cl = &cuda_to_cpu_cl,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_elemsize  = sizeof(struct struct_of_arrays),
			
 
				+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
			
 
				+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
			
 
				+#endif
			
 
				 	.cpu_elemsize = sizeof(struct point),
			
 
				 };
			
 
				 
			
@@ -68,6 +89,16 @@ custom_filter(void *father, void *child, struct starpu_data_filter *f,
 
				 		soa_child->y = soa_father->y + chunk_size;
			
 
				 	}
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	else if (custom_father->opencl_ptr)
			
 
				+	{
			
 
				+		struct struct_of_arrays *soa_father, *soa_child;
			
 
				+		soa_father = (struct struct_of_arrays*) custom_father->opencl_ptr;
			
 
				+		soa_child = (struct struct_of_arrays*) custom_child->opencl_ptr;
			
 
				+		soa_child->x = soa_father->x + chunk_size;
			
 
				+		soa_child->y = soa_father->y + chunk_size;
			
 
				+	}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				 	custom_child->ops = custom_father->ops;
			
 
				 	custom_child->nx = chunk_size;
			
@@ -138,6 +169,19 @@ static struct starpu_codelet cuda_cl =
 
				 };
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+extern void custom_scal_opencl_func(void *buffers[], void *args);
			
 
				+
			
 
				+static struct starpu_codelet opencl_cl =
			
 
				+{
			
 
				+	.where = STARPU_OPENCL,
			
 
				+	.opencl_funcs = { custom_scal_opencl_func, NULL },
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = { STARPU_RW },
			
 
				+	.name = "opencl_codelet"
			
 
				+};
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				+
			
 
				 static int
			
 
				 create_and_submit_tasks(void)
			
 
				 {
			
@@ -146,17 +190,30 @@ create_and_submit_tasks(void)
 
				 	for (i = 0; i < nchunks; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				-		if (i %2 == 0)
			
 
				+		switch (i%3)
			
 
				 		{
			
 
				+		case 0:
			
 
				 			task->cl = &cpu_cl;
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				+			break;
			
 
				+		case 1:
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-			task->cl = &cuda_cl;
			
 
				-#else
			
 
				-			task->cl = &cpu_cl;
			
 
				-#endif /* !STARPU_USE_CUDA */
			
 
				+			if (ncuda > 0)
			
 
				+				task->cl = &cuda_cl;
			
 
				+			else
			
 
				+#endif
			
 
				+				task->cl = &cpu_cl;
			
 
				+			break;
			
 
				+		case 2:
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+			if (nopencl > 0)
			
 
				+				task->cl = &opencl_cl;
			
 
				+			else
			
 
				+#endif
			
 
				+				task->cl = &cpu_cl;
			
 
				+			break;
			
 
				+		default:
			
 
				+			/* We should never get here */
			
 
				+			assert(0);
			
 
				 		}
			
 
				 
			
 
				 		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
			
@@ -194,8 +251,7 @@ check_it(void)
 
				 	int i;
			
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				-		float expected_value = i + 1.0;
			
 
				-		expected_value *= array_of_structs[i].y;
			
 
				+		float expected_value = (i + 1.0)*42.0;
			
 
				 		if (array_of_structs[i].x != expected_value)
			
 
				 			return EXIT_FAILURE;
			
 
				 	}
			
@@ -203,6 +259,11 @@ check_it(void)
 
				 	return EXIT_SUCCESS;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+struct starpu_opencl_program opencl_program;
			
 
				+struct starpu_opencl_program opencl_conversion_program;
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				+
			
 
				 int
			
 
				 main(void)
			
 
				 {
			
@@ -215,6 +276,24 @@ main(void)
 
				 	if (err == -ENODEV)
			
 
				 		goto enodev;
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	ncuda = starpu_cuda_worker_get_count();
			
 
				+#endif /* !STARPU_USE_CUDA */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	nopencl = starpu_opencl_worker_get_count();
			
 
				+	if (nopencl > 0)
			
 
				+	{
			
 
				+		char *f1 = "examples/filters/custom_mf/custom_opencl.cl";
			
 
				+		char *f2 = "examples/filters/custom_mf/conversion_opencl.cl";
			
 
				+		err = starpu_opencl_load_opencl_from_file(f1, &opencl_program,
			
 
				+							  NULL);
			
 
				+		assert(err == 0);
			
 
				+		err = starpu_opencl_load_opencl_from_file(f2,
			
 
				+						&opencl_conversion_program,
			
 
				+						NULL);
			
 
				+		assert(err == 0);
			
 
				+	}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				 	register_and_partition_data();
			
 
				 #if DEBUG
			
@@ -231,7 +310,18 @@ main(void)
 
				 #if DEBUG
			
 
				 	print_it();
			
 
				 #endif
			
 
				+
			
 
				+#if STARPU_USE_OPENCL
			
 
				+	if (nopencl > 0)
			
 
				+	{
			
 
				+        	err = starpu_opencl_unload_opencl(&opencl_program);
			
 
				+		assert(err == 0);
			
 
				+		err = starpu_opencl_unload_opencl(&opencl_conversion_program);
			
 
				+		assert(err == 0);
			
 
				+	}
			
 
				+#endif /* !STARPU_USE_OPENCL */
			
 
				 	starpu_shutdown();		
			
 
				+	print_it();
			
 
				 	return check_it();
			
 
				 
			
 
				 
			
--- a/examples/filters/custom_mf/custom_opencl.c
+++ b/examples/filters/custom_mf/custom_opencl.c
@@ -0,0 +1,102 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include "custom_types.h"
			
 
				+#include "custom_interface.h"
			
 
				+
			
 
				+extern struct starpu_opencl_program opencl_program;
			
 
				+
			
 
				+void custom_scal_opencl_func(void *buffers[], void *args)
			
 
				+{
			
 
				+	(void) args;
			
 
				+	int id, devid;
			
 
				+        cl_int err;
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	cl_event event;
			
 
				+
			
 
				+	unsigned n = CUSTOM_GET_NX(buffers[0]);
			
 
				+	struct point *aop;
			
 
				+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
			
 
				+
			
 
				+	id = starpu_worker_get_id();
			
 
				+	devid = starpu_worker_get_devid(id);
			
 
				+
			
 
				+	err = starpu_opencl_load_kernel(&kernel,
			
 
				+					&queue,
			
 
				+					&opencl_program,
			
 
				+					"custom_scal_opencl",
			
 
				+					devid);
			
 
				+	if (err != CL_SUCCESS)
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+
			
 
				+	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
			
 
				+	int douze;
			
 
				+	if (starpu_opencl_set_kernel_args(&err, &kernel,
			
 
				+					  sizeof(aop), &aop,
			
 
				+					  sizeof(x), &x,
			
 
				+					  sizeof(n), &n,
			
 
				+					  0) != 3)
			
 
				+	{
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+		assert(0);
			
 
				+	}
			
 
				+	
			
 
				+
			
 
				+	{
			
 
				+		size_t global=n;
			
 
				+		size_t local;
			
 
				+                size_t s;
			
 
				+                cl_device_id device;
			
 
				+
			
 
				+                starpu_opencl_get_device(devid, &device);
			
 
				+
			
 
				+                err = clGetKernelWorkGroupInfo (kernel,
			
 
				+						device,
			
 
				+						CL_KERNEL_WORK_GROUP_SIZE,
			
 
				+						sizeof(local),
			
 
				+						&local,
			
 
				+						&s);
			
 
				+                if (err != CL_SUCCESS)
			
 
				+			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+                if (local > global)
			
 
				+			local = global;
			
 
				+
			
 
				+		err = clEnqueueNDRangeKernel(
			
 
				+				queue,
			
 
				+				kernel,
			
 
				+				1,       /* work_dim */
			
 
				+				NULL,    /* global_work_offset */
			
 
				+				&global, /* global_work_size */
			
 
				+				&local,  /* local_work_size */
			
 
				+				0,       /* num_events_in_wait_list */
			
 
				+				NULL,    /* event_wait_list */
			
 
				+				&event);
			
 
				+
			
 
				+		if (err != CL_SUCCESS)
			
 
				+			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+
			
 
				+	clFinish(queue);
			
 
				+	starpu_opencl_collect_stats(event);
			
 
				+	clReleaseEvent(event);
			
 
				+
			
 
				+	starpu_opencl_release_kernel(kernel);
			
 
				+}
			
--- a/examples/filters/custom_mf/custom_opencl.cl
+++ b/examples/filters/custom_mf/custom_opencl.cl
@@ -0,0 +1,26 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "custom_types.h"
			
 
				+
			
 
				+__kernel void custom_scal_opencl(__global struct point *aop,
			
 
				+				 __global float *x,
			
 
				+				 int nx)
			
 
				+{
			
 
				+        const int i = get_global_id(0);
			
 
				+	if (i < nx)
			
 
				+		x[i] *= x[i+nx];
			
 
				+}