瀏覽代碼

custom_mf : adding support for OpenCL.

Cyril Roelandt 13 年之前
父節點
當前提交
fdd27aab48

+ 12 - 1
examples/Makefile.am

@@ -50,7 +50,9 @@ EXTRA_DIST = 					\
 	basic_examples/block_opencl_kernel.cl			\
 	openmp/vector_scal.c			\
 	filters/fblock_opencl_kernel.cl		\
-	filters/multiformat/opencl.cl
+	filters/multiformat/opencl.cl           \
+	filters/custom_mf/conversion_opencl.cl  \
+	filters/custom_mf/custom_opencl.cl
 
 CLEANFILES = 					\
 	gordon/null_kernel_gordon.spuelf
@@ -409,6 +411,15 @@ filters_custom_mf_custom_mf_filter_SOURCES+=\
 	filters/custom_mf/cuda.cu
 endif
 
+if STARPU_USE_OPENCL
+filters_custom_mf_custom_mf_filter_SOURCES+=\
+	filters/custom_mf/conversion_opencl.c \
+	filters/custom_mf/custom_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	filters/custom_mf/conversion_opencl.cl \
+	filters/custom_mf/custom_opencl.cl
+endif
+
 filters_multiformat_multiformat_filter_SOURCES=                \
 	filters/multiformat/multiformat_filter.c               \
 	filters/multiformat/multiformat_ops.c                  \

+ 102 - 0
examples/filters/custom_mf/conversion_opencl.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "custom_types.h"
+#include "custom_interface.h"
+
+extern struct starpu_opencl_program opencl_conversion_program;
+
+void cpu_to_opencl_opencl_func(void *buffers[], void *args)
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = CUSTOM_GET_NX(buffers[0]);
+	n*=2;
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_conversion_program,
+					"custom_opencl_conversion",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
+	if (starpu_opencl_set_kernel_args(&err, &kernel,
+					  sizeof(aop), &aop,
+					  sizeof(x), &x,
+					  sizeof(n), &n,
+					  0) != 3)
+	{
+		STARPU_OPENCL_REPORT_ERROR(err);
+		assert(0);
+	}
+	
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(
+				queue,
+				kernel,
+				1,       /* work_dim */
+				NULL,    /* global_work_offset */
+				&global, /* global_work_size */
+				&local,  /* local_work_size */
+				0,       /* num_events_in_wait_list */
+				NULL,    /* event_wait_list */
+				&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 32 - 0
examples/filters/custom_mf/conversion_opencl.cl

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "custom_types.h"
+
+/*
+ * The first n/2 values of x are actual xs. The last N/2 values are ys.
+ */
+__kernel void custom_opencl_conversion(__global struct point *aop,
+				       __global float *x,
+				       int nx)
+{
+        const int i = get_global_id(0);
+	if (i < nx/2)
+		x[i] = aop[i].x;
+	else if (i < nx)
+		x[i] = aop[i-nx/2].y;
+
+}

+ 38 - 0
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -55,3 +55,41 @@ struct starpu_codelet cuda_to_cpu_cl =
 	.name = "codelet_cuda_to_cpu"
 };
 #endif
+
+
+#ifdef STARPU_USE_OPENCL
+void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
+{
+	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	int i;
+	for (i = 0; i < n; i++)
+	{
+		aop[i].x = x[i];
+		aop[i].y = x[i+n];
+	}
+}
+
+extern void cpu_to_opencl_opencl_func(void *buffers[], void *arg);
+
+struct starpu_codelet cpu_to_opencl_cl =
+{
+	.where = STARPU_OPENCL,
+	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.name = "codelet_cpu_to_opencl"
+};
+
+struct starpu_codelet opencl_to_cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = { opencl_to_cpu_cpu_func, NULL },
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.name = "codelet_opencl_to_cpu"
+};
+#endif /* !STARPU_USE_OPENCL */

+ 252 - 7
examples/filters/custom_mf/custom_interface.c

@@ -15,6 +15,9 @@
  */
 #include <starpu.h>
 #include <starpu_hash.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
 #include "custom_interface.h"
 #include "custom_types.h"
 
@@ -36,7 +39,22 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node,
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
 				   void *dst_interface, unsigned dst_node,
 				   cudaStream_t stream);
-#endif
+#endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event);
+#endif /* !STARPU_USE_OPENCL */
 
 static const struct starpu_data_copy_methods custom_copy_data_methods_s =
 {
@@ -51,11 +69,11 @@ static const struct starpu_data_copy_methods custom_copy_data_methods_s =
 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
-	.ram_to_opencl       = NULL,
-	.opencl_to_ram       = NULL,
-	.opencl_to_opencl    = NULL,
-        .ram_to_opencl_async = NULL,
-	.opencl_to_ram_async = NULL,
+	.ram_to_opencl       = copy_ram_to_opencl,
+	.opencl_to_ram       = copy_opencl_to_ram,
+	.opencl_to_opencl    = copy_opencl_to_opencl,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
 	.cuda_to_spu = NULL,
 	.spu_to_ram  = NULL,
@@ -126,6 +144,9 @@ register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *da
 #ifdef STARPU_USE_CUDA
 			local_interface->cuda_ptr   = custom_interface->cuda_ptr;
 #endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = custom_interface->opencl_ptr;
+#endif
 		}
 		else
 		{
@@ -133,6 +154,9 @@ register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *da
 #ifdef STARPU_USE_CUDA
 			local_interface->cuda_ptr   = NULL;
 #endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = NULL;
+#endif
 		}
 		local_interface->nx = custom_interface->nx;
 		local_interface->ops = custom_interface->ops;
@@ -160,7 +184,19 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t nod
 			custom_interface->cpu_ptr = NULL;
 			return -ENOMEM;
 		}
-#endif
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+		custom_interface->opencl_ptr = malloc(size);
+		if (custom_interface->cuda_ptr == NULL)
+		{
+			free(custom_interface->cpu_ptr);
+#ifdef STARPU_USE_CUDA
+			free(custom_interface->cuda_ptr);
+#endif /* !STARPU_USE_CUDA */
+			return -ENOMEM;
+		}
+#endif /* !STARPU_USE_OPENCL */
+			
 		break;
 #if STARPU_USE_CUDA
 	case STARPU_CUDA_RAM:
@@ -180,6 +216,31 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t nod
 		break;
 	}
 #endif
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_RAM:
+	{
+		/* XXX : StarPU shoulf probably provide starpu_opencl_allocate_memory(). */
+		cl_context context;
+		cl_command_queue queue;
+		int id = starpu_worker_get_id();
+		int devid = starpu_worker_get_devid(id);
+		starpu_opencl_get_queue(devid, &queue);
+		starpu_opencl_get_context(devid, &context);
+
+		cl_int err;
+		cl_mem memory;
+
+		/* */
+		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+		memory = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+        	if (err != CL_SUCCESS)
+			return -ENOMEM; // There might be other errors.
+
+		custom_interface->opencl_ptr = memory;
+
+		break;
+	}
+#endif /* !STARPU_USE_OPENCL */
 	default:
 		assert(0);
 	}
@@ -208,6 +269,13 @@ static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
 			custom_interface->cuda_ptr = NULL;
 		}
 #endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+		if (custom_interface->opencl_ptr != NULL)
+		{
+			free(custom_interface->opencl_ptr);
+			custom_interface->opencl_ptr = NULL;
+		}
+#endif /* !STARPU_USE_OPENCL */
 		break;
 #ifdef STARPU_USE_CUDA
 	case STARPU_CUDA_RAM:
@@ -247,6 +315,10 @@ custom_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
 		case STARPU_CUDA_RAM:
 			return data_interface->cuda_ptr;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			return data_interface->opencl_ptr;
+#endif
 		default:
 			assert(0);
 	}
@@ -303,6 +375,9 @@ void custom_data_register(starpu_data_handle_t *handle,
 #ifdef STARPU_USE_CUDA
 		.cuda_ptr = NULL,
 #endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_ptr = NULL,
+#endif
 		.nx  = nx,
 		.ops = format_ops
 	};
@@ -412,3 +487,173 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
 	assert(0);
 }
 #endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+	return 0;
+}
+
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+	return 0;
+}
+
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+	return 0;
+}
+
+/* StarPU will give us these in a near future */
+static cl_int
+_opencl_malloc(cl_context context, cl_mem *mem, size_t size, cl_mem_flags flags)
+{
+	cl_int err;
+        cl_mem memory;
+
+	memory = clCreateBuffer(context, flags, size, NULL, &err);
+	if (err != CL_SUCCESS)
+		return err;
+
+        *mem = memory;
+        return CL_SUCCESS;
+}
+
+static cl_int
+_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node,
+				      cl_mem buffer, unsigned dst_node,
+				      size_t size, size_t offset,
+				      cl_event *event, int *ret,
+				      cl_command_queue queue)
+{
+        cl_int err;
+        cl_bool blocking;
+
+        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+
+        err = clEnqueueWriteBuffer(queue, buffer, blocking, offset, size, ptr, 0, NULL, event);
+
+        if (err == CL_SUCCESS)
+                *ret = (event == NULL) ? 0 : -EAGAIN;
+
+	return err;
+}
+
+static cl_int
+_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node,
+			   void *ptr, unsigned dst_node,
+			   size_t size, size_t offset, cl_event *event,
+			   cl_command_queue queue)
+
+{
+        cl_int err;
+        cl_bool blocking;
+
+        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        err = clEnqueueReadBuffer(queue, buffer, blocking, offset, size, ptr, 0, NULL, event);
+
+        return err;
+}
+
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event)
+{
+	ssize_t size;
+	struct custom_data_interface *src_custom, *dst_custom;
+
+	src_custom = (struct custom_data_interface *) src_interface;
+	dst_custom = (struct custom_data_interface *) dst_interface;
+
+	/*
+	 * Opencl stuff.
+	 */
+	cl_context context;
+	cl_command_queue queue;
+	int id = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_queue(devid, &queue);
+	starpu_opencl_get_context(devid, &context);
+
+	/* Real stuff */
+	int err;
+	cl_int ret;
+
+	size = src_custom->nx * 2 * sizeof(float);
+	if (dst_custom->cpu_ptr == NULL)
+	{
+		ret = _opencl_malloc(context, (cl_mem*)&dst_custom->cpu_ptr, 
+				size, CL_MEM_READ_WRITE);
+		assert(ret == CL_SUCCESS);
+	}
+	err = _opencl_copy_ram_to_opencl_async_sync(src_custom->cpu_ptr,
+						src_node,
+						dst_custom->cpu_ptr,
+						dst_node,
+						size,
+						0,
+						NULL,
+						&ret,
+						queue);
+	assert(err == 0);
+	return 0;
+}
+
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event)
+{
+	ssize_t size;
+	struct custom_data_interface *src_custom, *dst_custom;
+
+	src_custom = (struct custom_data_interface *) src_interface;
+	dst_custom = (struct custom_data_interface *) dst_interface;
+
+	/*
+	 * Opencl stuff.
+	 */
+	cl_context context;
+	cl_command_queue queue;
+	int id = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_queue(devid, &queue);
+	starpu_opencl_get_context(devid, &context);
+
+	/* real stuff */
+	int err;
+	cl_int ret;
+	size = src_custom->nx * 2 * sizeof(float);
+	if (!dst_custom->opencl_ptr)
+	{
+		dst_custom->opencl_ptr = malloc(size);
+		assert(dst_custom->opencl_ptr != NULL);
+	}
+
+	err = _opencl_copy_opencl_to_ram(
+			src_custom->opencl_ptr,
+			src_node,
+			dst_custom->opencl_ptr,
+			dst_node,
+			size,
+			0,
+			NULL,
+			queue);
+	assert(err == 0);
+	return 0;
+}
+#endif /* !STARPU_USE_OPENCL */

+ 5 - 0
examples/filters/custom_mf/custom_interface.h

@@ -20,6 +20,7 @@ struct custom_data_interface
 {
 	void *cpu_ptr;
 	void *cuda_ptr;
+	void *opencl_ptr;
 	struct starpu_multiformat_data_interface_ops *ops;
 	uint32_t nx;
 };
@@ -40,4 +41,8 @@ void custom_data_register(starpu_data_handle_t *handle,
 	CUSTOM_GET_NX((interface))
 #endif /* !STARPU_USE_CUDA */
 
+#ifdef STARPU_USE_OPENCL
+#define CUSTOM_GET_OPENCL_X_PTR(interface) (((struct custom_data_interface *)(interface))->opencl_ptr)
+#endif
+
 #endif /* ! __CUSTOM_INTERFACE_H__ */

+ 103 - 13
examples/filters/custom_mf/custom_mf_filter.c

@@ -16,20 +16,36 @@
 #include <starpu.h>
 #include "custom_interface.h"
 #include "custom_types.h"
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif /* !STARPU_USE_OPENCL */
 
-#define N 20
+#define N 12
+
+#define DEBUG 1
+
+#ifdef STARPU_USE_CUDA
+static unsigned int ncuda;
+#endif
+#ifdef STARPU_USE_OPENCL
+static unsigned int nopencl;
+#endif
 
-#define DEBUG 0
 
 static struct point array_of_structs[N];
 static starpu_data_handle_t handle;
-static unsigned int nchunks = 4;
+static unsigned int nchunks = 6;
 
 #ifdef STARPU_USE_CUDA
 extern struct starpu_codelet cpu_to_cuda_cl;
 extern struct starpu_codelet cuda_to_cpu_cl;
 #endif
 
+#ifdef STARPU_USE_OPENCL
+extern struct starpu_codelet cpu_to_opencl_cl;
+extern struct starpu_codelet opencl_to_cpu_cl;
+#endif
+
 static struct starpu_multiformat_data_interface_ops format_ops =
 {
 #ifdef STARPU_USE_CUDA
@@ -37,6 +53,11 @@ static struct starpu_multiformat_data_interface_ops format_ops =
 	.cpu_to_cuda_cl = &cpu_to_cuda_cl,
 	.cuda_to_cpu_cl = &cuda_to_cpu_cl,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_elemsize  = sizeof(struct struct_of_arrays),
+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
+#endif
 	.cpu_elemsize = sizeof(struct point),
 };
 
@@ -68,6 +89,16 @@ custom_filter(void *father, void *child, struct starpu_data_filter *f,
 		soa_child->y = soa_father->y + chunk_size;
 	}
 #endif
+#ifdef STARPU_USE_OPENCL
+	else if (custom_father->opencl_ptr)
+	{
+		struct struct_of_arrays *soa_father, *soa_child;
+		soa_father = (struct struct_of_arrays*) custom_father->opencl_ptr;
+		soa_child = (struct struct_of_arrays*) custom_child->opencl_ptr;
+		soa_child->x = soa_father->x + chunk_size;
+		soa_child->y = soa_father->y + chunk_size;
+	}
+#endif /* !STARPU_USE_OPENCL */
 
 	custom_child->ops = custom_father->ops;
 	custom_child->nx = chunk_size;
@@ -138,6 +169,19 @@ static struct starpu_codelet cuda_cl =
 };
 #endif /* !STARPU_USE_CUDA */
 
+#ifdef STARPU_USE_OPENCL
+extern void custom_scal_opencl_func(void *buffers[], void *args);
+
+static struct starpu_codelet opencl_cl =
+{
+	.where = STARPU_OPENCL,
+	.opencl_funcs = { custom_scal_opencl_func, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "opencl_codelet"
+};
+#endif /* !STARPU_USE_OPENCL */
+
 static int
 create_and_submit_tasks(void)
 {
@@ -146,17 +190,30 @@ create_and_submit_tasks(void)
 	for (i = 0; i < nchunks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		if (i %2 == 0)
+		switch (i%3)
 		{
+		case 0:
 			task->cl = &cpu_cl;
-		}
-		else
-		{
+			break;
+		case 1:
 #ifdef STARPU_USE_CUDA
-			task->cl = &cuda_cl;
-#else
-			task->cl = &cpu_cl;
-#endif /* !STARPU_USE_CUDA */
+			if (ncuda > 0)
+				task->cl = &cuda_cl;
+			else
+#endif
+				task->cl = &cpu_cl;
+			break;
+		case 2:
+#ifdef STARPU_USE_OPENCL
+			if (nopencl > 0)
+				task->cl = &opencl_cl;
+			else
+#endif
+				task->cl = &cpu_cl;
+			break;
+		default:
+			/* We should never get here */
+			assert(0);
 		}
 
 		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
@@ -194,8 +251,7 @@ check_it(void)
 	int i;
 	for (i = 0; i < N; i++)
 	{
-		float expected_value = i + 1.0;
-		expected_value *= array_of_structs[i].y;
+		float expected_value = (i + 1.0)*42.0;
 		if (array_of_structs[i].x != expected_value)
 			return EXIT_FAILURE;
 	}
@@ -203,6 +259,11 @@ check_it(void)
 	return EXIT_SUCCESS;
 }
 
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+struct starpu_opencl_program opencl_conversion_program;
+#endif /* !STARPU_USE_OPENCL */
+
 int
 main(void)
 {
@@ -215,6 +276,24 @@ main(void)
 	if (err == -ENODEV)
 		goto enodev;
 
+#ifdef STARPU_USE_CUDA
+	ncuda = starpu_cuda_worker_get_count();
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+	nopencl = starpu_opencl_worker_get_count();
+	if (nopencl > 0)
+	{
+		char *f1 = "examples/filters/custom_mf/custom_opencl.cl";
+		char *f2 = "examples/filters/custom_mf/conversion_opencl.cl";
+		err = starpu_opencl_load_opencl_from_file(f1, &opencl_program,
+							  NULL);
+		assert(err == 0);
+		err = starpu_opencl_load_opencl_from_file(f2,
+						&opencl_conversion_program,
+						NULL);
+		assert(err == 0);
+	}
+#endif /* !STARPU_USE_OPENCL */
 
 	register_and_partition_data();
 #if DEBUG
@@ -231,7 +310,18 @@ main(void)
 #if DEBUG
 	print_it();
 #endif
+
+#if STARPU_USE_OPENCL
+	if (nopencl > 0)
+	{
+        	err = starpu_opencl_unload_opencl(&opencl_program);
+		assert(err == 0);
+		err = starpu_opencl_unload_opencl(&opencl_conversion_program);
+		assert(err == 0);
+	}
+#endif /* !STARPU_USE_OPENCL */
 	starpu_shutdown();		
+	print_it();
 	return check_it();
 
 

+ 102 - 0
examples/filters/custom_mf/custom_opencl.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "custom_types.h"
+#include "custom_interface.h"
+
+extern struct starpu_opencl_program opencl_program;
+
+void custom_scal_opencl_func(void *buffers[], void *args)
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = CUSTOM_GET_NX(buffers[0]);
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"custom_scal_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
+	int douze;
+	if (starpu_opencl_set_kernel_args(&err, &kernel,
+					  sizeof(aop), &aop,
+					  sizeof(x), &x,
+					  sizeof(n), &n,
+					  0) != 3)
+	{
+		STARPU_OPENCL_REPORT_ERROR(err);
+		assert(0);
+	}
+	
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(
+				queue,
+				kernel,
+				1,       /* work_dim */
+				NULL,    /* global_work_offset */
+				&global, /* global_work_size */
+				&local,  /* local_work_size */
+				0,       /* num_events_in_wait_list */
+				NULL,    /* event_wait_list */
+				&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 26 - 0
examples/filters/custom_mf/custom_opencl.cl

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "custom_types.h"
+
+__kernel void custom_scal_opencl(__global struct point *aop,
+				 __global float *x,
+				 int nx)
+{
+        const int i = get_global_id(0);
+	if (i < nx)
+		x[i] *= x[i+nx];
+}