Browse Source

Multiformat : filter example.

Cyril Roelandt 13 years ago
parent
commit
ac859c1e11

+ 22 - 0
examples/Makefile.am

@@ -3,6 +3,7 @@
 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
+# Copyright (C) 2012 INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -156,6 +157,7 @@ examplebin_PROGRAMS +=				\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
+	filters/multiformat/multiformat_filter  \
 	tag_example/tag_example			\
 	tag_example/tag_example3		\
 	tag_example/tag_example2		\
@@ -387,6 +389,26 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	filters/fblock_opencl_kernel.cl
 endif
 
+##############################
+# Multiformat filter example #
+##############################
+filters_multiformat_multiformat_filter_SOURCES=                \
+	filters/multiformat/multiformat_filter.c               \
+	filters/multiformat/multiformat_ops.c                  \
+	filters/multiformat/conversion_codelets.c
+
+if STARPU_USE_CUDA
+filters_multiformat_multiformat_filter_SOURCES+=               \
+	filters/multiformat/cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+filters_multiformat_multiformat_filter_SOURCES+=\
+	filters/multiformat/opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	filters/multiformat/opencl.cl
+endif
+
 ################
 # AXPY example #
 ################

+ 93 - 0
examples/filters/multiformat/conversion_codelets.c

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "multiformat_types.h"
+
+#ifdef STARPU_USE_CUDA
+/*
+ * Conversion codelet body (runs on a CPU worker): reads the struct_of_arrays
+ * returned by STARPU_MULTIFORMAT_GET_CUDA_PTR and writes each (x, y) pair into
+ * the array of struct point returned by STARPU_MULTIFORMAT_GET_PTR.
+ *
+ * buffers[0] : the multiformat buffer.
+ * arg        : unused.
+ */
+void cuda_to_cpu(void *buffers[], void *arg)
+{
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	struct struct_of_arrays *soa = STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	struct point *aos = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	int count = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int k = 0;
+
+	while (k < count)
+	{
+		aos[k].x = soa->x[k];
+		aos[k].y = soa->y[k];
+		k++;
+	}
+}
+
+/*
+ * CUDA function of the CPU -> CUDA conversion codelet. The body is empty: no
+ * work is performed here.
+ * NOTE(review): presumably the layout change is done by the copy methods of
+ * this example -- confirm.
+ */
+void cpu_to_cuda_cuda_func(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+}
+/* CPU -> CUDA conversion codelet: one RW buffer, CUDA workers only. */
+struct starpu_codelet cpu_to_cuda_cl =
+{
+	.name = "codelet_cpu_to_cuda",
+	.where = STARPU_CUDA,
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.cuda_funcs = { cpu_to_cuda_cuda_func, NULL }
+};
+
+/* CUDA -> CPU conversion codelet: one RW buffer, CPU workers only. */
+struct starpu_codelet cuda_to_cpu_cl =
+{
+	.name = "codelet_cuda_to_cpu",
+	.where = STARPU_CPU,
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.cpu_funcs = { cuda_to_cpu, NULL }
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+/*
+ * Conversion codelet body (runs on a CPU worker): reads the struct_of_arrays
+ * returned by STARPU_MULTIFORMAT_GET_OPENCL_PTR and writes each (x, y) pair
+ * into the array of struct point returned by STARPU_MULTIFORMAT_GET_PTR.
+ *
+ * buffers[0] : the multiformat buffer.
+ * arg        : unused.
+ */
+void opencl_to_cpu(void *buffers[], void *arg)
+{
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	struct struct_of_arrays *soa = STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+	struct point *aos = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	int count = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int k = 0;
+
+	while (k < count)
+	{
+		aos[k].x = soa->x[k];
+		aos[k].y = soa->y[k];
+		k++;
+	}
+}
+
+/*
+ * OpenCL function of the CPU -> OpenCL conversion codelet. The body is empty:
+ * no work is performed here.
+ * NOTE(review): presumably the layout change is done by the copy methods of
+ * this example -- confirm.
+ */
+void cpu_to_opencl_opencl_func(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+}
+/* CPU -> OpenCL conversion codelet: one RW buffer, OpenCL workers only. */
+struct starpu_codelet cpu_to_opencl_cl =
+{
+	.name = "codelet_cpu_to_opencl",
+	.where = STARPU_OPENCL,
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL }
+};
+
+/* OpenCL -> CPU conversion codelet: one RW buffer, CPU workers only. */
+struct starpu_codelet opencl_to_cpu_cl =
+{
+	.name = "codelet_opencl_to_cpu",
+	.where = STARPU_CPU,
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.cpu_funcs = { opencl_to_cpu, NULL }
+};
+#endif

+ 43 - 0
examples/filters/multiformat/cuda.cu

@@ -0,0 +1,43 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "multiformat_types.h"
+
+/*
+ * Kernel: thread `idx` multiplies soa->x[idx] by soa->y[idx] in place.
+ * Expects a 1D launch; threads whose index is >= n do nothing.
+ */
+static __global__ void multiformat_cuda(struct struct_of_arrays *soa, unsigned n)
+{
+	const unsigned idx = threadIdx.x + blockDim.x * blockIdx.x;
+
+	if (idx >= n)
+		return;
+
+	soa->x[idx] *= soa->y[idx];
+}
+
+/*
+ * CUDA implementation of the scaling task: launches multiformat_cuda on the
+ * struct_of_arrays returned by STARPU_MULTIFORMAT_GET_CUDA_PTR and waits for
+ * it to finish on the worker's local stream.
+ *
+ * buffers[0] : the multiformat buffer.
+ * _args      : unused.
+ */
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
+{
+	(void) _args;
+
+	FPRINTF(stderr, "Running the cuda kernel (%s)\n", __func__);
+	unsigned int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	struct struct_of_arrays *soa;
+	cudaStream_t stream = starpu_cuda_get_local_stream();
+	cudaError_t status;
+
+	/* A grid of zero blocks is an invalid launch configuration. */
+	if (n == 0)
+		return;
+
+	soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+
+	/* The kernel declares no dynamic shared memory, so none is requested. */
+	multiformat_cuda<<<nblocks,threads_per_block,0,stream>>>(soa, n);
+
+	/* Launch-configuration errors are only reported by cudaGetLastError(). */
+	status = cudaGetLastError();
+	if (status != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(status);
+
+	/* Errors raised while the kernel runs surface at the next sync. */
+	status = cudaStreamSynchronize(stream);
+	if (status != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(status);
+}

+ 371 - 0
examples/filters/multiformat/multiformat_filter.c

@@ -0,0 +1,371 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This is a really simple example intended to show how to use filters with the
+ * multiformat interface. It does not do anything really useful. Since the
+ * memory is not contiguous (cf. struct struct_of_arrays), the user must write
+ * their own copy functions. Some of them have not been implemented here
+ * (synchronous functions, for example).
+ */
+#include <starpu.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+#include "multiformat_types.h"
+
+static int ncpu = 0;
+static int ncuda = 0;
+static int nopencl = 0;
+static unsigned int nchunks = 1;
+
+static struct point array_of_structs[N_ELEMENTS];
+static starpu_data_handle_t array_of_structs_handle;
+
+#ifdef STARPU_USE_CPU
+/*
+ * CPU implementation of the scaling task: multiplies the x field of every
+ * point by its y field, in place.
+ *
+ * buffers[0] : the multiformat buffer (array of struct point on the CPU).
+ * args       : unused.
+ */
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args)
+{
+	struct point *aos;
+	unsigned int n, i;
+
+	(void) args;
+
+	aos = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+	for (i = 0; i < n; i++)
+		aos[i].x *= aos[i].y;
+}
+#endif /* STARPU_USE_CPU */
+
+#ifdef STARPU_USE_CUDA
+extern struct starpu_codelet cpu_to_cuda_cl;
+extern struct starpu_codelet cuda_to_cpu_cl;
+#endif
+
+#ifdef STARPU_USE_OPENCL
+extern struct starpu_codelet cpu_to_opencl_cl;
+extern struct starpu_codelet opencl_to_cpu_cl;
+#endif
+
+extern struct starpu_data_copy_methods my_multiformat_copy_data_methods_s;
+/*
+ * Per-device element sizes, conversion codelets and custom copy methods
+ * passed to starpu_multiformat_data_register().
+ */
+static struct starpu_multiformat_data_interface_ops format_ops =
+{
+	.cpu_elemsize = sizeof(struct point),
+#ifdef STARPU_USE_CUDA
+	.cuda_elemsize = sizeof(struct struct_of_arrays),
+	.cuda_to_cpu_cl = &cuda_to_cpu_cl,
+	.cpu_to_cuda_cl = &cpu_to_cuda_cl,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_elemsize = sizeof(struct struct_of_arrays),
+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
+#endif
+	.copy = &my_multiformat_copy_data_methods_s
+};
+
+#ifdef STARPU_USE_CUDA
+extern void multiformat_scal_cuda_func(void *buffers[], void *arg);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void multiformat_scal_opencl_func(void *buffers[], void *arg);
+#endif
+
+#ifdef STARPU_USE_CPU
+/* Scaling codelet, CPU flavour: one RW buffer. */
+static struct starpu_codelet cpu_cl =
+{
+	.name = "codelet_real",
+	.where = STARPU_CPU,
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.cpu_funcs = { multiformat_scal_cpu_func, NULL }
+};
+#endif /* STARPU_USE_CPU */
+
+#ifdef STARPU_USE_CUDA
+/* Scaling codelet, CUDA flavour: one RW buffer. */
+static struct starpu_codelet cuda_cl =
+{
+	.name = "cuda_codelet",
+	.where = STARPU_CUDA,
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.cuda_funcs = { multiformat_scal_cuda_func, NULL }
+};
+#endif /* STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+/* Scaling codelet, OpenCL flavour: one RW buffer. */
+static struct starpu_codelet opencl_cl =
+{
+	.name = "opencl_codelet",
+	.where = STARPU_OPENCL,
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.opencl_funcs = { multiformat_scal_opencl_func, NULL }
+};
+#endif /* STARPU_USE_OPENCL */
+
+/*
+ * Main functions 
+ */
+/* Fills array_of_structs: x = 1, 2, 3, ... and y = 42 for every element. */
+static void
+init_problem_data(void)
+{
+	struct point *p = array_of_structs;
+	int idx;
+
+	for (idx = 0; idx < N_ELEMENTS; idx++, p++)
+	{
+		p->y = 42.0;
+		p->x = 1.0 + idx;
+	}
+}
+
+/*
+ * Registers array_of_structs (N_ELEMENTS points, home node 0) as a
+ * multiformat handle described by format_ops.
+ */
+static void
+register_data(void)
+{
+	void *data = &array_of_structs;
+
+	starpu_multiformat_data_register(&array_of_structs_handle, 0, data,
+					 N_ELEMENTS, &format_ops);
+}
+
+/* Counterpart of register_data(): unregisters the handle of the array. */
+static void
+unregister_data(void)
+{
+	starpu_data_handle_t registered = array_of_structs_handle;
+
+	starpu_data_unregister(registered);
+}
+
+/*
+ * Filter function: makes `child` describe the id-th chunk of `father`.
+ *
+ * father, child : struct starpu_multiformat_interface of the parent and of
+ *                 the child being initialised.
+ * f             : the filter; f->filter_arg is the number of elements per chunk.
+ * id            : index of the child (must be < nchunks).
+ * nchunks       : total number of children; only 2 or 3 are accepted.
+ *
+ * Only the element count, the ops pointer and the CPU pointer are set on the
+ * child.
+ */
+static void
+multiformat_divide_in_equal_chunks_filter_func(void *father,
+					       void *child,
+					       struct starpu_data_filter *f,
+					       unsigned id,
+					       unsigned nchunks)
+{
+	/*
+	 * One chunk for a CPU device.
+	 * At least one for a GPU (CUDA or OpenCL).
+	 * If possible, a third chunk for another kind of GPU.
+	 */
+	assert(nchunks == 2 || nchunks == 3);
+	assert (id < nchunks);
+
+	struct starpu_multiformat_interface *mf_father, *mf_child;
+
+	mf_father = (struct starpu_multiformat_interface *) father;
+	mf_child = (struct starpu_multiformat_interface *) child;
+
+	uint32_t length_first = f->filter_arg;
+	uint32_t nx = mf_father->nx;
+
+	assert(length_first < nx);
+
+	/*
+	 * The child shares the ops structure of its father. No copy is needed:
+	 * both pointers designate the very same object.
+	 */
+	mf_child->ops = mf_father->ops;
+
+	/* The actual partitioning */
+	mf_child->nx = length_first;
+
+#ifdef STARPU_USE_CPU
+	if (mf_father->cpu_ptr)
+	{
+		struct point *tmp = (struct point *) mf_father->cpu_ptr;
+		tmp += id * length_first;
+		mf_child->cpu_ptr = tmp;
+	}
+#endif
+}
+
+/*
+ * Splits the handle into `nchunks` children of N_ELEMENTS/nchunks elements
+ * each, using multiformat_divide_in_equal_chunks_filter_func.
+ */
+static void
+partition_data(void)
+{
+	struct starpu_data_filter equal_chunks =
+	{
+		.nchildren = nchunks,
+		.filter_arg = N_ELEMENTS/nchunks,
+		.filter_func = multiformat_divide_in_equal_chunks_filter_func,
+		.get_child_ops = NULL,
+		.get_nchildren = NULL
+	};
+
+	starpu_data_partition(array_of_structs_handle, &equal_chunks);
+}
+
+/*
+ * Submits one task per chunk and waits for all of them.
+ *
+ * Chunk 0 goes to the CPU codelet, chunk 1 to CUDA when a CUDA worker exists
+ * (OpenCL otherwise), chunk 2 to OpenCL.
+ *
+ * Returns 0 on success, otherwise the first non-zero value returned by
+ * starpu_task_submit() or starpu_task_wait_for_all(). Tasks that were already
+ * submitted are always waited for before returning.
+ */
+static int
+create_and_submit_tasks(void)
+{
+	int err = 0;
+	int wait_err;
+	unsigned int i;
+
+	for (i = 0; i < nchunks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		if (i == 0)
+		{
+			task->cl = &cpu_cl;
+		}
+		else if (i == 1)
+		{
+#ifdef STARPU_USE_CUDA
+			if (ncuda > 0)
+				task->cl = &cuda_cl;
+#endif /* STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+			if (ncuda == 0 && nopencl > 0)
+				task->cl = &opencl_cl;
+#endif /* STARPU_USE_OPENCL */
+		}
+#ifdef STARPU_USE_OPENCL
+		else /* i == 2 */
+		{
+			task->cl = &opencl_cl;
+		}
+#endif /* STARPU_USE_OPENCL */
+
+		starpu_data_handle_t handle;
+		handle = starpu_data_get_sub_data(array_of_structs_handle, 1, i);
+		task->handles[0] = handle;
+
+		err = starpu_task_submit(task);
+		if (err != 0)
+			break;
+	}
+
+	/*
+	 * Wait even after a failed submission, so that no earlier task is still
+	 * running when the caller goes on to unpartition the data.
+	 */
+	wait_err = starpu_task_wait_for_all();
+
+	return (err != 0) ? err : wait_err;
+}
+
+
+/* Prints every (x y) pair of array_of_structs on stderr, on a single line. */
+static void
+print_it(void)
+{
+	const struct point *cur = array_of_structs;
+	const struct point *end = array_of_structs + N_ELEMENTS;
+
+	for (; cur != end; cur++)
+	{
+		FPRINTF(stderr, "(%.2f %.2f) ", cur->x, cur->y);
+	}
+
+	FPRINTF(stderr, "\n");
+}
+
+/*
+ * Verifies that x[i] == (i + 1) * y[i] for every element.
+ * Returns EXIT_SUCCESS when all elements match, EXIT_FAILURE otherwise.
+ */
+static int
+check_it(void)
+{
+	int status = EXIT_SUCCESS;
+	int idx;
+
+	for (idx = 0; idx < N_ELEMENTS && status == EXIT_SUCCESS; idx++)
+	{
+		const struct point *p = &array_of_structs[idx];
+		float expected_value = idx + 1.0;
+
+		expected_value *= p->y;
+		if (p->x != expected_value)
+			status = EXIT_FAILURE;
+	}
+
+	return status;
+}
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+struct starpu_opencl_program opencl_conversion_program;
+#endif
+
+/* Returns 1 when at least one CUDA or OpenCL worker was counted, 0 otherwise. */
+static int
+gpus_available()
+{
+	int found = 0;
+
+#ifdef STARPU_USE_CUDA
+	found = found || (ncuda > 0);
+#endif
+#ifdef STARPU_USE_OPENCL
+	found = found || (nopencl > 0);
+#endif
+
+	return found;
+}
+
+/*
+ * Entry point of the example.
+ *
+ * Returns 77 when StarPU cannot be initialised or when there is no CPU
+ * worker or no GPU worker; otherwise returns the result of check_it().
+ * Without STARPU_USE_CPU the example does nothing and returns EXIT_SUCCESS.
+ */
+int
+main(void)
+{
+#ifdef STARPU_USE_CPU
+	int err;
+	struct starpu_conf conf =
+	{
+		.ncpus = -1,
+		.ncuda = 1,
+		.nopencl = 1
+	};
+
+	err = starpu_init(&conf);
+	if (err != 0)
+	{
+		FPRINTF(stderr, "starpu_init : %s\n", strerror(-err));
+		return 77;
+	}
+
+	ncpu = starpu_cpu_worker_get_count();
+#ifdef STARPU_USE_CUDA
+	ncuda = starpu_cuda_worker_get_count();
+#endif
+#ifdef STARPU_USE_OPENCL
+	nopencl = starpu_opencl_worker_get_count();
+#endif
+
+	if (ncpu == 0 || !gpus_available())
+	{
+		/* StarPU was initialised: shut it down before skipping. */
+		starpu_shutdown();
+		return 77;
+	}
+
+	if (ncuda > 0)
+		nchunks++;
+	if (nopencl > 0)
+		nchunks++;
+
+	/* For the sake of simplicity. */
+	assert(N_ELEMENTS % nchunks == 0);
+
+#ifdef STARPU_USE_OPENCL
+	err = starpu_opencl_load_opencl_from_file("examples/filters/multiformat/opencl.cl",
+					    &opencl_program, NULL);
+	assert(err == 0);
+#endif
+	init_problem_data();
+
+	print_it();
+
+	register_data();
+	partition_data();
+
+	err = create_and_submit_tasks();
+	if (err != 0)
+	{
+		FPRINTF(stderr, "create_submit_task : %s\n",
+			strerror(-err));
+	}
+
+	starpu_data_unpartition(array_of_structs_handle, 0);
+
+	unregister_data();
+
+	print_it();
+
+#ifdef STARPU_USE_OPENCL
+	/*
+	 * The call must not live inside assert(): it would be compiled out
+	 * together with the check when NDEBUG is defined.
+	 */
+	err = starpu_opencl_unload_opencl(&opencl_program);
+	assert(err == CL_SUCCESS);
+#endif
+	starpu_shutdown();
+
+
+	return check_it();
+#else
+	/* Without the CPU, there is no point in using the multiformat
+	 * interface, so this test is pointless. */
+	return EXIT_SUCCESS;
+#endif
+}

+ 432 - 0
examples/filters/multiformat/multiformat_ops.c

@@ -0,0 +1,432 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+
+#include "multiformat_types.h"
+
+#if STARPU_USE_CUDA
+/*
+ * Common implementation of the RAM <-> CUDA copies of this example.
+ *
+ * src_interface, dst_interface : struct starpu_multiformat_interface of the
+ *                                source and of the destination.
+ * src_node, dst_node           : unused.
+ * stream                       : stream on which the copies are queued.
+ * kind                         : cudaMemcpyHostToDevice or
+ *                                cudaMemcpyDeviceToHost; anything else aborts.
+ *
+ * Host to device: copies the array of points, then builds two device arrays
+ * (x and y) and stores their addresses in the destination's cuda_ptr.
+ * Device to host: reads the two device addresses back, copies both arrays
+ * into host memory referenced by the destination's cuda_ptr, then frees the
+ * device arrays.
+ *
+ * Always returns 0; failures are caught by assert().
+ */
+static int copy_cuda_common_async(void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+	struct starpu_multiformat_interface *src_multiformat;
+	struct starpu_multiformat_interface *dst_multiformat;
+
+	src_multiformat = (struct starpu_multiformat_interface *) src_interface;
+	dst_multiformat = (struct starpu_multiformat_interface *) dst_interface;
+
+	size_t size;
+	cudaError_t status;
+
+	(void) src_node;
+	(void) dst_node;
+
+	switch (kind)
+	{
+	case cudaMemcpyHostToDevice:
+	{
+		/*
+		 * XXX : Should we do that ? It is a mix between copy and conversion...
+		 */
+		/*
+		 * Copying the data to the CUDA device.
+		 */
+		status = cudaMemcpyAsync(dst_multiformat->cpu_ptr,
+					 src_multiformat->cpu_ptr,
+					 src_multiformat->nx * src_multiformat->ops->cpu_elemsize,
+					 kind, stream);
+		assert(status == cudaSuccess);
+
+		/*
+		 * Copying the real data (that is pointed to).
+		 */
+		size = src_multiformat->nx * sizeof(float);
+		float *x = malloc(size);
+		float *y = malloc(size);
+		assert(x && y);
+
+		struct point *p = (struct point *) src_multiformat->cpu_ptr;
+		unsigned i;
+		for (i = 0; i < src_multiformat->nx; i++)
+		{
+			x[i] = p[i].x;
+			y[i] = p[i].y;
+		}
+
+		void *rets[2];
+
+		status = cudaMalloc(&rets[0], size);
+		assert(status == cudaSuccess);
+		status = cudaMemcpyAsync(rets[0], x, size, kind, stream);
+		assert(status == cudaSuccess);
+
+		status = cudaMalloc(&rets[1], size);
+		assert(status == cudaSuccess);
+		status = cudaMemcpyAsync(rets[1], y, size, kind, stream);
+		assert(status == cudaSuccess);
+
+		status = cudaMemcpyAsync(dst_multiformat->cuda_ptr, rets,
+				2*sizeof(void*), kind, stream);
+		assert(status == cudaSuccess);
+
+		/*
+		 * x, y and rets are temporaries: make sure the queued copies
+		 * no longer need them before they are released.
+		 */
+		status = cudaStreamSynchronize(stream);
+		assert(status == cudaSuccess);
+
+		free(x);
+		free(y);
+		break;
+	}
+	case cudaMemcpyDeviceToHost:
+	{
+		/*
+		 * Copying the cuda_ptr from the cuda device to the RAM.
+		 */
+		size = sizeof(struct struct_of_arrays);
+		if (!dst_multiformat->cuda_ptr)
+		{
+			dst_multiformat->cuda_ptr = calloc(1, size);
+			assert(dst_multiformat->cuda_ptr != NULL);
+		}
+
+
+		/* Getting the addresses of our data on the CUDA device. */
+		void *addrs[2];
+		status = cudaMemcpyAsync(addrs, src_multiformat->cuda_ptr,
+					 2 * sizeof(void*), kind, stream);
+		assert(status == cudaSuccess);
+
+		/* addrs is read right below: the copy must have completed. */
+		status = cudaStreamSynchronize(stream);
+		assert(status == cudaSuccess);
+
+		/*
+		 * Getting the real data.
+		 */
+		struct struct_of_arrays *soa;
+		soa = (struct struct_of_arrays *) dst_multiformat->cuda_ptr;
+		size = src_multiformat->nx * sizeof(float);
+
+		if (!soa->x)
+			soa->x = malloc(size);
+		assert(soa->x != NULL);
+		status = cudaMemcpyAsync(soa->x, addrs[0], size, kind, stream);
+		assert(status == cudaSuccess);
+
+
+		if (!soa->y)
+			soa->y = malloc(size);
+		assert(soa->y != NULL);
+		status = cudaMemcpyAsync(soa->y, addrs[1], size, kind, stream);
+		assert(status == cudaSuccess);
+
+		/* Let's free this. */
+		status = cudaFree(addrs[0]);
+		assert(status == cudaSuccess);
+		status = cudaFree(addrs[1]);
+		assert(status == cudaSuccess);
+		break;
+	}
+	default:
+		assert(0);
+	}
+
+	return 0;
+}
+
+/*
+ * Asynchronous RAM -> CUDA copy method: forwards to copy_cuda_common_async()
+ * with cudaMemcpyHostToDevice and returns its result.
+ */
+static int
+copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
+		       void *dst_interface, unsigned dst_node,
+		       cudaStream_t stream)
+{
+	/* FPRINTF, unlike fprintf, stays quiet when STARPU_SSILENT is set. */
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	return copy_cuda_common_async(src_interface, src_node,
+				      dst_interface, dst_node,
+				      stream, cudaMemcpyHostToDevice);
+}
+
+/*
+ * Asynchronous CUDA -> RAM copy method: forwards to copy_cuda_common_async()
+ * with cudaMemcpyDeviceToHost and returns its result.
+ */
+static int
+copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
+		       void *dst_interface, unsigned dst_node,
+		       cudaStream_t stream)
+{
+	/* FPRINTF, unlike fprintf, stays quiet when STARPU_SSILENT is set. */
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	return copy_cuda_common_async(src_interface, src_node,
+				      dst_interface, dst_node,
+				      stream, cudaMemcpyDeviceToHost);
+}
+
+/*
+ * Synchronous RAM -> CUDA copy method. Not implemented: only logs its name
+ * and returns 1.
+ */
+static int
+copy_ram_to_cuda(void *src_interface, unsigned src_node,
+		 void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+
+	/* TODO */
+	/* FPRINTF, unlike fprintf, stays quiet when STARPU_SSILENT is set. */
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	return 1;
+}
+
+/*
+ * Synchronous CUDA -> RAM copy method. Not implemented: only logs its name
+ * and returns 1.
+ */
+static int
+copy_cuda_to_ram(void *src_interface, unsigned src_node,
+		 void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+
+	/* TODO */
+	/* FPRINTF, unlike fprintf, stays quiet when STARPU_SSILENT is set. */
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	return 1;
+}
+#endif /* !STARPU_USE_CUDA */
+
+
+#ifdef STARPU_USE_OPENCL
+/*
+ * Creates an OpenCL buffer of `size` bytes with the given flags.
+ * On success stores it in *mem and returns CL_SUCCESS; on failure returns the
+ * error code of clCreateBuffer() and leaves *mem untouched.
+ */
+static cl_int
+_opencl_malloc(cl_context context, cl_mem *mem, size_t size, cl_mem_flags flags)
+{
+	cl_int status;
+	cl_mem created = clCreateBuffer(context, flags, size, NULL, &status);
+
+	if (status == CL_SUCCESS)
+	{
+		*mem = created;
+		return CL_SUCCESS;
+	}
+
+	return status;
+}
+
+/*
+ * Writes `size` bytes from `ptr` into `buffer` at `offset`.
+ * The write is blocking when `event` is NULL, non-blocking otherwise.
+ * On success *ret is set to 0 (blocking) or -EAGAIN (non-blocking); on failure
+ * *ret is left untouched. Returns the status of clEnqueueWriteBuffer().
+ */
+static cl_int
+_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node,
+				      cl_mem buffer, unsigned dst_node,
+				      size_t size, size_t offset,
+				      cl_event *event, int *ret,
+				      cl_command_queue queue)
+{
+	const cl_bool is_blocking = event ? CL_FALSE : CL_TRUE;
+	cl_int status;
+
+	status = clEnqueueWriteBuffer(queue, buffer, is_blocking, offset, size,
+				      ptr, 0, NULL, event);
+	if (status != CL_SUCCESS)
+		return status;
+
+	*ret = event ? -EAGAIN : 0;
+	return status;
+}
+
+/*
+ * Reads `size` bytes from `buffer` at `offset` into `ptr`.
+ * The read is blocking when `event` is NULL, non-blocking otherwise.
+ * Returns the status of clEnqueueReadBuffer().
+ */
+static cl_int
+_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node,
+			   void *ptr, unsigned dst_node,
+			   size_t size, size_t offset, cl_event *event,
+			   cl_command_queue queue)
+
+{
+	const cl_bool is_blocking = event ? CL_FALSE : CL_TRUE;
+
+	return clEnqueueReadBuffer(queue, buffer, is_blocking, offset, size,
+				   ptr, 0, NULL, event);
+}
+
+/* Synchronous RAM -> OpenCL copy method. Not implemented: returns 1. */
+static int
+copy_ram_to_opencl(void *src_interface, unsigned src_node,
+		   void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+
+	return 1;
+}
+
+/* Synchronous OpenCL -> RAM copy method. Not implemented: returns 1. */
+static int
+copy_opencl_to_ram(void *src_interface, unsigned src_node,
+		   void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+
+	return 1;
+}
+
+cl_mem xy[2];
+/*
+ * Asynchronous RAM -> OpenCL copy method.
+ *
+ * src_interface, dst_interface : struct starpu_multiformat_interface of the
+ *                                source and of the destination.
+ * src_node, dst_node           : forwarded to the copy helper.
+ * event                        : cl_event pointer used for the first write.
+ *
+ * Copies the array of points, then splits it into two temporary host arrays
+ * (x and y), uploads each of them into a freshly created OpenCL buffer stored
+ * in the global `xy`, and records both buffers in the destination's
+ * opencl_ptr. The two uploads are blocking. Always returns 0; failures are
+ * caught by assert().
+ */
+static int
+copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+			 void *dst_interface, unsigned dst_node,
+			 void *event)
+{
+	FPRINTF(stderr, "Enter %s\n", __func__);
+	struct starpu_multiformat_interface *src_mf;
+	struct starpu_multiformat_interface *dst_mf;
+
+	src_mf = (struct starpu_multiformat_interface *) src_interface;
+	dst_mf = (struct starpu_multiformat_interface *) dst_interface;
+
+	/*
+	 * Opencl stuff.
+	 */
+	cl_context context;
+	cl_command_queue queue;
+	int id = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_queue(devid, &queue);
+	starpu_opencl_get_context(devid, &context);
+
+	/*
+	 * Copying the cpu pointer to the OpenCL device.
+	 */
+	int err;
+	cl_int ret;
+	size_t cpu_size = src_mf->nx * src_mf->ops->cpu_elemsize;
+	err = _opencl_copy_ram_to_opencl_async_sync(src_mf->cpu_ptr,
+						    src_node,
+						    (cl_mem) dst_mf->cpu_ptr,
+						    dst_node,
+						    cpu_size,
+						    0,
+						    (cl_event *) event,
+						    &ret,
+						    queue);
+	assert(err == 0);
+
+	/*
+	 * Copying the real data.
+	 */
+	size_t array_size = src_mf->nx * sizeof(float);
+	float *x = malloc(array_size);
+	float *y = malloc(array_size);
+	assert(x && y);
+
+	struct point *p = (struct point *) src_mf->cpu_ptr;
+	unsigned i;
+	for (i = 0; i < src_mf->nx; i++)
+	{
+		x[i] = p[i].x;
+		y[i] = p[i].y;
+	}
+
+	ret = _opencl_malloc(context, xy, array_size, CL_MEM_READ_WRITE);
+	assert(ret == CL_SUCCESS);
+	ret = _opencl_malloc(context, xy+1, array_size, CL_MEM_READ_ONLY);
+	assert(ret == CL_SUCCESS);
+
+	/* A NULL event makes these two writes blocking. */
+	err = _opencl_copy_ram_to_opencl_async_sync(x,
+						    src_node,
+						    xy[0],
+						    dst_node,
+						    array_size,
+						    0,
+						    NULL,
+						    &ret,
+						    queue);
+	assert(err == CL_SUCCESS);
+	err = _opencl_copy_ram_to_opencl_async_sync(y,
+						    src_node,
+						    xy[1],
+						    dst_node,
+						    array_size,
+						    0,
+						    NULL,
+						    &ret,
+						    queue);
+	assert(err == CL_SUCCESS);
+
+
+	struct struct_of_arrays *soa;
+	soa = (struct struct_of_arrays *) dst_mf->opencl_ptr;
+	soa->x = (void *) xy[0];
+	soa->y = (void *) xy[1];
+
+	/* Not needed anymore */
+	free(x);
+	free(y);
+	return 0;
+
+}
+
+/*
+ * Asynchronous OpenCL -> RAM copy method.
+ *
+ * Reads the two OpenCL buffers stored in the global `xy` into two newly
+ * allocated host arrays (both reads are blocking, since no event is passed)
+ * and stores them in the struct_of_arrays referenced by the destination's
+ * opencl_ptr, allocating that structure first when the pointer is NULL.
+ * `event` is not used. Always returns 0; failures are caught by assert().
+ *
+ * NOTE(review): the previous soa->x / soa->y values are overwritten without
+ * being freed, and the buffers in `xy` are not released here -- confirm who
+ * owns them.
+ */
+static int
+copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+			 void *dst_interface, unsigned dst_node,
+			 void *event)
+{
+	FPRINTF(stderr, "Enter %s\n", __func__);
+	struct starpu_multiformat_interface *from =
+		(struct starpu_multiformat_interface *) src_interface;
+	struct starpu_multiformat_interface *to =
+		(struct starpu_multiformat_interface *) dst_interface;
+
+	/*
+	 * OpenCL stuff.
+	 */
+	cl_command_queue queue;
+	int worker = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(worker);
+	starpu_opencl_get_queue(devid, &queue);
+
+	if (to->opencl_ptr == NULL)
+	{
+		to->opencl_ptr = malloc(sizeof(struct struct_of_arrays));
+		assert(to->opencl_ptr);
+	}
+
+	float *host[2];
+	host[0] = malloc(from->nx * sizeof(float));
+	host[1] = malloc(from->nx * sizeof(float));
+	assert(host[0] && host[1]);
+
+	struct struct_of_arrays *soa = (struct struct_of_arrays *) to->opencl_ptr;
+
+	int k;
+	for (k = 0; k < 2; k++)
+	{
+		cl_int ret = _opencl_copy_opencl_to_ram(xy[k],
+							src_node,
+							host[k],
+							dst_node,
+							from->nx * sizeof(float),
+							0,
+							NULL,
+							queue);
+		assert(ret == CL_SUCCESS);
+	}
+
+	soa->x = host[0];
+	soa->y = host[1];
+	return 0;
+}
+#endif /* STARPU_USE_OPENCL */
+
+/*
+ * Copy methods used by the multiformat handle of this example.
+ *
+ * Not const-qualified: multiformat_filter.c declares this object as a plain
+ * `extern struct starpu_data_copy_methods` and stores its address in the
+ * non-const `copy` field of struct starpu_multiformat_data_interface_ops, so
+ * the definition must have the same type as that declaration.
+ */
+struct starpu_data_copy_methods my_multiformat_copy_data_methods_s =
+{
+	.ram_to_ram = NULL,
+	.ram_to_spu = NULL,
+#ifdef STARPU_USE_CUDA
+	.ram_to_cuda        = copy_ram_to_cuda,
+	.cuda_to_ram        = copy_cuda_to_ram,
+	.ram_to_cuda_async  = copy_ram_to_cuda_async,
+	.cuda_to_ram_async  = copy_cuda_to_ram_async,
+	.cuda_to_cuda       = NULL,
+	.cuda_to_cuda_async = NULL,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl       = copy_ram_to_opencl,
+	.opencl_to_ram       = copy_opencl_to_ram,
+	.opencl_to_opencl    = NULL,
+	.ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
+	.cuda_to_spu = NULL,
+	.spu_to_ram  = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu  = NULL
+};

+ 37 - 0
examples/filters/multiformat/multiformat_types.h

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#ifndef MULTIFORMAT_TYPES_H
+#define MULTIFORMAT_TYPES_H
+
+#define N_ELEMENTS 6
+
+/* Layout with one separate array per coordinate. */
+struct struct_of_arrays
+{
+	float *x;	/* all the x values */
+	float *y;	/* all the y values */
+};
+
+/* One element of the array-of-structures layout. */
+struct point
+{
+	float x;
+	float y;
+};
+
+/* fprintf() wrapper that prints nothing when STARPU_SSILENT is set in the environment. */
+#define FPRINTF(ofile, fmt, args ...)                \
+	do                                           \
+	{                                            \
+		if (!getenv("STARPU_SSILENT"))       \
+			fprintf(ofile, fmt, ##args); \
+	} while(0)
+
+#endif

+ 96 - 0
examples/filters/multiformat/opencl.c

@@ -0,0 +1,96 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "multiformat_types.h"
+
+extern struct starpu_opencl_program opencl_program;
+
+/*
+ * OpenCL implementation of the scaling task: runs the "multiformat_opencl"
+ * kernel of opencl_program on the two buffers found in the struct_of_arrays
+ * returned by STARPU_MULTIFORMAT_GET_OPENCL_PTR, then waits for completion.
+ *
+ * buffers[0] : the multiformat buffer.
+ * args       : unused.
+ */
+void multiformat_scal_opencl_func(void *buffers[], void *args)
+{
+	FPRINTF(stderr, "ENTER %s\n", __func__);
+	(void) args;
+	int id, devid;
+	cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+	/* Nothing to scale, and a zero-sized NDRange cannot be enqueued. */
+	if (n == 0)
+		return;
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"multiformat_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	struct struct_of_arrays *soa = (struct struct_of_arrays *)
+			STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+	cl_mem x = (cl_mem) soa->x;
+	cl_mem y = (cl_mem) soa->y;
+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+	err = clSetKernelArg(kernel, 1, sizeof(y), &y);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global;
+		size_t local;
+		size_t s;
+		cl_device_id device;
+
+		starpu_opencl_get_device(devid, &device);
+
+		err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+		if (local > n)
+			local = n;
+
+		/*
+		 * The global size must be a multiple of the local size. Round
+		 * it up: the kernel ignores work-items whose index is >= n.
+		 */
+		global = ((n + local - 1) / local) * local;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 24 - 0
examples/filters/multiformat/opencl.cl

@@ -0,0 +1,24 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Work-item `gid` multiplies x[gid] by y[gid] in place; work-items whose
+ * index is >= nx do nothing.
+ */
+__kernel void multiformat_opencl(__global float *x,
+				 __global float *y,
+				 int nx)
+{
+	const int gid = get_global_id(0);
+
+	if (gid >= nx)
+		return;
+
+	x[gid] *= y[gid];
+}

+ 2 - 1
include/starpu_data_interfaces.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -364,6 +364,7 @@ struct starpu_multiformat_data_interface_ops
 	struct starpu_codelet *cpu_to_cuda_cl;
 	struct starpu_codelet *cuda_to_cpu_cl;
 #endif
+	struct starpu_data_copy_methods *copy;
 };
 
 struct starpu_multiformat_interface

+ 26 - 0
src/datawizard/filters.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -263,6 +264,29 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, uint32_t gatherin
 		if (child_handle->nchildren > 0)
 			starpu_data_unpartition(child_handle, gathering_node);
 
+		/* If this is a multiformat handle, we must convert the data now */
+		unsigned int id = starpu_get_handle_interface_id(child_handle);
+		if (id == STARPU_MULTIFORMAT_INTERFACE_ID &&
+			_starpu_get_node_kind(child_handle->mf_node) != STARPU_CPU_RAM)
+		{
+			void fake(void *buffers[], void *args) {}
+			struct starpu_codelet cl =
+			{
+				.where = STARPU_CPU,
+				.cpu_funcs = { fake, NULL },
+				.modes = { STARPU_RW },
+				.nbuffers = 1
+			};
+			struct starpu_multiformat_interface *format_interface;
+			format_interface = starpu_data_get_interface_on_node(child_handle, 0);
+			struct starpu_task *task = starpu_task_create();
+			task->handles[0] = child_handle;
+			task->cl = &cl;
+			task->synchronous = 1;
+			if (starpu_task_submit(task) != 0)
+				_STARPU_ERROR("Could not submit the conversion task while unpartitionning\n");
+		}
+
 		int ret;
 		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, NULL, NULL);
 		/* for now we pretend that the RAM is almost unlimited and that gathering
@@ -390,6 +414,8 @@ static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nc
 			handle_child->per_worker[worker].data_interface = calloc(1, interfacesize);
 			STARPU_ASSERT(handle_child->per_worker[worker].data_interface);
 		}
+
+		handle_child->mf_node = handle->mf_node;
 	}
 
 	/* this handle now has children */

+ 4 - 1
src/datawizard/interfaces/multiformat_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -184,6 +184,9 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
 		.nx         = nobjects,
 		.ops        = format_ops
 	};
+
+	if (format_ops->copy)
+		interface_multiformat_ops.copy_methods = format_ops->copy;
 	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
 }