浏览代码

First version of the multiformat interface.

Even though the implementation is not really good, it works. The most interesting part of this patch is obviously the interface API.
Cyril Roelandt 13 年之前
父节点
当前提交
6884c732c5

+ 29 - 1
examples/Makefile.am

@@ -40,6 +40,8 @@ endif
 
 EXTRA_DIST = 					\
 	basic_examples/vector_scal_opencl_kernel.cl \
+	basic_examples/multiformat_opencl_kernel.cl  \
+	basic_examples/multiformat_conversion_codelets_opencl_kernel.cl \
 	common/blas_model.c			\
 	spmv/spmv_cuda.cu			\
 	spmv/spmv_opencl.cl			\
@@ -123,7 +125,8 @@ noinst_HEADERS = 				\
 	spmv/matrix_market/mmio.h		\
 	spmv/matrix_market/mm_to_bcsr.h		\
 	spmv/spmv.h				\
-	spmv/dw_block_spmv.h
+	spmv/dw_block_spmv.h                    \
+	basic_examples/multiformat_types.h
 
 #####################################
 # What to install and what to check #
@@ -152,6 +155,7 @@ examplebin_PROGRAMS +=				\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
+	basic_examples/multiformat              \
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
@@ -212,6 +216,7 @@ STARPU_EXAMPLES +=				\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
+	basic_examples/multiformat              \
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
@@ -297,6 +302,29 @@ basic_examples_vector_scal_fortran_LDADD =	\
 endif
 endif
 
+#######################
+# Multiformat example #
+#######################
+basic_examples_multiformat_SOURCES =                                    \
+	basic_examples/multiformat.c                                    \
+	basic_examples/multiformat_conversion_codelets.c
+
+if STARPU_USE_CUDA
+basic_examples_multiformat_SOURCES+=                                     \
+	basic_examples/multiformat_cuda.cu                               \
+	basic_examples/multiformat_conversion_codelets_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+basic_examples_multiformat_SOURCES+=                                     \
+	basic_examples/multiformat_opencl.c                              \
+	basic_examples/multiformat_conversion_codelets_opencl.c          
+
+nobase_STARPU_OPENCL_DATA_DATA+=                                         \
+	basic_examples/multiformat_opencl_kernel.cl                      \
+	basic_examples/multiformat_conversion_codelets_opencl_kernel.cl
+endif
+
 #################
 # block example #
 #################

+ 189 - 0
examples/basic_examples/multiformat.c

@@ -0,0 +1,189 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+#include "multiformat_types.h"
+
+static struct struct_of_arrays global_struct_of_arrays; /* CPU-format data that register_data() hands to StarPU */
+static  starpu_data_handle global_struct_of_arrays_handle; /* handle filled in by starpu_multiformat_data_register() */
+
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args) /* CPU implementation of cl: x[i] *= y[i]; args is not used */
+{
+	struct struct_of_arrays *s;
+	unsigned int n, i;
+
+	s = STARPU_MULTIFORMAT_GET_PTR(buffers[0]); /* cpu_ptr field of the multiformat interface */
+	n = STARPU_MULTIFORMAT_GET_NX(buffers[0]); /* nx field of the multiformat interface */
+
+	for (i = 0; i < n; i++) {
+		s->x[i] *= s->y[i];
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern starpu_codelet cpu_to_cuda_cl;
+extern starpu_codelet cuda_to_cpu_cl;
+#endif
+
+#ifdef STARPU_USE_OPENCL
+extern starpu_codelet cpu_to_opencl_cl;
+extern starpu_codelet opencl_to_cpu_cl;
+#endif
+
+static struct starpu_multiformat_data_interface_ops format_ops = { /* element sizes and conversion codelets for each device format */
+#ifdef STARPU_USE_CUDA
+	.cuda_elemsize = sizeof(struct point),
+	.cpu_to_cuda_cl = &cpu_to_cuda_cl,
+	.cuda_to_cpu_cl = &cuda_to_cpu_cl,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_elemsize = sizeof(struct point),
+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
+#endif
+	/* Per-element size of the CPU layout: N_ELEMENTS * cpu_elemsize is exactly the registered object. */
+	.cpu_elemsize = sizeof(global_struct_of_arrays) / N_ELEMENTS,
+};
+
+#ifdef STARPU_USE_CUDA
+extern void multiformat_scal_cuda_func(void *buffers[], void *arg);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void multiformat_scal_opencl_func(void *buffers[], void *arg);
+#endif
+
+static struct starpu_perfmodel_t conversion_model = { /* history-based model attached to cl.conversion_model */
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "multiformat_conversion_model"
+};
+
+static starpu_codelet  cl = { /* scaling codelet working on a single buffer */
+	.where = STARPU_CUDA | STARPU_OPENCL, /* NOTE(review): STARPU_CPU is not listed although cpu_func is set -- confirm this is intentional */
+	.cpu_func = multiformat_scal_cpu_func,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = multiformat_scal_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = multiformat_scal_opencl_func,
+#endif
+	.nbuffers = 1,
+	.conversion_model = &conversion_model
+};
+
+/*
+ * Main functions 
+ */
+static void
+init_problem_data(void) /* fills x with 1.0, 2.0, ..., N_ELEMENTS and every y with 42.0 */
+{
+	int i; 
+	for (i = 0; i < N_ELEMENTS; i++) {
+		global_struct_of_arrays.x[i] = 1.0f + i;
+		global_struct_of_arrays.y[i] = 42.0;
+	}
+}
+
+static void
+register_data(void) /* registers global_struct_of_arrays through the multiformat interface */
+{
+	starpu_multiformat_data_register(&global_struct_of_arrays_handle,
+					 0, /* home_node */
+					 &global_struct_of_arrays,
+					 N_ELEMENTS, /* nobjects */
+					 &format_ops);
+}
+
+static void
+create_and_submit_tasks(void) /* submits two synchronous tasks that run cl on the same handle in RW mode */
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &cl;
+	task->synchronous = 1;
+	task->buffers[0].handle = global_struct_of_arrays_handle;
+	task->buffers[0].mode = STARPU_RW;
+	task->cl_arg = NULL;
+	task->cl_arg_size = 0;
+	starpu_task_submit(task); /* NOTE(review): the return value is ignored, so a failed submission goes unnoticed */
+
+	struct starpu_task *task2 = starpu_task_create();
+	task2->cl = &cl;
+	task2->synchronous = 1;
+	task2->buffers[0].handle = global_struct_of_arrays_handle;
+	task2->buffers[0].mode = STARPU_RW;
+	task2->cl_arg = NULL;
+	task2->cl_arg_size = 0;
+	starpu_task_submit(task2); /* NOTE(review): return value ignored here as well */
+}
+
+static void
+unregister_data(void) /* unregisters the handle created by register_data() */
+{
+	starpu_data_unregister(global_struct_of_arrays_handle);
+}
+
+static void
+print_it(void) /* prints every (x y) pair of global_struct_of_arrays on one stderr line */
+{
+	int i;
+	for (i = 0; i < N_ELEMENTS; i++) {
+		fprintf(stderr, "(%.2f %.2f) ",
+			global_struct_of_arrays.x[i],
+			global_struct_of_arrays.y[i]);
+	}
+	fprintf(stderr, "\n");
+}
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program; /* loaded in main(); declared extern in multiformat_opencl.c */
+struct starpu_opencl_program opencl_conversion_program; /* loaded in main(); declared extern in multiformat_conversion_codelets_opencl.c */
+#endif
+
+int
+main(void) /* prints the data, runs the two scaling tasks through StarPU, then prints the data again */
+{
+	starpu_init(NULL);
+
+#ifdef STARPU_USE_OPENCL
+	starpu_opencl_load_opencl_from_file("examples/basic_examples/multiformat_opencl_kernel.cl",
+					    &opencl_program, NULL);
+	starpu_opencl_load_opencl_from_file("examples/basic_examples/multiformat_conversion_codelets_opencl_kernel.cl", 
+		&opencl_conversion_program, NULL);
+#endif
+	init_problem_data();
+
+	print_it(); /* values before the tasks run */
+
+	register_data();
+
+	create_and_submit_tasks();
+
+	unregister_data();
+
+	print_it(); /* values after the handle has been unregistered */
+
+#ifdef STARPU_USE_OPENCL
+        starpu_opencl_unload_opencl(&opencl_program);
+        starpu_opencl_unload_opencl(&opencl_conversion_program);
+#endif
+	starpu_shutdown();
+
+
+	return 0;
+}

+ 99 - 0
examples/basic_examples/multiformat_conversion_codelets.c

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include "multiformat_types.h"
+
+#ifdef STARPU_USE_CUDA
+void cpu_to_cuda(void *buffers[], void *arg) /* copies the struct-of-arrays behind cpu_ptr into the struct point array behind cuda_ptr; arg is not used */
+{
+	struct struct_of_arrays *src = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	struct point *dst = STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int i;
+	for (i = 0; i < n; i++) {
+		dst[i].x = src->x[i];
+		dst[i].y = src->y[i];
+	}
+}
+
+void cuda_to_cpu(void *buffers[], void *arg) /* inverse of cpu_to_cuda(): struct point array (cuda_ptr) back into the struct-of-arrays (cpu_ptr); arg is not used */
+{
+	struct point *src = STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	struct struct_of_arrays *dst = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int i;
+	for (i = 0; i < n; i++) {
+		dst->x[i] = src[i].x;
+		dst->y[i] = src[i].y;
+	}
+}
+
+extern void cpu_to_cuda_cuda_func(void *buffers[], void *args); /* defined in multiformat_conversion_codelets_cuda.cu */
+starpu_codelet cpu_to_cuda_cl = { /* CPU-format to CUDA-format conversion codelet, referenced by format_ops in multiformat.c */
+	.where = STARPU_CUDA,
+	.cpu_func = cpu_to_cuda,
+	.cuda_func = cpu_to_cuda_cuda_func,
+	.nbuffers = 1
+};
+
+starpu_codelet cuda_to_cpu_cl = { /* CUDA-format to CPU-format conversion codelet; only a CPU implementation is provided */
+	.where = STARPU_CPU,
+	.cpu_func = cuda_to_cpu,
+	.nbuffers = 1
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void cpu_to_opencl(void *buffers[], void *arg) /* copies the struct-of-arrays behind cpu_ptr into the struct point array behind opencl_ptr; arg is not used */
+{
+	fprintf(stderr, "User Entering %s\n", __func__); /* trace output; the CUDA counterpart prints nothing */
+	struct struct_of_arrays *src = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	struct point *dst = STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int i;
+	for (i = 0; i < n; i++) {
+		dst[i].x = src->x[i];
+		dst[i].y = src->y[i];
+	}
+}
+
+void opencl_to_cpu(void *buffers[], void *arg) /* inverse of cpu_to_opencl(): struct point array (opencl_ptr) back into the struct-of-arrays (cpu_ptr); arg is not used */
+{
+	fprintf(stderr, "User Entering %s\n", __func__); /* trace output; the CUDA counterpart prints nothing */
+	struct point *src = STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+	struct struct_of_arrays *dst = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int i;
+	for (i = 0; i < n; i++) {
+		dst->x[i] = src[i].x;
+		dst->y[i] = src[i].y;
+	}
+}
+
+extern void cpu_to_opencl_opencl_func(void *buffers[], void *args); /* defined in multiformat_conversion_codelets_opencl.c */
+starpu_codelet cpu_to_opencl_cl = { /* CPU-format to OpenCL-format conversion codelet, referenced by format_ops in multiformat.c */
+	.where = STARPU_OPENCL,
+	.cpu_func = cpu_to_opencl,
+	.opencl_func = cpu_to_opencl_opencl_func,
+	.nbuffers = 1
+};
+
+starpu_codelet opencl_to_cpu_cl = { /* OpenCL-format to CPU-format conversion codelet; only a CPU implementation is provided */
+	.where = STARPU_CPU,
+	.cpu_func = opencl_to_cpu,
+	.nbuffers = 1
+};
+#endif

+ 46 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -0,0 +1,46 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "multiformat_types.h"
+
+static __global__ void cpu_to_cuda_cuda(struct struct_of_arrays *src, /* kernel: each thread converts at most one element */
+	struct point *dst, unsigned n)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x; /* global thread index */
+
+	if (i >= n) /* threads past the last element do nothing */
+		return;
+	dst[i].x = src->x[i];
+	dst[i].y = src->y[i];
+
+}
+
+extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args) /* CUDA implementation of cpu_to_cuda_cl; _args is not used */
+{
+	struct struct_of_arrays *src;
+	src = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
+	struct point *dst;
+	dst = (struct point *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block; /* round up so every element gets a thread */
+	/* The kernel declares no dynamic shared memory, so none is requested at launch. */
+	cpu_to_cuda_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(src, dst, n);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); /* wait for the kernel before returning */
+}

+ 98 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -0,0 +1,98 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+extern struct starpu_opencl_program opencl_conversion_program;
+
+void cpu_to_opencl_opencl_func(void *buffers[], void *args) /* OpenCL implementation of cpu_to_opencl_cl: runs the "cpu_to_opencl_opencl" kernel */
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_PTR(buffers[0]); /* cpu_ptr field, passed as kernel argument 0 */
+	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); /* opencl_ptr field, passed as kernel argument 1 */
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_conversion_program,
+					"cpu_to_opencl_opencl",
+					devid);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(src), &src);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 1, sizeof(dst), &dst);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	{
+		size_t global=n; /* one work-item per element */
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global) /* never ask for a work-group larger than the whole range */
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue, /* NOTE(review): global is not rounded up to a multiple of local -- confirm this is fine for n larger than the work-group size */
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue); /* block until the queued kernel has completed */
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 26 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl_kernel.cl

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include "examples/basic_examples/multiformat_types.h"
+__kernel void cpu_to_opencl_opencl(__global struct struct_of_arrays *src, /* kernel: each work-item converts at most one element */
+				__global struct point *dst,
+				unsigned int n)
+{
+	const unsigned int i = get_global_id(0);
+	if (i < n) { /* work-items past the last element do nothing */
+		dst[i].x = src->x[i];
+		dst[i].y = src->y[i];
+	}
+}

+ 41 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -0,0 +1,41 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "multiformat_types.h"
+static __global__ void multiformat_cuda(struct point *val, unsigned n) /* kernel: val[i].x *= val[i].y, at most one element per thread */
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x; /* global thread index */
+
+	if (i >= n) /* threads past the last element do nothing */
+		return;
+	
+	val[i].x *= val[i].y;
+}
+
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args) /* CUDA implementation of the scaling codelet */
+{
+	(void) _args;
+
+	fprintf(stderr, "Running the cuda kernel (%s)\n", __func__);
+	unsigned int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	struct point *val =  (struct point *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block; /* round up so every element gets a thread */
+	multiformat_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n); /* the kernel uses no dynamic shared memory, so request none */
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); /* wait for the kernel before returning */
+}

+ 91 - 0
examples/basic_examples/multiformat_opencl.c

@@ -0,0 +1,91 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+extern struct starpu_opencl_program opencl_program;
+
+void multiformat_scal_opencl_func(void *buffers[], void *args) /* OpenCL implementation of the scaling codelet: runs the "multiformat_opencl" kernel */
+{
+	(void) args;
+	int id, devid;
+	cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); /* opencl_ptr field, passed as kernel argument 0 */
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"multiformat_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 1, sizeof(n), &n);
+	if (err != CL_SUCCESS) /* same explicit comparison as every other check in this function */
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=n; /* one work-item per element */
+		size_t local;
+		size_t s;
+		cl_device_id device;
+
+		starpu_opencl_get_device(devid, &device);
+
+		err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+		if (local > global) /* never ask for a work-group larger than the whole range */
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue); /* block until the queued kernel has completed */
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 24 - 0
examples/basic_examples/multiformat_opencl_kernel.cl

@@ -0,0 +1,24 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "examples/basic_examples/multiformat_types.h"
+__kernel void multiformat_opencl(__global struct point* val, int nx) /* kernel: val[i].x *= val[i].y, at most one element per work-item */
+{
+        const int i = get_global_id(0);
+        if (i < nx) { /* work-items past the last element do nothing */
+                val[i].x *= val[i].y;
+        }
+}

+ 30 - 0
examples/basic_examples/multiformat_types.h

@@ -0,0 +1,30 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#ifndef MULTIFORMAT_TYPES_H
+#define MULTIFORMAT_TYPES_H
+
+#define N_ELEMENTS 10
+
+struct struct_of_arrays{ /* layout used by the CPU code: two parallel arrays of N_ELEMENTS floats */
+	float x[N_ELEMENTS];
+	float y[N_ELEMENTS];
+};
+struct point { /* layout used by the CUDA and OpenCL kernels: one (x, y) pair per element */
+	float x, y;
+};
+
+
+#endif

+ 52 - 1
include/starpu_data_interfaces.h

@@ -314,6 +314,56 @@ uint32_t starpu_bcsr_get_r(starpu_data_handle);
 uint32_t starpu_bcsr_get_c(starpu_data_handle);
 size_t starpu_bcsr_get_elemsize(starpu_data_handle);
 
+/*
+ * Multiformat interface
+ */
+struct starpu_multiformat_data_interface_ops { /* per-device element sizes and conversion codelets supplied at registration */
+	size_t cpu_elemsize; /* size in bytes of one element in the CPU format */
+#ifdef STARPU_USE_OPENCL
+	size_t opencl_elemsize; /* size in bytes of one element in the OpenCL format */
+	void *cpu_to_opencl_cl; /* conversion codelet; the multiformat example stores the address of a starpu_codelet here */
+	void *opencl_to_cpu_cl; /* conversion codelet for the opposite direction */
+#endif
+#ifdef STARPU_USE_CUDA
+	size_t cuda_elemsize; /* size in bytes of one element in the CUDA format */
+	void *cpu_to_cuda_cl; /* conversion codelet; the multiformat example stores the address of a starpu_codelet here */
+	void *cuda_to_cpu_cl; /* conversion codelet for the opposite direction */
+#endif
+};
+
+typedef struct starpu_multiformat_interface_s { /* the interface structure that the STARPU_MULTIFORMAT_GET_* macros read */
+	void *cpu_ptr; /* returned by STARPU_MULTIFORMAT_GET_PTR */
+#ifdef STARPU_USE_CUDA
+	void *cuda_ptr; /* returned by STARPU_MULTIFORMAT_GET_CUDA_PTR */
+#endif
+#ifdef STARPU_USE_OPENCL
+	void *opencl_ptr; /* returned by STARPU_MULTIFORMAT_GET_OPENCL_PTR */
+#endif
+	uintptr_t dev_handle;
+	size_t offset;
+	uint32_t nx; /* returned by STARPU_MULTIFORMAT_GET_NX */
+	struct starpu_multiformat_data_interface_ops *ops; /* presumably the format_ops passed at registration -- TODO confirm */
+	double conversion_time; /* summed and reset by _starpu_task_get_conversion_time() in src/core/task.c */
+} starpu_multiformat_interface_t;
+
+void starpu_multiformat_data_register(starpu_data_handle *handle, /* registers a piece of data through the multiformat interface */
+				      uint32_t home_node,
+				      void *ptr,
+				      uint32_t nobjects, /* the multiformat example passes its element count here */
+				      struct starpu_multiformat_data_interface_ops *format_ops);
+
+#define STARPU_MULTIFORMAT_GET_PTR(interface)  (((starpu_multiformat_interface_t *)(interface))->cpu_ptr) /* cpu_ptr of the interface */
+
+#ifdef STARPU_USE_CUDA
+#define STARPU_MULTIFORMAT_GET_CUDA_PTR(interface) (((starpu_multiformat_interface_t *)(interface))->cuda_ptr) /* cuda_ptr of the interface */
+#endif
+
+#ifdef STARPU_USE_OPENCL
+#define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((starpu_multiformat_interface_t *)(interface))->opencl_ptr) /* opencl_ptr of the interface */
+#endif
+
+#define STARPU_MULTIFORMAT_GET_NX(interface)  (((starpu_multiformat_interface_t *)(interface))->nx) /* nx of the interface */
+
 #define STARPU_MATRIX_INTERFACE_ID	0
 #define STARPU_BLOCK_INTERFACE_ID	1
 #define STARPU_VECTOR_INTERFACE_ID	2
@@ -321,7 +371,8 @@ size_t starpu_bcsr_get_elemsize(starpu_data_handle);
 #define STARPU_BCSR_INTERFACE_ID	4
 #define STARPU_VARIABLE_INTERFACE_ID	5
 #define STARPU_VOID_INTERFACE_ID	6
-#define STARPU_NINTERFACES_ID		7 /* number of data interfaces */
+#define STARPU_MULTIFORMAT_INTERFACE_ID 7
+#define STARPU_NINTERFACES_ID		8 /* number of data interfaces */
 
 unsigned starpu_get_handle_interface_id(starpu_data_handle);
 

+ 2 - 0
include/starpu_scheduler.h

@@ -183,6 +183,8 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
 /* Returns expected power consumption in J */
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+/* Returns expected conversion time in ms (multiformat interface only) */
+double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 
 #ifdef __cplusplus
 }

+ 3 - 0
include/starpu_task.h

@@ -99,6 +99,9 @@ typedef struct starpu_codelet_t {
 	 * In the case of parallel codelets, accounts for all units. */
 	struct starpu_perfmodel_t *power_model;
 
+	/* Conversion model of the codelet */
+	struct starpu_perfmodel_t *conversion_model;
+
 	/* statistics collected at runtime: this is filled by StarPU and should
 	 * not be accessed directly (use the starpu_display_codelet_stats
 	 * function instead for instance). */

+ 1 - 0
src/Makefile.am

@@ -182,6 +182,7 @@ libstarpu_la_SOURCES = 						\
 	datawizard/interfaces/vector_filters.c			\
 	datawizard/interfaces/variable_interface.c		\
 	datawizard/interfaces/void_interface.c			\
+	datawizard/interfaces/multiformat_interface.c           \
 	util/malloc.c						\
 	util/execute_on_all.c					\
 	util/starpu_create_sync_task.c				\

+ 5 - 0
src/core/perfmodel/perfmodel.c

@@ -196,6 +196,11 @@ double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_arc
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 }
 
+double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl) /* same lookup as starpu_task_expected_power(), but on the codelet's conversion_model */
+{
+	return starpu_model_expected_perf(task, task->cl->conversion_model, arch, nimpl);
+}
+
 /* Predict the transfer time (in µs) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode)
 {

+ 23 - 0
src/core/task.c

@@ -387,3 +387,26 @@ void _starpu_set_current_task(struct starpu_task *task)
 {
 	pthread_setspecific(current_task_key, task);
 }
+
+double _starpu_task_get_conversion_time(struct starpu_task *task) /* sums, then clears, conversion_time over the task's multiformat buffers */
+{
+	int i;
+	double conversion_time = 0.0;
+
+	for (i = 0; i < task->cl->nbuffers; i++) {
+		starpu_data_handle handle = task->buffers[i].handle;
+		unsigned int id = starpu_get_handle_interface_id(handle);
+		if (id == STARPU_MULTIFORMAT_INTERFACE_ID) { /* buffers using any other interface contribute nothing */
+			starpu_multiformat_interface_t *tmp;
+			uint32_t node = starpu_worker_get_memory_node(task->workerid);
+			tmp = starpu_data_get_interface_on_node(handle, node);
+			conversion_time += tmp->conversion_time;
+			/* XXX : this may not be the right place to reset this field,
+			 * but we need to make sure the conversion time won't be counted
+			 * twice */
+			tmp->conversion_time = 0;
+		}
+	}
+
+	return conversion_time;
+}

+ 605 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -0,0 +1,605 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <common/config.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy_driver.h>
+#include <datawizard/filters.h>
+#include <common/hash.h>
+#include <starpu_cuda.h>
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+#ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream);
+#endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+#endif
+
+/* Table of the transfer routines of the multiformat interface, referenced by
+ * interface_multiformat_ops.copy_methods. The CUDA and OpenCL entries are only
+ * filled in when the corresponding STARPU_USE_* macro is defined; every
+ * SPU-related entry is explicitly NULL. */
+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s = {
+	.ram_to_ram = copy_ram_to_ram,
+	.ram_to_spu = NULL,
+#ifdef STARPU_USE_CUDA
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+	.opencl_to_opencl = copy_opencl_to_opencl,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
+static void register_multiformat_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void *multiformat_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node);
+static size_t multiformat_interface_get_size(starpu_data_handle handle);
+static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle handle);
+static int multiformat_compare(void *data_interface_a, void *data_interface_b);
+static void display_multiformat_interface(starpu_data_handle handle, FILE *f);
+static uint32_t starpu_multiformat_get_nx(starpu_data_handle handle);
+#ifdef STARPU_USE_GORDON
+static int convert_multiformat_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+
+
+/* Operations of the multiformat data interface; this is the table handed to
+ * starpu_data_register() by starpu_multiformat_data_register().
+ * NOTE(review): convert_to_gordon is set to NULL although a
+ * convert_multiformat_to_gordon() prototype is declared earlier in this file
+ * (no definition is visible here). */
+static struct starpu_data_interface_ops_t interface_multiformat_ops = {
+	.register_data_handle  = register_multiformat_handle,
+	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
+	.handle_to_pointer     = multiformat_handle_to_pointer,
+	.free_data_on_node     = free_multiformat_buffer_on_node,
+	.copy_methods          = &multiformat_copy_data_methods_s,
+	.get_size              = multiformat_interface_get_size,
+	.footprint             = footprint_multiformat_interface_crc32,
+	.compare               = multiformat_compare,
+#ifdef STARPU_USE_GORDON
+	.convert_to_gordon     = NULL,
+#endif
+	.interfaceid           = STARPU_MULTIFORMAT_INTERFACE_ID,
+	.interface_size        = sizeof(starpu_multiformat_interface_t),
+	.display               = display_multiformat_interface
+};
+
+/* Return the buffer of a handle on a given memory node.
+ *
+ * The pointer returned depends on the kind of the node: cpu_ptr for CPU
+ * memory, cuda_ptr for CUDA memory, opencl_ptr for OpenCL memory. The handle
+ * must already be allocated on that node (asserted). Any other kind of node
+ * triggers an assertion; NULL is returned if assertions are compiled out. */
+static void *multiformat_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+	starpu_multiformat_interface_t *multiformat_interface =
+		starpu_data_get_interface_on_node(handle, node);
+
+	switch(_starpu_get_node_kind(node)) {
+		case STARPU_CPU_RAM:
+			return multiformat_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			return multiformat_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			return multiformat_interface->opencl_ptr;
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	/* Only reached for an unsupported node kind when the assertion above
+	 * does not stop the program: never fall off a non-void function */
+	return NULL;
+}
+
+/* Initialise the per-node replicas of a freshly registered handle.
+ *
+ * The replica on home_node inherits the pointers, device handle and offset of
+ * the interface given at registration; every other node starts with NULL
+ * pointers and a zero handle/offset. nx and ops are copied to all the nodes,
+ * and the recorded conversion time is cleared everywhere. */
+static void register_multiformat_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+{
+	starpu_multiformat_interface_t *registered = data_interface;
+	unsigned node;
+
+	for (node = 0; node < STARPU_MAXNODES; node++) {
+		starpu_multiformat_interface_t *replica =
+			starpu_data_get_interface_on_node(handle, node);
+		int is_home = (node == home_node);
+
+		replica->cpu_ptr    = is_home ? registered->cpu_ptr : NULL;
+#ifdef STARPU_USE_CUDA
+		replica->cuda_ptr   = is_home ? registered->cuda_ptr : NULL;
+#endif
+#ifdef STARPU_USE_OPENCL
+		replica->opencl_ptr = is_home ? registered->opencl_ptr : NULL;
+#endif
+		replica->dev_handle = is_home ? registered->dev_handle : 0;
+		replica->offset     = is_home ? registered->offset : 0;
+
+		/* Fields that do not depend on the node */
+		replica->nx = registered->nx;
+		replica->ops = registered->ops;
+		replica->conversion_time = 0;
+	}
+}
+
+/* Register a piece of data with the multiformat interface.
+ *
+ * handleptr:  where the new handle is stored (filled by starpu_data_register)
+ * home_node:  memory node on which ptr lives
+ * ptr:        user buffer, stored as cpu_ptr and as dev_handle
+ * nobjects:   number of elements, stored as nx
+ * format_ops: per-format element sizes and conversion codelets
+ *
+ * The CUDA and OpenCL pointers start out NULL. */
+void starpu_multiformat_data_register(starpu_data_handle *handleptr,
+				      uint32_t home_node,
+				      void *ptr,
+				      uint32_t nobjects,
+				      struct starpu_multiformat_data_interface_ops *format_ops)
+{
+	starpu_multiformat_interface_t multiformat = {
+		.cpu_ptr    = ptr,
+#ifdef STARPU_USE_CUDA
+		.cuda_ptr   = NULL,
+#endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_ptr = NULL,
+#endif
+		.nx         = nobjects,
+		.dev_handle = (uintptr_t) ptr,
+		.offset     = 0,
+		.ops        = format_ops
+	};
+	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
+}
+
+/* Footprint of a multiformat handle: _starpu_crc32_be() of the element count
+ * reported by starpu_multiformat_get_nx(), with 0 as second argument. The
+ * element sizes do not take part in the footprint. */
+static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle handle)
+{
+	return _starpu_crc32_be(starpu_multiformat_get_nx(handle), 0);
+}
+
+/* Tell whether two multiformat interfaces describe the same layout.
+ * Returns non-zero when both have the same number of elements and the same
+ * element size in every format that is compiled in, 0 otherwise. */
+static int multiformat_compare(void *data_interface_a, void *data_interface_b)
+{
+	starpu_multiformat_interface_t *multiformat_a = data_interface_a;
+	starpu_multiformat_interface_t *multiformat_b = data_interface_b;
+
+	return ((multiformat_a->nx == multiformat_b->nx)
+			&& (multiformat_a->ops->cpu_elemsize == multiformat_b->ops->cpu_elemsize)
+#ifdef STARPU_USE_CUDA
+			&& (multiformat_a->ops->cuda_elemsize == multiformat_b->ops->cuda_elemsize)
+#endif
+#ifdef STARPU_USE_OPENCL
+			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
+#endif
+		);
+}
+
+/* Display hook registered as .display in interface_multiformat_ops.
+ * Not implemented yet: neither parameter is used and the body only contains
+ * a failing assertion. */
+static void display_multiformat_interface(starpu_data_handle handle, FILE *f)
+{
+	/* TODO */
+	STARPU_ASSERT(0);
+}
+
+/* Size in bytes of the data described by a handle.
+ * XXX : this is the size of the CPU format (nx * cpu_elemsize), read from the
+ * replica on node 0, whatever the node the data actually lives on. */
+static size_t multiformat_interface_get_size(starpu_data_handle handle)
+{
+	starpu_multiformat_interface_t *replica = starpu_data_get_interface_on_node(handle, 0);
+
+	return replica->nx * replica->ops->cpu_elemsize;
+}
+
+/* Number of elements of a multiformat handle, read from the replica on
+ * node 0. */
+uint32_t starpu_multiformat_get_nx(starpu_data_handle handle)
+{
+	starpu_multiformat_interface_t *replica = starpu_data_get_interface_on_node(handle, 0);
+
+	return replica->nx;
+}
+
+/* Release the buffer that allocate_multiformat_buffer_on_node() attached to
+ * a replica, and reset the matching pointer to NULL.
+ *
+ * data_interface: replica of the data on the node
+ * node:           memory node the replica belongs to; its kind selects
+ *                 free(), cudaFree() or clReleaseMemObject()
+ *
+ * NOTE(review): the host-side staging buffers that the copy methods of this
+ * file malloc() into cuda_ptr / opencl_ptr of a CPU replica are not released
+ * here. */
+static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node)
+{
+	starpu_multiformat_interface_t *multiformat_interface;
+	multiformat_interface = (starpu_multiformat_interface_t *) data_interface;
+	starpu_node_kind kind = _starpu_get_node_kind(node);
+
+	switch(kind) {
+		case STARPU_CPU_RAM:
+			free(multiformat_interface->cpu_ptr);
+			multiformat_interface->cpu_ptr = NULL;
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			cudaFree(multiformat_interface->cuda_ptr);
+			multiformat_interface->cuda_ptr = NULL;
+			break;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			/* On an OpenCL node, opencl_ptr holds the memory object
+			 * stored by the allocation routine (the copy methods use
+			 * the same value as a cl_mem through dev_handle) */
+			if (multiformat_interface->opencl_ptr != NULL) {
+				cl_int err = clReleaseMemObject((cl_mem) multiformat_interface->opencl_ptr);
+				if (STARPU_UNLIKELY(err != CL_SUCCESS))
+					STARPU_OPENCL_REPORT_ERROR(err);
+				multiformat_interface->opencl_ptr = NULL;
+			}
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+	}
+}
+
+/* Allocate room for the data on a memory node.
+ *
+ * data_interface_: replica of the data on dst_node; on success its pointer
+ *                  for that kind of node (cpu_ptr, cuda_ptr or opencl_ptr)
+ *                  and dev_handle are set, and offset is reset to 0
+ * dst_node:        memory node to allocate on
+ *
+ * The size is nx times the element size of the format used on that kind of
+ * node. Returns the number of bytes allocated, or -ENOMEM when the node is
+ * out of memory. */
+static ssize_t allocate_multiformat_buffer_on_node(void *data_interface_, uint32_t dst_node)
+{
+	starpu_multiformat_interface_t *multiformat_interface;
+	multiformat_interface = (starpu_multiformat_interface_t *) data_interface_;
+	unsigned fail = 0;
+	uintptr_t addr = 0;
+	ssize_t allocated_memory = 0;
+
+	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
+	switch(kind) {
+		case STARPU_CPU_RAM:
+			allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
+			addr = (uintptr_t)malloc(allocated_memory);
+			if (!addr) {
+				fail = 1;
+			}
+			else {
+				multiformat_interface->cpu_ptr = (void *) addr;
+				multiformat_interface->dev_handle = addr;
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			{
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize;
+				cudaError_t status = cudaMalloc((void **)&addr, allocated_memory);
+				if (STARPU_UNLIKELY(status != cudaSuccess)) {
+					/* Running out of device memory is reported
+					 * to the caller like for the other kinds of
+					 * nodes; any other error is unexpected */
+					if (status != cudaErrorMemoryAllocation)
+						STARPU_CUDA_REPORT_ERROR(status);
+					fail = 1;
+				}
+				else {
+					multiformat_interface->cuda_ptr = (void *)addr;
+					multiformat_interface->dev_handle = addr;
+				}
+				break;
+			}
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			{
+				int ret;
+				void *ptr;
+				allocated_memory = multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize;
+				ret = _starpu_opencl_allocate_memory(&ptr, allocated_memory, CL_MEM_READ_WRITE);
+				if (ret) {
+					fail = 1;
+				}
+				else {
+					addr = (uintptr_t)ptr;
+					multiformat_interface->opencl_ptr = (void *)addr;
+					multiformat_interface->dev_handle = addr;
+				}
+				break;
+			}
+#endif
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	if (fail)
+		return -ENOMEM;
+
+	multiformat_interface->offset = 0;
+	return allocated_memory;
+}
+
+
+
+
+/*
+ * Copy methods
+ */
+/* Copy the CPU-format buffer of one host replica into another one.
+ * nx * cpu_elemsize bytes, as described by the destination replica, are
+ * copied with memcpy(). No format conversion is involved and the node
+ * numbers are not used. Always returns 0. */
+static int copy_ram_to_ram(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node)
+{
+	starpu_multiformat_interface_t *src_multiformat = src_interface;
+	starpu_multiformat_interface_t *dst_multiformat = dst_interface;
+	size_t nbytes;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	nbytes = dst_multiformat->nx * dst_multiformat->ops->cpu_elemsize;
+	memcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, nbytes);
+
+	return 0;
+}
+
+#ifdef STARPU_USE_CUDA
+/* Synchronous transfer between a host replica and a CUDA replica, with
+ * format conversion on the host.
+ *
+ * kind == cudaMemcpyHostToDevice: the cpu_to_cuda_cl codelet converts the
+ *   data of the source replica into its host-side cuda_ptr staging buffer
+ *   (allocated on first use), the time spent is stored in the destination's
+ *   conversion_time, then the staging buffer is copied to the device.
+ * kind == cudaMemcpyDeviceToHost: the device buffer is copied into the
+ *   host-side cuda_ptr staging buffer of the destination (allocated on first
+ *   use), then the cuda_to_cpu_cl codelet converts it.
+ *
+ * Returns 0 on success, -ENOMEM if a staging buffer cannot be allocated. */
+static int copy_cuda_common(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node,
+			    enum cudaMemcpyKind kind)
+{
+	starpu_multiformat_interface_t *src_multiformat;
+	starpu_multiformat_interface_t *dst_multiformat;
+
+	src_multiformat = (starpu_multiformat_interface_t *) src_interface;
+	dst_multiformat = (starpu_multiformat_interface_t *) dst_interface;
+
+	size_t size;
+
+	cudaError_t status;
+
+	switch (kind) {
+		case cudaMemcpyHostToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			if (src_multiformat->cuda_ptr == NULL) {
+				src_multiformat->cuda_ptr = malloc(size);
+				if (src_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+			/* Converting data , from host to host */
+			double tmp = starpu_timing_now();
+			void *buffers[1];
+			buffers[0] = src_interface;
+			starpu_codelet *cl = src_multiformat->ops->cpu_to_cuda_cl;
+			cl->cpu_func(buffers, NULL);
+			dst_multiformat->conversion_time = starpu_timing_now() - tmp;
+
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status)) {
+				STARPU_CUDA_REPORT_ERROR(status);
+			}
+			break;
+		}
+		case cudaMemcpyDeviceToHost:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			/* Within this file a host replica only gets a cuda_ptr
+			 * staging buffer from a previous host-to-device copy:
+			 * allocate it here when the data was never sent */
+			if (dst_multiformat->cuda_ptr == NULL) {
+				dst_multiformat->cuda_ptr = malloc(size);
+				if (dst_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+
+			/* Converting data, from host to host */
+			void *buffers[1];
+			starpu_codelet *cl = src_multiformat->ops->cuda_to_cpu_cl;
+			buffers[0] = dst_interface;
+			cl->cpu_func(buffers, NULL);
+
+			break;
+		}
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	return 0;
+}
+
+/* Host to CUDA device, synchronous: forwards to copy_cuda_common() with
+ * cudaMemcpyHostToDevice. src_node is tagged STARPU_ATTRIBUTE_UNUSED even
+ * though it is passed on. */
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
+
+/* CUDA device to host, synchronous: forwards to copy_cuda_common() with
+ * cudaMemcpyDeviceToHost. src_node is tagged STARPU_ATTRIBUTE_UNUSED even
+ * though it is passed on. */
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
+
+/* Stream-based variant of copy_cuda_common().
+ *
+ * kind == cudaMemcpyHostToDevice: the data is converted on the host with the
+ *   cpu_to_cuda_cl codelet (the time spent is stored in the destination's
+ *   conversion_time), then cudaMemcpyAsync() is queued on stream.
+ * kind == cudaMemcpyDeviceToHost: the copy is done with a blocking
+ *   cudaMemcpy() (stream is not used) because the cuda_to_cpu_cl conversion
+ *   runs right after it and needs the data.
+ *
+ * The host-side cuda_ptr staging buffer is allocated on first use in both
+ * directions. Returns 0 on success, -ENOMEM if it cannot be allocated. */
+static int copy_cuda_common_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+	starpu_multiformat_interface_t *src_multiformat;
+	starpu_multiformat_interface_t *dst_multiformat;
+
+	src_multiformat = (starpu_multiformat_interface_t *) src_interface;
+	dst_multiformat = (starpu_multiformat_interface_t *) dst_interface;
+
+	size_t size;
+	cudaError_t status;
+
+	switch (kind) {
+		case cudaMemcpyHostToDevice:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			if (src_multiformat->cuda_ptr == NULL) {
+				src_multiformat->cuda_ptr = malloc(size);
+				if (src_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+
+			/* Converting data , from host to host */
+			double tmp = starpu_timing_now();
+			void *buffers[1]; // XXX
+			buffers[0] = src_interface;
+			starpu_codelet *cl = src_multiformat->ops->cpu_to_cuda_cl;
+			cl->cpu_func(buffers, NULL);
+			dst_multiformat->conversion_time = starpu_timing_now() - tmp;
+
+			/* Actual copy from host to device */
+			status = cudaMemcpyAsync(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind, stream);
+			if (STARPU_UNLIKELY(status)) {
+				STARPU_CUDA_REPORT_ERROR(status);
+			}
+			break;
+		}
+		case cudaMemcpyDeviceToHost:
+		{
+			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
+			/* Within this file a host replica only gets a cuda_ptr
+			 * staging buffer from a previous host-to-device copy:
+			 * allocate it here when the data was never sent */
+			if (dst_multiformat->cuda_ptr == NULL) {
+				dst_multiformat->cuda_ptr = malloc(size);
+				if (dst_multiformat->cuda_ptr == NULL)
+					return -ENOMEM;
+			}
+
+			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (STARPU_UNLIKELY(status))
+				STARPU_CUDA_REPORT_ERROR(status);
+
+			/* Converting data */
+			void *buffers[1];
+			starpu_codelet *cl = src_multiformat->ops->cuda_to_cpu_cl;
+			buffers[0] = dst_interface;
+			cl->cpu_func(buffers, NULL);
+
+			break;
+		}
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	return 0;
+}
+
+/* Host to CUDA device on a stream: forwards to copy_cuda_common_async() with
+ * cudaMemcpyHostToDevice. src_node is tagged STARPU_ATTRIBUTE_UNUSED even
+ * though it is passed on. */
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
+}
+
+/* CUDA device to host: forwards to copy_cuda_common_async() with
+ * cudaMemcpyDeviceToHost. src_node is tagged STARPU_ATTRIBUTE_UNUSED even
+ * though it is passed on. */
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
+
+/* Device to device copy between two CUDA nodes. Not implemented yet: the
+ * call trips an assertion, and -ENOSYS is returned if assertions are
+ * compiled out. */
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+	/* TODO */
+	STARPU_ASSERT(0);
+
+	/* Never fall off the end of a non-void function */
+	return -ENOSYS;
+}
+
+/* Stream-based device to device copy between two CUDA nodes. Not implemented
+ * yet: the call trips an assertion, and -ENOSYS is returned if assertions
+ * are compiled out. */
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	/* TODO */
+	STARPU_ASSERT(0);
+
+	/* Never fall off the end of a non-void function */
+	return -ENOSYS;
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+/* Transfer from a host replica to an OpenCL replica, with format conversion
+ * on the host.
+ *
+ * The cpu_to_opencl_cl codelet converts the data of the source replica into
+ * its host-side opencl_ptr staging buffer (allocated on first use); the time
+ * spent is stored in the destination's conversion_time. The staging buffer
+ * is then written to the device buffer designated by the destination's
+ * dev_handle and offset.
+ *
+ * _event: cl_event pointer handed to the OpenCL copy helper; the synchronous
+ *         wrapper of this file passes NULL.
+ *
+ * Returns the status reported by the copy helper, or -ENOMEM if the staging
+ * buffer cannot be allocated. */
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *_event)
+{
+	int err, ret;
+	size_t size;
+	starpu_multiformat_interface_t *src_multiformat;
+	starpu_multiformat_interface_t *dst_multiformat;
+
+	src_multiformat = (starpu_multiformat_interface_t *) src_interface;
+	dst_multiformat = (starpu_multiformat_interface_t *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+
+	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
+
+	if (src_multiformat->opencl_ptr == NULL) {
+		src_multiformat->opencl_ptr = malloc(size);
+		if (src_multiformat->opencl_ptr == NULL) {
+			return -ENOMEM;
+		}
+	}
+
+	/* Convert on every transfer, like the CUDA path does: the CPU-format
+	 * data may have changed since the staging buffer was last filled */
+	double tmp = starpu_timing_now();
+	void *buffers[1];
+	starpu_codelet *cl = src_multiformat->ops->cpu_to_opencl_cl;
+	buffers[0] = src_interface;
+	cl->cpu_func(buffers, NULL);
+	dst_multiformat->conversion_time = starpu_timing_now() - tmp;
+
+	err = _starpu_opencl_copy_ram_to_opencl_async_sync(src_multiformat->opencl_ptr,
+							   src_node,
+							   (cl_mem) dst_multiformat->dev_handle,
+							   dst_node,
+							   size,
+							   dst_multiformat->offset,
+							   (cl_event *) _event,
+							   &ret);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+	return ret;
+}
+
+/* Transfer from an OpenCL replica to a host replica, with format conversion
+ * on the host.
+ *
+ * The device buffer designated by the source's dev_handle and offset is read
+ * into the host-side opencl_ptr staging buffer of the destination (allocated
+ * on first use), then the opencl_to_cpu_cl codelet converts it. The
+ * conversion needs the data, so the function waits for the transfer before
+ * returning.
+ *
+ * _event: cl_event pointer handed to the OpenCL copy helper; the synchronous
+ *         wrapper of this file passes NULL.
+ *
+ * Returns the status reported by the copy helper, or -ENOMEM if the staging
+ * buffer cannot be allocated. */
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *_event)
+{
+	int err, ret;
+	size_t size;
+	starpu_multiformat_interface_t *src_multiformat;
+	starpu_multiformat_interface_t *dst_multiformat;
+
+	src_multiformat = (starpu_multiformat_interface_t *) src_interface;
+	dst_multiformat = (starpu_multiformat_interface_t *) dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
+
+	/* Within this file a host replica only gets an opencl_ptr staging
+	 * buffer from a previous host-to-device copy: allocate it here when
+	 * the data was never sent */
+	if (dst_multiformat->opencl_ptr == NULL) {
+		dst_multiformat->opencl_ptr = malloc(size);
+		if (dst_multiformat->opencl_ptr == NULL)
+			return -ENOMEM;
+	}
+
+	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_multiformat->dev_handle,
+							   src_node,
+							   dst_multiformat->opencl_ptr,
+							   dst_node,
+							   size,
+							   src_multiformat->offset,
+							   (cl_event *)_event,
+							   &ret);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	/* XXX So much for asynchronicity.
+	 * There is no event to wait for when the synchronous wrapper passes
+	 * NULL, and clWaitForEvents() rejects a NULL event list.
+	 * NOTE(review): presumably the copy helper blocks in that case --
+	 * confirm against its implementation. */
+	if (_event != NULL) {
+		err = clWaitForEvents(1, (cl_event *)_event);
+		if (STARPU_UNLIKELY(err))
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	void *buffers[1];
+	starpu_codelet *cl = src_multiformat->ops->opencl_to_cpu_cl;
+	buffers[0] = dst_interface;
+	cl->cpu_func(buffers, NULL);
+
+	return ret;
+}
+
+/* Host to OpenCL device: forwards to copy_ram_to_opencl_async() with a NULL
+ * event. Both node parameters are tagged STARPU_ATTRIBUTE_UNUSED even though
+ * they are passed on. */
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+/* OpenCL device to host: forwards to copy_opencl_to_ram_async() with a NULL
+ * event. Both node parameters are tagged STARPU_ATTRIBUTE_UNUSED even though
+ * they are passed on. */
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+/* Device to device copy between two OpenCL nodes. Not implemented yet: like
+ * the CUDA device-to-device stubs of this file, the call trips an assertion;
+ * -ENOSYS is returned if assertions are compiled out. */
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+                                 void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	STARPU_ASSERT(0);
+
+	/* Never fall off the end of a non-void function */
+	return -ENOSYS;
+}
+#endif

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -77,7 +77,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	if (rank == 0)
 	{
 		_starpu_driver_update_job_feedback(j, cpu_args,
-				perf_arch, &codelet_start, &codelet_end);
+				perf_arch, &codelet_start, &codelet_end, 0);
 		_starpu_push_task_output(task, 0);
 	}
 

+ 5 - 1
src/drivers/cuda/driver_cuda.c

@@ -26,6 +26,9 @@
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
 
+
+double _starpu_task_get_conversion_time(struct starpu_task *task);
+
 /* the number of CUDA devices */
 static int ncudagpus;
 
@@ -198,6 +201,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 		return -EAGAIN;
 	}
 
+	double conversion_time = _starpu_task_get_conversion_time(task);
 	if (calibrate_model)
 	{
 		cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
@@ -226,7 +230,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	_starpu_driver_end_job(args, j, &codelet_end, 0);
 
-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
+	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, conversion_time);
 
 	_starpu_push_task_output(task, mask);
 

+ 7 - 1
src/drivers/driver_common/driver_common.c

@@ -87,7 +87,7 @@ void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j, struct
 }
 void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
 					enum starpu_perf_archtype perf_arch,
-					struct timespec *codelet_start, struct timespec *codelet_end)
+					struct timespec *codelet_start, struct timespec *codelet_end, double conversion_time)
 {
 	struct starpu_task_profiling_info *profiling_info = j->task->profiling_info;
 	struct timespec measured_ts;
@@ -133,6 +133,12 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
 		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
 		}
+
+	if (j->task->cl->conversion_model) {
+		_starpu_update_perfmodel_history(j, j->task->cl->conversion_model, perf_arch,
+						 worker_args->devid, conversion_time, j->nimpl);
+	}
+
 }
 
 /* Workers may block when there is no work to do at all. We assume that the

+ 1 - 1
src/drivers/driver_common/driver_common.h

@@ -29,7 +29,7 @@ void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j,
 		struct timespec *codelet_end, int rank);
 void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
 		enum starpu_perf_archtype perf_arch,
-		struct timespec *codelet_start, struct timespec *codelet_end);
+		struct timespec *codelet_start, struct timespec *codelet_end, double);
 
 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
 

+ 4 - 1
src/drivers/opencl/driver_opencl.c

@@ -28,6 +28,8 @@
 #include "driver_opencl_utils.h"
 #include <common/utils.h>
 
+double _starpu_task_get_conversion_time(struct starpu_task *task);
+
 static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
 
 static cl_context contexts[STARPU_MAXOPENCLDEVS];
@@ -564,6 +566,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 		 * codelet back, and try it later */
 		return -EAGAIN;
 	}
+	double conversion_time = _starpu_task_get_conversion_time(task);
 
 	_starpu_driver_start_job(args, j, &codelet_start, 0);
 
@@ -582,7 +585,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	_starpu_driver_end_job(args, j, &codelet_end, 0);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
-							&codelet_start, &codelet_end);
+							&codelet_start, &codelet_end, conversion_time);
 
 	_starpu_push_task_output(task, mask);
 

+ 5 - 0
src/sched_policies/heft.c

@@ -250,6 +250,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
 			if (bundle)
 			{
+				/* TODO : conversion time */
 				local_task_length[worker][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
 				local_data_penalty[worker][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
 				local_power[worker][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
@@ -260,6 +261,10 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				local_task_length[worker][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
 				local_data_penalty[worker][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 				local_power[worker][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
+
+				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
+				if (conversion_time > 0.0)
+					local_data_penalty[worker][nimpl] += conversion_time;
 				//_STARPU_DEBUG("Scheduler heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
 
 			}