15 years ago · 59c29c44ec
--- a/Makefile.am
+++ b/Makefile.am
@@ -34,4 +34,5 @@ include_HEADERS = 				\
 
				 	include/starpu_data.h			\
			
 
				 	include/starpu_perfmodel.h		\
			
 
				 	include/starpu_util.h			\
			
 
				+	include/starpu_opencl.h			\
			
 
				 	include/starpu_expert.h
			
--- a/configure.ac
+++ b/configure.ac
@@ -52,8 +52,10 @@ esac
 
				 
			
 
				 # This will be useful for program which use CUDA (and .cubin files) which need
			
 
				 # some path to the CUDA code at runtime.
			
 
				-AC_DEFINE_UNQUOTED(STARPU_DIR, "$PWD", [location of StarPU sources])
			
 
				-AC_SUBST(STARPU_DIR, $PWD)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_BUILD_DIR, "$PWD", [location of StarPU build directory])
			
 
				+AC_SUBST(STARPU_BUILD_DIR, $PWD)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})", [location of StarPU sources])
			
 
				+AC_SUBST(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})")
			
 
				 
			
 
				 AC_CHECK_LIB([pthread], [pthread_create])
			
 
				 case "$target" in
			
@@ -300,6 +302,71 @@ fi
 
				 
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				+#                                 OpenCL settings                             #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				+
			
 
				+AC_MSG_CHECKING(maximum number of OpenCL devices)
			
 
				+AC_ARG_ENABLE(maxopencldev, [AS_HELP_STRING([--enable-maxopencldev=<number>],
			
 
				+			[maximum number of OPENCL devices])],
			
 
				+			nmaxopencldev=$enableval, nmaxopencldev=8)
			
 
				+AC_MSG_RESULT($nmaxopencldev)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_MAXOPENCLDEVS, [$nmaxopencldev], 
			
 
				+		[maximum number of OPENCL devices])
			
 
				+AC_ARG_ENABLE(opencl, [AS_HELP_STRING([--disable-opencl],
			
 
				+		[do not use OpenCL device(s)])],, [enable_opencl=maybe])
			
 
				+
			
 
				+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
			
 
				+	#AC_MSG_CHECKING(whether OpenCL is available)
			
 
				+	AC_ARG_WITH(opencl-dir, 
			
 
				+		[AS_HELP_STRING([--with-opencl-dir=<path>],
			
 
				+		[specify OpenCL installation directory (default is /usr/)])],
			
 
				+		[
			
 
				+			opencl_dir=$withval
			
 
				+			# in case this was not explicit yet
			
 
				+			enable_opencl=yes
			
 
				+		], opencl_dir=/usr/)
			
 
				+	
			
 
				+	if test -d "$opencl_dir/include/"; then
			
 
				+		CPPFLAGS="${CPPFLAGS} -I$opencl_dir/include/ "
			
 
				+	fi
			
 
				+
			
 
				+	# do we have a valid OpenCL setup ?
			
 
				+	have_valid_opencl=yes
			
 
				+	AC_CHECK_HEADER([CL/cl.h],,[have_valid_opencl=no])
			
 
				+
			
 
				+	# we are looking for the proper option in LDFLAGS, so we save the
			
 
				+	# current value of LDFLAGS so that we can add new things in it and
			
 
				+	# restore it in case it's not working.
			
 
				+	SAVED_LDFLAGS="${LDFLAGS}"
			
 
				+
			
 
				+	found_opencllib=no
			
 
				+	if test -d "$opencl_dir/lib/"; then
			
 
				+		LDFLAGS="${SAVED_LDFLAGS} -L$opencl_dir/lib/ "
			
 
				+		AC_SEARCH_LIBS([clCreateKernel],[OpenCL],[found_opencllib=yes],[found_opencllib=no])
			
 
				+	fi
			
 
				+
			
 
				+	# In case OpenCL was explicitly required but no valid setup was found,
				+	# this is an error. Both operands of each comparison carry the "x"
				+	# prefix, and the two tests are chained with && rather than the
				+	# non-portable "test -a".
				+	if test "x$enable_opencl" = xyes && test "x$have_valid_opencl" = xno; then
				+		AC_MSG_ERROR([cannot find OpenCL])
				+	fi
			
 
				+
			
 
				+	# now we enable OpenCL if and only if a proper setup is available
			
 
				+	enable_opencl=$have_valid_opencl
			
 
				+fi
			
 
				+
			
 
				+AC_MSG_CHECKING(whether OpenCL should be used)
			
 
				+AC_MSG_RESULT($enable_opencl)
			
 
				+AC_SUBST(STARPU_USE_OPENCL, $enable_opencl)
			
 
				+AM_CONDITIONAL(STARPU_USE_OPENCL, test x$enable_opencl = xyes)
			
 
				+if test x$enable_opencl = xyes; then
			
 
				+	AC_DEFINE(STARPU_USE_OPENCL, [1], [OpenCL support is activated])
			
 
				+        AC_DEFINE_UNQUOTED(STARPU_OPENCL_DATADIR, "$(eval echo ${datarootdir}/starpu/opencl)", [Path to OpenCL codelets])
			
 
				+        AC_SUBST(STARPU_OPENCL_DATAdir, "$(eval echo ${datarootdir}/starpu/opencl/examples)")
			
 
				+fi
			
 
				+
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				 #                                 Cell settings                               #
			
 
				 #                                                                             #
			
 
				 ###############################################################################
			
@@ -506,13 +573,19 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 
				 
			
 
				 # We have one memory node shared by all CPU workers, one node per GPU, and
			
 
				 # currently the Cell driver is using the same memory node as the CPU.
			
 
				-if test x$enable_cuda = xyes; then
			
 
				+maxnodes=1
			
 
				+if test x$enable_cuda = xyes ; then
			
 
				 	# we could have used nmaxcudadev + 1, but this would certainly give an
			
 
				 	# odd number.
			
 
				-	maxnodes=`expr 2 \* $nmaxcudadev`
			
 
				-else
			
 
				-	maxnodes=1	
			
 
				+	maxnodes=`expr $maxnodes + $nmaxcudadev`
			
 
				 fi
			
 
				+if test x$enable_opencl = xyes ; then
			
 
				+	# each OpenCL device gets its own memory node, on top of the nodes
			
 
				+	# already counted above.
			
 
				+	maxnodes=`expr $maxnodes + $nmaxopencldev`
			
 
				+fi
			
 
				+# todo: set maxnodes to the next power of 2 greater than maxnodes
			
 
				+
			
 
				 AC_MSG_CHECKING(maximum number of memory nodes)
			
 
				 AC_MSG_RESULT($maxnodes)
			
 
				 AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],
			
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -476,6 +476,10 @@ specified with the @code{STARPU_NCPUS} environment variable.
 
				 This is the maximum number of CUDA devices that StarPU can use. This can also be
			
 
				 specified with the @code{STARPU_NCUDA} environment variable.
			
 
				 
			
 
				+@item @code{nopencl} (default = -1):
			
 
				+This is the maximum number of OpenCL devices that StarPU can use. This can also be
			
 
				+specified with the @code{STARPU_NOPENCL} environment variable.
			
 
				+
			
 
				 @item @code{nspus} (default = -1):
			
 
				 This is the maximum number of Cell SPUs that StarPU can use. This can also be
			
 
				 specified with the @code{STARPU_NGORDON} environment variable.
			
@@ -600,7 +604,8 @@ an integer between 0 and @code{starpu_get_worker_count() - 1}.
 
				 This function returns the type of worker associated to an identifier (as
			
 
				 returned by the @code{starpu_get_worker_id} function). The returned value
			
 
				 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
			
 
				-core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
			
 
				+core, @code{STARPU_CUDA_WORKER} for a CUDA device,
			
 
				+@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
			
 
				 @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
			
 
				 identifier is unspecified.
			
 
				 
			
@@ -716,6 +721,13 @@ be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
 
				 field is ignored if @code{STARPU_CUDA} does not appear in the @code{.where}
			
 
				 field, it must be non-null otherwise.
			
 
				 
			
 
				+@item @code{opencl_func} (optional):
			
 
				+Is a function pointer to the OpenCL implementation of the codelet. Its
			
 
				+prototype must be:
			
 
				+@code{void opencl_func(void *buffers[], void *cl_arg);}.
			
 
				+This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
			
 
				+@code{.where} field, it must be non-null otherwise.
			
 
				+
			
 
				 @item @code{gordon_func} (optionnal):
			
 
				 This is the index of the Cell SPU implementation within the Gordon library.
			
 
				 TODO
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -431,11 +431,97 @@ check_PROGRAMS +=				\
 
				 examplebin_PROGRAMS +=				\
			
 
				 	incrementer/incrementer
			
 
				 
			
 
				-if STARPU_USE_CUDA
			
 
				-incrementer_incrementer_SOURCES =	\
			
 
				-	incrementer/incrementer.c	\
			
 
				-	incrementer/incrementer_kernels.cu
			
 
				-else
			
 
				 incrementer_incrementer_SOURCES =	\
			
 
				 	incrementer/incrementer.c
			
 
				+if STARPU_USE_CUDA
			
 
				+incrementer_incrementer_SOURCES +=	\
			
 
				+	incrementer/incrementer_kernels.cu
			
 
				+endif
			
 
				+if STARPU_USE_OPENCL
			
 
				+incrementer_incrementer_SOURCES +=	\
			
 
				+	incrementer/incrementer_kernels_opencl.c
			
 
				+endif
			
 
				+
			
 
				+nobase_STARPU_OPENCL_DATA_DATA = \
			
 
				+	incrementer/incrementer_kernels_opencl_codelet.cl
			
 
				+
			
 
				+###################
			
 
				+# Stencil example #
			
 
				+###################
			
 
				+
			
 
				+if STARPU_USE_OPENCL
			
 
				+check_PROGRAMS +=		\
			
 
				+	stencil/stencil
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	stencil/stencil
			
 
				+stencil_stencil_SOURCES =	\
			
 
				+	stencil/stencil.c       \
			
 
				+	stencil/stencil_opencl.c
			
 
				+nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				+	stencil/stencil_opencl_codelet.cl \
			
 
				+	stencil/stencil.h
			
 
				+endif
			
 
				+
			
 
				+####################
			
 
				+# Variable example #
			
 
				+####################
			
 
				+
			
 
				+check_PROGRAMS +=				\
			
 
				+	variable/variable
			
 
				+
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	variable/variable
			
 
				+
			
 
				+variable_variable_SOURCES =	\
			
 
				+	variable/variable.c
			
 
				+if STARPU_USE_CUDA
			
 
				+variable_variable_SOURCES +=	\
			
 
				+	variable/variable_kernels.cu
			
 
				+endif
			
 
				+if STARPU_USE_OPENCL
			
 
				+variable_variable_SOURCES +=	\
			
 
				+	variable/variable_kernels_opencl.c
			
 
				+nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				+	variable/variable_kernels_opencl_codelet.cl
			
 
				+endif
			
 
				+
			
 
				+######################
			
 
				+# matVecMult example #
			
 
				+######################
			
 
				+
			
 
				+check_PROGRAMS +=				\
			
 
				+	matvecmult/matvecmult
			
 
				+
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	matvecmult/matvecmult
			
 
				+
			
 
				+matvecmult_matvecmult_SOURCES =	\
			
 
				+	matvecmult/matvecmult.c
			
 
				+
			
 
				+if STARPU_USE_OPENCL
			
 
				+nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				+	matvecmult/matvecmult_kernel.cl
			
 
				+endif
			
 
				+
			
 
				+#################
			
 
				+# block example #
			
 
				+#################
			
 
				+
			
 
				+check_PROGRAMS +=				\
			
 
				+	block/block
			
 
				+
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	block/block
			
 
				+
			
 
				+block_block_SOURCES =	\
			
 
				+	block/block.c
			
 
				+
			
 
				+if STARPU_USE_CUDA
			
 
				+block_block_SOURCES +=				\
			
 
				+	block/block_cuda.cu
			
 
				+endif
			
 
				+
			
 
				+if STARPU_USE_OPENCL
			
 
				+nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				+	block/block_kernel.cl
			
 
				 endif
			
--- a/examples/block/block.c
+++ b/examples/block/block.c
@@ -0,0 +1,167 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <pthread.h>
			
 
				+#include <math.h>
			
 
				+
			
 
				+/*
				+ * CPU implementation of the codelet: scales, in place, every element of the
				+ * block described by descr[0] by the scalar described by descr[1].
				+ * The block is walked as nx*ny*nz consecutive floats.
				+ */
				+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
				+{
				+	float *data = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
				+	float *factor = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
				+	int dim_x = (int)STARPU_GET_BLOCK_NX(descr[0]);
				+	int dim_y = (int)STARPU_GET_BLOCK_NY(descr[0]);
				+	int dim_z = (int)STARPU_GET_BLOCK_NZ(descr[0]);
				+	int total = dim_x*dim_y*dim_z;
				+	int idx = 0;
				+
				+	while (idx < total) {
				+		data[idx] *= *factor;
				+		idx++;
				+	}
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+/*
				+ * OpenCL implementation of the codelet.
				+ *
				+ * Loads the "block" kernel for the device attached to the current worker,
				+ * binds the block buffer, its three dimensions and the multiplier buffer as
				+ * kernel arguments, enqueues the kernel, waits for the queue to drain and
				+ * releases the kernel.
				+ *
				+ * descr[0] is the block interface, descr[1] the variable (multiplier)
				+ * interface; _args is unused.
				+ */
				+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
				+{
				+	cl_kernel kernel;
				+	cl_command_queue queue;
				+	int id, devid, err;
				+	/* NOTE(review): these pointers are handed to clSetKernelArg as cl_mem
				+	 * objects; presumably the interface stores a cl_mem on OpenCL workers
				+	 * -- confirm. */
				+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
				+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
				+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
				+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
				+	float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
				+
				+	id = starpu_get_worker_id();
				+	devid = starpu_get_worker_devid(id);
				+
				+	err = starpu_opencl_load_kernel(&kernel, &queue,
				+					"examples/block/block_kernel.cl", "block", devid);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+
				+	/* Check each binding on its own: a failure on an early argument must
				+	 * not be hidden by a later call that succeeds. */
				+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+	err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &multiplier);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+
				+	{
				+		/* One work-item per element, so blocks of any size are fully
				+		 * processed. An empty range is not enqueued at all. */
				+		size_t global = (size_t)nx * (size_t)ny * (size_t)nz;
				+		if (global > 0) {
				+			err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
				+			if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+		}
				+	}
				+
				+	clFinish(queue);
				+
				+	starpu_opencl_release(kernel);
				+}
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+#endif
			
 
				+
			
 
				+typedef void (*device_func)(void **, void *);
			
 
				+
			
 
				+/*
				+ * Runs `func` once as a StarPU task restricted to the workers selected by
				+ * `where`, on a pnx x pny x pnz block of floats and a scalar multiplier.
				+ *
				+ * The block and the multiplier are registered as StarPU data, a task using
				+ * both buffers is submitted, and once all tasks are done the block is
				+ * synchronised back to main memory and printed on stderr.
				+ *
				+ * Returns 0 on success, 1 when submission reports -ENODEV.
				+ */
				+int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny, int pnz, float multiplier)
				+{
				+	starpu_data_handle block_handle;
				+	starpu_data_handle multiplier_handle;
				+	int i;
				+
				+	/* Designated initializer: every field that is not listed is zeroed,
				+	 * so the codelet never carries indeterminate members. */
				+	starpu_codelet cl = {
				+		.where = where,
				+		.cuda_func = func,
				+		.cpu_func = func,
				+		.opencl_func = func,
				+		.nbuffers = 2,
				+		.model = NULL
				+	};
				+
				+	starpu_register_block_data(&block_handle, 0, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
				+	/* NOTE(review): this registers the address of the by-value parameter
				+	 * `multiplier`, which goes out of scope on return; no unregistration
				+	 * is visible here -- confirm whether the handles must be released. */
				+	starpu_register_variable_data(&multiplier_handle, 0, (uintptr_t)&multiplier, sizeof(float));
				+
				+	struct starpu_task *task = starpu_task_create();
				+	task->cl = &cl;
				+	task->callback_func = NULL;
				+	task->buffers[0].handle = block_handle;
				+	task->buffers[0].mode = STARPU_RW;
				+	task->buffers[1].handle = multiplier_handle;
				+	task->buffers[1].mode = STARPU_RW;
				+
				+	int ret = starpu_submit_task(task);
				+	if (STARPU_UNLIKELY(ret == -ENODEV)) {
				+		fprintf(stderr, "No worker may execute this task\n");
				+		return 1;
				+	}
				+
				+	starpu_wait_all_tasks();
				+
				+	/* update the array in RAM */
				+	starpu_sync_data_with_mem(block_handle, STARPU_R);
				+
				+	for(i=0 ; i<pnx*pny*pnz; i++) {
				+		fprintf(stderr, "%f ", block[i]);
				+	}
				+	fprintf(stderr, "\n");
				+
				+	starpu_release_data_from_mem(block_handle);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Fills a 3x2x4 block with 1..24, runs the scaling codelet on the CPU and,
				+ * when compiled in, on OpenCL and CUDA, then checks that every element
				+ * equals its initial value times the product of the multipliers of the
				+ * runs that succeeded.
				+ *
				+ * Prints "TEST PASSED" or "TEST FAILED" on stderr and returns 0 on success,
				+ * 1 on failure, so that "make check" notices a wrong result.
				+ */
				+int main(int argc, char **argv)
				+{
				+	float *block;
				+	int i, ret;
				+	int nx=3;
				+	int ny=2;
				+	int nz=4;
				+	float multiplier=1.0;
				+
				+	starpu_init(NULL);
				+
				+	block = malloc((size_t)nx*ny*nz * sizeof *block);
				+	assert(block);
				+	for(i=0 ; i<nx*ny*nz ; i++) block[i] = i+1;
				+
				+	/* `multiplier` accumulates the factor of every run that was actually
				+	 * executed, so a missing device does not make the check fail. */
				+	ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
				+	if (!ret) multiplier *= 1.0;
				+#ifdef STARPU_USE_OPENCL
				+	_starpu_opencl_compile_source_to_opencl("examples/block/block_kernel.cl");
				+	ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
				+	if (!ret) multiplier *= 2.0;
				+#endif
				+#ifdef STARPU_USE_CUDA
				+	ret = execute_on(STARPU_CUDA, cuda_codelet, block, nx, ny, nz, 3.0);
				+	if (!ret) multiplier *= 3.0;
				+#endif
				+
				+	// Check result is correct
				+	ret=1;
				+	for(i=0 ; i<nx*ny*nz ; i++) {
				+		if (block[i] != (i+1) * multiplier) {
				+			ret=0;
				+			break;
				+		}
				+	}
				+
				+	fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
				+	starpu_shutdown();
				+
				+	/* Freed only after shutdown: the buffer was registered with StarPU
				+	 * and is not unregistered anywhere in this file. */
				+	free(block);
				+
				+	return (ret == 1) ? 0 : 1;
				+}
			
--- a/examples/block/block_cuda.cu
+++ b/examples/block/block_cuda.cu
@@ -0,0 +1,34 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+/*
				+ * CUDA kernel: multiplies each of the nx*ny*nz floats of `block` by the
				+ * value pointed to by `multiplier`. No thread or block index is used, so
				+ * every thread that runs this kernel walks the whole array.
				+ */
				+static __global__ void cuda_block(float *block, int nx, int ny, int nz, float *multiplier)
				+{
				+	const int count = nx*ny*nz;
				+	int pos = 0;
				+
				+	while (pos < count) {
				+		block[pos] *= *multiplier;
				+		++pos;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * CUDA entry point of the codelet: extracts the block pointer, its three
				+ * dimensions and the multiplier pointer from the data interfaces, then
				+ * launches cuda_block on a single thread of a single block.
				+ * NOTE(review): nothing here waits for the kernel to complete -- confirm
				+ * that the caller synchronises.
				+ */
				+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
				+{
				+	void *block_iface = descr[0];
				+	void *scalar_iface = descr[1];
				+
				+	float *data = (float *)STARPU_GET_BLOCK_PTR(block_iface);
				+	float *factor = (float *)STARPU_GET_VARIABLE_PTR(scalar_iface);
				+	int dim_x = STARPU_GET_BLOCK_NX(block_iface);
				+	int dim_y = STARPU_GET_BLOCK_NY(block_iface);
				+	int dim_z = STARPU_GET_BLOCK_NZ(block_iface);
				+
				+	cuda_block<<<1,1>>>(data, dim_x, dim_y, dim_z, factor);
				+}
			
--- a/examples/block/block_kernel.cl
+++ b/examples/block/block_kernel.cl
@@ -0,0 +1,23 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
				+ * OpenCL kernel: the work-item with global id i multiplies b[i] by
				+ * *multiplier. Work-items whose id falls outside the nx*ny*nz elements
				+ * do nothing.
				+ */
				+__kernel void block(__global float *b, int nx, int ny, int nz, __global float *multiplier)
				+{
				+	const int gid = get_global_id(0);
				+	const int count = nx*ny*nz;
				+
				+	if (gid >= count)
				+		return;
				+
				+	b[gid] *= *multiplier;
				+}
			
--- a/examples/heat/Makefile.in
+++ b/examples/heat/Makefile.in
@@ -14,7 +14,7 @@
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 #
			
 
				 
			
 
				-export PKG_CONFIG_PATH=@STARPU_DIR@
			
 
				+export PKG_CONFIG_PATH=@STARPU_BUILD_DIR@
			
 
				 
			
 
				 LIBS+=$$(pkg-config --libs libstarpu)
			
 
				 CFLAGS+=$$(pkg-config --cflags libstarpu)
			
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -23,6 +23,10 @@ static unsigned niter = 50000;
 
				 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+#endif
			
 
				+
			
 
				 extern void cuda_codelet_host(float *tab);
			
 
				 
			
 
				 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
@@ -39,20 +43,27 @@ int main(int argc, char **argv)
 
				 	if (argc == 2)
			
 
				 		niter = atoi(argv[1]);
			
 
				 
			
 
				-	float float_array[3] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
			
 
				+	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
			
 
				 
			
 
				 	starpu_data_handle float_array_handle;
			
 
				 	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
			
 
				-			(uintptr_t)&float_array, 3, sizeof(float));
			
 
				+			(uintptr_t)&float_array, 4, sizeof(float));
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        _starpu_opencl_compile_source_to_opencl("examples/incrementer/incrementer_kernels_opencl_codelet.cl");
			
 
				+#endif
			
 
				 
			
 
				 	starpu_codelet cl =
			
 
				 	{
			
 
				 		/* CUBLAS stands for CUDA kernels controlled from the host */
			
 
				-		.where = STARPU_CPU|STARPU_CUDA,
			
 
				+		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 		.cpu_func = cpu_codelet,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		.cuda_func = cuda_codelet,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		.opencl_func = opencl_codelet,
			
 
				+#endif
			
 
				 		.nbuffers = 1
			
 
				 	};
			
 
				 
			
@@ -67,7 +78,7 @@ int main(int argc, char **argv)
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				 		task->cl = &cl;
			
 
				-		
			
 
				+
			
 
				 		task->callback_func = NULL;
			
 
				 
			
 
				 		task->buffers[0].handle = float_array_handle;
			
@@ -85,15 +96,17 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* update the array in RAM */
			
 
				 	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
			
 
				-	
			
 
				+
			
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				-	fprintf(stderr, "array -> %f, %f, %f\n", float_array[0], 
			
 
				-			float_array[1], float_array[2]);
			
 
				-	
			
 
				-	if (float_array[0] != float_array[1] + float_array[2])
			
 
				+	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
			
 
				+                float_array[1], float_array[2], float_array[3]);
			
 
				+
			
 
				+	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
			
 
				+		fprintf(stderr, "Incorrect result\n");
			
 
				 		return 1;
			
 
				-	
			
 
				+	}
			
 
				+
			
 
				 	starpu_release_data_from_mem(float_array_handle);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
			
--- a/examples/incrementer/incrementer_kernels_opencl.c
+++ b/examples/incrementer/incrementer_kernels_opencl.c
@@ -0,0 +1,49 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <CL/cl.h>
			
 
				+
			
 
				+/*
				+ * OpenCL implementation of the incrementer codelet: loads the
				+ * "incrementer" kernel for the device of the current worker, binds the
				+ * vector from descr[0] as its only argument, runs it over 4 work-items in
				+ * one work-group of 4, waits for completion and releases the kernel.
				+ */
				+void opencl_codelet(void *descr[], void *_args)
				+{
				+	cl_kernel kernel;
				+	cl_command_queue queue;
				+	size_t global_size = 4;
				+	size_t local_size = 4;
				+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
				+	int worker = starpu_get_worker_id();
				+	int devid = starpu_get_worker_devid(worker);
				+	int err;
				+
				+	err = starpu_opencl_load_kernel(&kernel, &queue,
				+					"examples/incrementer/incrementer_kernels_opencl_codelet.cl", "incrementer", devid);
				+	if (err != CL_SUCCESS) {
				+		STARPU_OPENCL_REPORT_ERROR(err);
				+	}
				+
				+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
				+	if (err) {
				+		STARPU_OPENCL_REPORT_ERROR(err);
				+	}
				+
				+	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
				+	if (err != CL_SUCCESS) {
				+		STARPU_OPENCL_REPORT_ERROR(err);
				+	}
				+
				+	clFinish(queue);
				+
				+	starpu_opencl_release(kernel);
				+}
			
--- a/examples/incrementer/incrementer_kernels_opencl_codelet.cl
+++ b/examples/incrementer/incrementer_kernels_opencl_codelet.cl
@@ -0,0 +1,23 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
				+ * OpenCL kernel: adds 1.0 to input[0] and input[3]; every other work-item
				+ * leaves the buffer untouched.
				+ */
				+__kernel void incrementer(__global float* input)
				+{
				+	const int gid = get_global_id(0);
				+
				+	if (gid != 0 && gid != 3)
				+		return;
				+
				+	input[gid] += 1.0;
				+}
			
 
				+
			
--- a/examples/matvecmult/matvecmult.c
+++ b/examples/matvecmult/matvecmult.c
@@ -0,0 +1,193 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <pthread.h>
			
 
				+#include <math.h>
			
 
				+
			
 
				+//static int width=1100;
			
 
				+//static int height=244021;
			
 
				+static int width=20;
			
 
				+static int height=4;
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	int id, devid, err, n;
			
 
				+	float *matrix = (float *)STARPU_GET_MATRIX_PTR(descr[0]);
			
 
				+	float *vector = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				+	float *mult = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+
			
 
				+        id = starpu_get_worker_id();
			
 
				+        devid = starpu_get_worker_devid(id);
			
 
				+
			
 
				+        err = starpu_opencl_load_kernel(&kernel, &queue,
			
 
				+                                        "examples/matvecmult/matvecmult_kernel.cl", "matVecMult", devid);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = 0;
			
 
				+        n=0;
			
 
				+        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&width);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&height);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
			
 
				+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	{
			
 
				+                size_t global=512;
			
 
				+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
			
 
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+
			
 
				+	clFinish(queue);
			
 
				+
			
 
				+	starpu_opencl_release(kernel);
			
 
				+}
			
 
				+
			
 
				+void fillArray(float* pfData, int iSize) {
			
 
				+    int i;
			
 
				+    const float fScale = 1.0f / (float)RAND_MAX;
			
 
				+    for (i = 0; i < iSize; ++i) {
			
 
				+            pfData[i] = fScale * rand();
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+void printArray(float* pfData, int iSize) {
			
 
				+    int i;
			
 
				+    for (i = 0; i < iSize; ++i) {
			
 
				+            fprintf(stderr, "%f ", pfData[i]);
			
 
				+    }
			
 
				+    fprintf(stderr, "\n");
			
 
				+}
			
 
				+
			
 
				+void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
			
 
				+    int i, j;
			
 
				+    for (i = 0; i < height; ++i) {
			
 
				+        double sum = 0;
			
 
				+        for (j = 0; j < width; ++j) {
			
 
				+            double a = matrix[i * width + j];
			
 
				+            double b = vector[j];
			
 
				+            sum += a * b;
			
 
				+        }
			
 
				+        mult[i] = (float)sum;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+int compareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon) { /* relative L2 comparison: returns 0 on match, 1 otherwise */
			
 
				+    float error = 0; /* running sum of squared differences */
			
 
				+    float ref = 0; /* running sum of squared reference values */
			
 
				+    unsigned int i;
			
 
				+
			
 
				+    for(i = 0; i < len; ++i) {
			
 
				+        float diff = reference[i] - data[i];
			
 
				+        error += diff * diff;
			
 
				+        ref += reference[i] * reference[i];
			
 
				+    }
			
 
				+
			
 
				+    float normRef = sqrtf(ref);
			
 
				+    if (fabs(ref) < 1e-7) return 1; /* reference is (nearly) all zero: report failure rather than divide by ~0 */
			
 
				+
			
 
				+    float normError = sqrtf(error);
			
 
				+    error = normError / normRef; /* error now holds ||reference - data|| / ||reference|| */
			
 
				+
			
 
				+    return error < epsilon ? 0 : 1;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	starpu_codelet cl = {0}; /* zero every field; only a few are assigned explicitly below */
			
 
				+        starpu_init(NULL);
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        float *matrix, *vector, *mult;
			
 
				+        float *correctResult;
			
 
				+        unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
			
 
				+
			
 
				+	starpu_data_handle matrix_handle, vector_handle, mult_handle;
			
 
				+
			
 
				+        mem_size_matrix = width * height * sizeof(float);
			
 
				+        matrix = (float*)malloc(mem_size_matrix);
			
 
				+        mem_size_vector = width * sizeof(float);
			
 
				+        vector = (float*)malloc(mem_size_vector);
			
 
				+        mem_size_mult = height * sizeof(float);
			
 
				+        mult = (float*)malloc(mem_size_mult);
			
 
				+        correctResult = (float*)malloc(mem_size_mult);
			
 
				+
			
 
				+        assert(matrix);
			
 
				+        assert(vector);
			
 
				+        assert(mult);
			
 
				+        assert(correctResult);
			
 
				+
			
 
				+        fillArray(matrix, width*height);
			
 
				+        fillArray(vector, width);
			
 
				+        fillArray(mult, height);
			
 
				+        matVecMult(matrix, vector, width, height, correctResult); /* host-side reference result */
			
 
				+
			
 
				+	starpu_register_matrix_data(&matrix_handle, 0, (uintptr_t)matrix, width, width, height, sizeof(float));
			
 
				+	starpu_register_vector_data(&vector_handle, 0, (uintptr_t)vector, width, sizeof(float));
			
 
				+	starpu_register_vector_data(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
			
 
				+
			
 
				+        _starpu_opencl_compile_source_to_opencl("examples/matvecmult/matvecmult_kernel.cl");
			
 
				+
			
 
				+	cl.where = STARPU_OPENCL;
			
 
				+        cl.opencl_func = opencl_codelet;
			
 
				+        cl.nbuffers = 3;
			
 
				+        cl.model = NULL;
			
 
				+
			
 
				+        struct starpu_task *task = starpu_task_create();
			
 
				+        task->cl = &cl;
			
 
				+        task->callback_func = NULL;
			
 
				+        task->buffers[0].handle = matrix_handle;
			
 
				+        task->buffers[0].mode = STARPU_R;
			
 
				+        task->buffers[1].handle = vector_handle;
			
 
				+        task->buffers[1].mode = STARPU_R;
			
 
				+        task->buffers[2].handle = mult_handle;
			
 
				+        task->buffers[2].mode = STARPU_RW;
			
 
				+
			
 
				+        int ret = starpu_submit_task(task);
			
 
				+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				+                fprintf(stderr, "No worker may execute this task\n");
			
 
				+                exit(0);
			
 
				+	}
			
 
				+
			
 
				+	starpu_wait_all_tasks();
			
 
				+
			
 
				+	/* update the array in RAM */
			
 
				+        starpu_sync_data_with_mem(matrix_handle, STARPU_R);
			
 
				+        starpu_sync_data_with_mem(vector_handle, STARPU_R);
			
 
				+        starpu_sync_data_with_mem(mult_handle, STARPU_R);
			
 
				+
			
 
				+        int res = compareL2fe(correctResult, mult, height, 1e-6f); /* 0 means the device result matches the reference */
			
 
				+        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
			
 
				+#if 0
			
 
				+        printArray(matrix, width*height);
			
 
				+        printArray(vector, width);
			
 
				+        printArray(mult, height);
			
 
				+#endif
			
 
				+        starpu_release_data_from_mem(matrix_handle);
			
 
				+        starpu_release_data_from_mem(vector_handle);
			
 
				+        starpu_release_data_from_mem(mult_handle);
			
 
				+#endif
			
 
				+
			
 
				+        starpu_shutdown(); /* outside the #ifdef so it always pairs with starpu_init() above */
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/matvecmult/matvecmult_kernel.cl
+++ b/examples/matvecmult/matvecmult_kernel.cl
@@ -0,0 +1,48 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and
			
 
				+ * proprietary rights in and to this software and related documentation.
			
 
				+ * Any use, reproduction, disclosure, or distribution of this software
			
 
				+ * and related documentation without an express license agreement from
			
 
				+ * NVIDIA Corporation is strictly prohibited.
			
 
				+ *
			
 
				+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
			
 
				+ * associated with this source code for terms and conditions that govern
			
 
				+ * your use of this NVIDIA software.
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+/* Matrix-vector multiplication: W = M * V.
			
 
				+ * Device code.
			
 
				+ *
			
 
				+ * This sample implements matrix-vector multiplication.
			
 
				+ * It has been written for clarity of exposition to illustrate various OpenCL
			
 
				+ * programming principles and optimizations, not with the goal of providing
			
 
				+ * the most performant generic kernel for matrix-vector multiplication.
			
 
				+ *
			
 
				+ * CUBLAS provides high-performance matrix-vector multiplication on GPU.
			
 
				+ */
			
 
				+
			
 
				+__kernel void matVecMult(
			
 
				+                         __global float* M,
			
 
				+                         __global float* V,
			
 
				+                         int width, int height,
			
 
				+                         __global float* W
			
 
				+                         )
			
 
				+{
			
 
				+        // Row index
			
 
				+        uint y = get_global_id(0);
			
 
				+        if (y < height) {
			
 
				+                // Row pointer
			
 
				+                const __global float* row = M + y * width;
			
 
				+
			
 
				+                // Compute dot product
			
 
				+                float dotProduct = 0;
			
 
				+                for (int x = 0; x < width; ++x)
			
 
				+                        dotProduct += row[x] * V[x];
			
 
				+
			
 
				+                // Write result to global memory
			
 
				+                W[y] = dotProduct;
			
 
				+        }
			
 
				+}
			
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -40,8 +40,8 @@ void parse_args(int argc, char **argv)
 
				 		strcpy(filename_out, argv[2]);
			
 
				 	}
			
 
				 	else {
			
 
				-		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_in_default);
			
 
				-		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_out_default);
			
 
				+		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_in_default);
			
 
				+		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_out_default);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/examples/spmv/dw_spmv.c
+++ b/examples/spmv/dw_spmv.c
@@ -27,6 +27,60 @@ extern void spmv_kernel_cuda(void *descr[], void *args);
 
				 struct timeval start;
			
 
				 struct timeval end;
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include "starpu_opencl.h"
			
 
				+void spmv_kernel_opencl(void *descr[], void *args) /* OpenCL codelet: launches the "spvm" kernel on descr[0] (CSR), descr[1] (in), descr[2] (out) */
			
 
				+{
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	int id, devid, err, n;
			
 
				+
			
 
				+	uint32_t nnz = STARPU_GET_CSR_NNZ(descr[0]);
			
 
				+	uint32_t nrow = STARPU_GET_CSR_NROW(descr[0]);
			
 
				+	float *nzval = (float *)STARPU_GET_CSR_NZVAL(descr[0]);
			
 
				+	uint32_t *colind = STARPU_GET_CSR_COLIND(descr[0]);
			
 
				+	uint32_t *rowptr = STARPU_GET_CSR_ROWPTR(descr[0]);
			
 
				+	uint32_t firstentry = STARPU_GET_CSR_FIRSTENTRY(descr[0]);
			
 
				+
			
 
				+	float *vecin = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				+	uint32_t nx_in = STARPU_GET_VECTOR_NX(descr[1]);
			
 
				+
			
 
				+	float *vecout = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+	uint32_t nx_out = STARPU_GET_VECTOR_NX(descr[2]);
			
 
				+
			
 
				+        id = starpu_get_worker_id();
			
 
				+        devid = starpu_get_worker_devid(id);
			
 
				+
			
 
				+        err = starpu_opencl_load_kernel(&kernel, &queue,
			
 
				+                                        "examples/spmv/spmv_opencl.cl", "spvm", devid);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = 0; /* every clSetKernelArg status below is OR-ed in, so any failure is reported */
			
 
				+        n=0;
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(uint32_t), &nnz);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(uint32_t), &nrow);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &nzval);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &colind);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &rowptr);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(uint32_t), &firstentry);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecin);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_in);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecout);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_out);
			
 
				+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	{
			
 
				+                size_t global=1024;
			
 
				+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
			
 
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+
			
 
				+	clFinish(queue); /* wait for the kernel before releasing it */
			
 
				+
			
 
				+        starpu_opencl_release(kernel);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 unsigned nblocks = 2;
			
 
				 uint32_t size = 4194304;
			
 
				 
			
@@ -189,14 +243,28 @@ void call_spmv_codelet_filters(void)
 
				 	starpu_partition_data(sparse_matrix, &csr_f);
			
 
				 	starpu_partition_data(vector_out, &vector_f);
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        {
			
 
				+                int ret = _starpu_opencl_compile_source_to_opencl("examples/spmv/spmv_opencl.cl");
			
 
				+                if (ret)
			
 
				+		{
			
 
				+			fprintf(stderr, "Failed to compile OpenCL codelet\n");
			
 
				+			exit(ret);
			
 
				+		}
			
 
				+        }
			
 
				+#endif
			
 
				+
			
 
				 	starpu_codelet cl;
			
 
				 	memset(&cl, 0, sizeof(starpu_codelet));
			
 
				 
			
 
				-	cl.where = STARPU_CPU|STARPU_CUDA;
			
 
				+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
			
 
				 	cl.cpu_func =  cpu_spmv;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cl.cuda_func = spmv_kernel_cuda;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        cl.opencl_func = spmv_kernel_opencl;
			
 
				+#endif
			
 
				 	cl.nbuffers = 3;
			
 
				 	cl.model = NULL;
			
 
				 
			
@@ -206,6 +274,7 @@ void call_spmv_codelet_filters(void)
 
				 	for (part = 0; part < nblocks; part++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				+                int ret;
			
 
				 
			
 
				 		task->callback_func = NULL;
			
 
				 
			
@@ -219,7 +288,12 @@ void call_spmv_codelet_filters(void)
 
				 		task->buffers[2].handle = starpu_get_sub_data(vector_out, 1, part);
			
 
				 		task->buffers[2].mode = STARPU_W;
			
 
				 	
			
 
				-		starpu_submit_task(task);
			
 
				+		ret = starpu_submit_task(task);
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			fprintf(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	starpu_wait_all_tasks();
			
--- a/examples/spmv/dw_spmv.h
+++ b/examples/spmv/dw_spmv.h
@@ -24,7 +24,6 @@
 
				 #include <sys/types.h>
			
 
				 #include <pthread.h>
			
 
				 #include <signal.h>
			
 
				-#include <cblas.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
--- a/examples/spmv/spmv_opencl.cl
+++ b/examples/spmv/spmv_opencl.cl
@@ -0,0 +1,42 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+__kernel void spvm(unsigned nnz, unsigned nrow,
			
 
				+                   __global float* nzval, __global unsigned* colind,
			
 
				+                   __global unsigned* rowptr, unsigned firstentry,
			
 
				+                   __global float *vecin, unsigned nx_in,
			
 
				+                   __global float *vecout, unsigned nx_out)
			
 
				+{
			
 
				+	unsigned row; /* CSR sparse matrix-vector product: vecout[row] = sum of nzval[k] * vecin[colind[k]] over the row */
			
 
				+	for (row = get_global_id(0); row < nrow; row += get_global_size(0)) /* each row is handled by exactly one work-item */
			
 
				+	{
			
 
				+		float tmp = 0.0f;
			
 
				+		unsigned index;
			
 
				+
			
 
				+		unsigned firstindex = rowptr[row] - firstentry; /* rowptr entries are offset by firstentry */
			
 
				+		unsigned lastindex = rowptr[row+1] - firstentry;
			
 
				+
			
 
				+		for (index = firstindex; index < lastindex; index++)
			
 
				+		{
			
 
				+			unsigned col;
			
 
				+
			
 
				+			col = colind[index];
			
 
				+			tmp += nzval[index]*vecin[col];
			
 
				+		}
			
 
				+
			
 
				+		vecout[row] = tmp;
			
 
				+	}
			
 
				+}
			
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -0,0 +1,147 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include "stencil.h"
			
 
				+
			
 
				+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+
			
 
				+static int verbose = 0;
			
 
				+
			
 
				+static
			
 
				+void display_non_zero_values(TYPE *ptr, char *msg)
			
 
				+{
			
 
				+        if(verbose) {
			
 
				+                int x, y, z;
			
 
				+
			
 
				+                for(z = 0; z < DIM; z++)
			
 
				+                        for(y = 0; y < DIM; y++)
			
 
				+                                for(x = 0; x < DIM; x++) {
			
 
				+                                        TYPE r = ptr[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD];
			
 
				+                                        if(r != 0.0)
			
 
				+                                                printf("%s[%d, %d, %d] == %f\n", msg, z, y, x, r);
			
 
				+                                }
			
 
				+        }
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+        TYPE *data;                         // original data set given to device
			
 
				+        TYPE *results;                      // results returned from device
			
 
				+        TYPE C0 = 0.25;
			
 
				+        TYPE C1 = 0.75;
			
 
				+        starpu_data_handle data_handle;
			
 
				+        starpu_data_handle results_handle;
			
 
				+        starpu_data_handle C0_handle;
			
 
				+        starpu_data_handle C1_handle;
			
 
				+
			
 
				+	starpu_init(NULL);
			
 
				+
			
 
				+        // Filter args: consume leading "--verbose" flags, stop at the first other argument
			
 
				+        argv++;
			
 
				+        while (argc > 1) {
			
 
				+                if(!strcmp(*argv, "--verbose")) {
			
 
				+                        verbose = 1;
			
 
				+                } else
			
 
				+                        break;
			
 
				+                argc--; argv++;
			
 
				+        }
			
 
				+
			
 
				+        // Zero both arrays, then seed a single non-zero cell in the input
			
 
				+        {
			
 
				+                long i, x, y, z;
			
 
				+
			
 
				+                data = (TYPE *)malloc(SIZE * sizeof(TYPE));
			
 
				+                results = (TYPE *)malloc(SIZE * sizeof(TYPE));
			
 
				+                if (!data || !results) {
			
 
				+                        fprintf(stderr, "Malloc failed!\n");
			
 
				+                        return 1; // main returns int: report the failure with a non-zero status
			
 
				+                }
			
 
				+
			
 
				+                for(i = 0; i < SIZE; i++) {
			
 
				+                        data[i] = 0.0;
			
 
				+                        results[i] = 0.0;
			
 
				+                }
			
 
				+
			
 
				+                z = ZBLOCK-1;
			
 
				+                y = YBLOCK-1;
			
 
				+                x = XBLOCK-1;
			
 
				+
			
 
				+                data[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD] = 2.0;
			
 
				+        }
			
 
				+
			
 
				+        display_non_zero_values(data, "data");
			
 
				+
			
 
				+        starpu_register_vector_data(&data_handle, 0 /* home node */,
			
 
				+                                    (uintptr_t)data, SIZE, sizeof(TYPE));
			
 
				+        starpu_register_vector_data(&results_handle, 0 /* home node */,
			
 
				+                                    (uintptr_t)results, SIZE, sizeof(TYPE));
			
 
				+        starpu_register_vector_data(&C0_handle, 0 /* home node */,
			
 
				+                                    (uintptr_t)&C0, 1, sizeof(TYPE));
			
 
				+        starpu_register_vector_data(&C1_handle, 0 /* home node */,
			
 
				+                                    (uintptr_t)&C1, 1, sizeof(TYPE));
			
 
				+
			
 
				+        _starpu_opencl_compile_source_to_opencl("examples/stencil/stencil_opencl_codelet.cl");
			
 
				+
			
 
				+	starpu_codelet cl =
			
 
				+	{
			
 
				+		.where = STARPU_OPENCL,
			
 
				+		.opencl_func = opencl_codelet,
			
 
				+		.nbuffers = 4
			
 
				+	};
			
 
				+
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+                task->cl = &cl;
			
 
				+                task->callback_func = NULL;
			
 
				+
			
 
				+		task->buffers[0].handle = data_handle;
			
 
				+		task->buffers[0].mode = STARPU_R;
			
 
				+		task->buffers[1].handle = results_handle;
			
 
				+		task->buffers[1].mode = STARPU_W;
			
 
				+		task->buffers[2].handle = C0_handle;
			
 
				+		task->buffers[2].mode = STARPU_R;
			
 
				+		task->buffers[3].handle = C1_handle;
			
 
				+		task->buffers[3].mode = STARPU_R;
			
 
				+
			
 
				+		int ret = starpu_submit_task(task);
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			fprintf(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	starpu_wait_all_tasks();
			
 
				+
			
 
				+	/* update the array in RAM */
			
 
				+	starpu_sync_data_with_mem(data_handle, STARPU_R);
			
 
				+	starpu_sync_data_with_mem(results_handle, STARPU_R);
			
 
				+	starpu_sync_data_with_mem(C0_handle, STARPU_R);
			
 
				+	starpu_sync_data_with_mem(C1_handle, STARPU_R);
			
 
				+
			
 
				+	display_non_zero_values(results, "results");
			
 
				+
			
 
				+	starpu_release_data_from_mem(data_handle);
			
 
				+	starpu_release_data_from_mem(results_handle);
			
 
				+	starpu_release_data_from_mem(C0_handle);
			
 
				+	starpu_release_data_from_mem(C1_handle);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/stencil/stencil.h
+++ b/examples/stencil/stencil.h
@@ -0,0 +1,43 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STENCIL_H__
			
 
				+#define __STENCIL_H__
			
 
				+
			
 
				+#define TYPE    float
			
 
				+
			
 
				+#define DIM     (128)
			
 
				+#define BORDER  (1)
			
 
				+
			
 
				+#define PADDING (64 / sizeof(TYPE) - 2*BORDER)
			
 
				+#define FIRST_PAD (PADDING+1)
			
 
				+
			
 
				+#define REALDIM (DIM + 2 * BORDER + PADDING)
			
 
				+
			
 
				+#define SURFACE   ((DIM + 2 * BORDER) * REALDIM)
			
 
				+
			
 
				+#define SIZE    ((DIM + 2 * BORDER) * SURFACE + 1)
			
 
				+
			
 
				+#define XBLOCK  (16)
			
 
				+#define YBLOCK  (16)
			
 
				+#define ZBLOCK  (64)
			
 
				+
			
 
				+#define X_PER_THREAD (1)
			
 
				+#define Y_PER_THREAD (4)
			
 
				+#define Z_PER_THREAD (ZBLOCK)
			
 
				+
			
 
				+#endif
			
 
				+
			
--- a/examples/stencil/stencil_opencl.c
+++ b/examples/stencil/stencil_opencl.c
@@ -0,0 +1,66 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <starpu_util.h>
			
 
				+#include "stencil.h"
			
 
				+
			
 
				+void opencl_codelet(void *descr[], void *_args)
			
 
				+{
			
 
				+	float *data = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	float *results = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				+	float *C0 = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+	float *C1 = (float *)STARPU_GET_VECTOR_PTR(descr[3]);
			
 
				+
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	int id, devid, err;
			
 
				+
			
 
				+        id = starpu_get_worker_id();
			
 
				+        devid = starpu_get_worker_devid(id);
			
 
				+
			
 
				+        err = starpu_opencl_load_kernel(&kernel, &queue, "examples/stencil/stencil_opencl_codelet.cl", "stencil", devid);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = 0;
			
 
				+	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
			
 
				+	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &results);
			
 
				+	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &C0);
			
 
				+	err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &C1);
			
 
				+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	{
			
 
				+		size_t global[3];
			
 
				+		size_t local[3];
			
 
				+
			
 
				+                // Execute the kernel over the entire range of our 3d input data set
			
 
				+                local[0] = XBLOCK / X_PER_THREAD;   // threads along the X axis
			
 
				+                local[1] = YBLOCK / Y_PER_THREAD;   // threads along the Y axis
			
 
				+                local[2] = ZBLOCK / Z_PER_THREAD;   // threads along the Z axis
			
 
				+
			
 
				+                global[0] = DIM / X_PER_THREAD;  // virtual size of global X axis
			
 
				+                global[1] = DIM / Y_PER_THREAD;  // virtual size of global Y axis
			
 
				+                global[2] = DIM / Z_PER_THREAD;  // virtual size of global Z axis
			
 
				+
			
 
				+                err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, local, 0, NULL, NULL);
			
 
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+
			
 
				+	clFinish(queue);
			
 
				+
			
 
				+	starpu_opencl_release(kernel);
			
 
				+}
			
--- a/examples/variable/variable.c
+++ b/examples/variable/variable.c
@@ -0,0 +1,100 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <pthread.h>
			
 
				+
			
 
				+static unsigned niter = 50000;
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+#endif
			
 
				+
			
 
				+extern void cuda_codelet_host(float *tab);
			
 
				+
			
 
				+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+
			
 
				+	*val += 1.0f;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+        float foo;
			
 
				+	starpu_data_handle float_array_handle;
			
 
				+	starpu_codelet cl = {0}; /* zero every field so functions not assigned below (e.g. cuda_func without STARPU_USE_CUDA) are NULL */
			
 
				+
			
 
				+	starpu_init(NULL);
			
 
				+        if (argc == 2) niter = atoi(argv[1]); /* optional single argument overrides the iteration count */
			
 
				+        foo = 0.0f;
			
 
				+
			
 
				+	starpu_register_variable_data(&float_array_handle, 0 /* home node */,
			
 
				+                                      (uintptr_t)&foo, sizeof(float));
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        _starpu_opencl_compile_source_to_opencl("examples/variable/variable_kernels_opencl_codelet.cl");
			
 
				+#endif
			
 
				+
			
 
				+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
			
 
				+        cl.cpu_func = cpu_codelet;
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+        cl.cuda_func = cuda_codelet;
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        cl.opencl_func = opencl_codelet;
			
 
				+#endif
			
 
				+        cl.nbuffers = 1;
			
 
				+        cl.model = NULL;
			
 
				+
			
 
				+	for (i = 0; i < niter; i++) /* submit niter tasks, all read-writing the same variable */
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+                int ret;
			
 
				+
			
 
				+		task->cl = &cl;
			
 
				+
			
 
				+		task->callback_func = NULL;
			
 
				+
			
 
				+		task->buffers[0].handle = float_array_handle;
			
 
				+		task->buffers[0].mode = STARPU_RW;
			
 
				+
			
 
				+		ret = starpu_submit_task(task);
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			fprintf(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	starpu_wait_all_tasks();
			
 
				+
			
 
				+	/* update the array in RAM */
			
 
				+	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
			
 
				+
			
 
				+	fprintf(stderr, "variable -> %f\n", foo);
			
 
				+
			
 
				+	starpu_release_data_from_mem(float_array_handle);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/variable/variable_kernels.cu
+++ b/examples/variable/variable_kernels.cu
@@ -0,0 +1,30 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+static __global__ void cuda_variable(float *tab)
			
 
				+{
			
 
				+	/* Add two to the float behind tab; there is no thread guard, so launch with one thread. */
			
 
				+	*tab = *tab + 2.0f;
			
 
				+}
			
 
				+
			
 
				+/* CUDA implementation of the "variable" codelet: runs cuda_variable with a
			
 
				+ * single thread on the pointer extracted from descr[0].  _args is unused. */
			
 
				+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+
			
 
				+	cuda_variable<<<1,1>>>(val);
			
 
				+
			
 
				+	/* Kernel launches are asynchronous: wait for the kernel to complete
			
 
				+	 * before returning, as the OpenCL codelet does with clFinish(). */
			
 
				+	cudaThreadSynchronize();
			
 
				+}
			
--- a/examples/variable/variable_kernels_opencl.c
+++ b/examples/variable/variable_kernels_opencl.c
@@ -0,0 +1,48 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				+/* OpenCL implementation of the "variable" codelet: loads the "variable"
			
 
				+ * kernel for the device of the current worker, runs it as a single work-item
			
 
				+ * on the buffer described by descr[0] and waits for it to complete.
			
 
				+ * Any OpenCL failure is reported through STARPU_OPENCL_REPORT_ERROR.
			
 
				+ * _args is unused. */
			
 
				+void opencl_codelet(void *descr[], void *_args)
			
 
				+{
			
 
				+	/* NOTE(review): val is handed to clSetKernelArg as a cl_mem below, so the
			
 
				+	 * vector pointer is presumably the OpenCL buffer handle -- confirm. */
			
 
				+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	int id, devid, err;
			
 
				+
			
 
				+	(void)_args;
			
 
				+
			
 
				+	id = starpu_get_worker_id();
			
 
				+	devid = starpu_get_worker_devid(id);
			
 
				+
			
 
				+	err = starpu_opencl_load_kernel(&kernel, &queue,
			
 
				+					"examples/variable/variable_kernels_opencl_codelet.cl", "variable", devid);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	{
			
 
				+		size_t global=1;
			
 
				+		size_t local=1;
			
 
				+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+
			
 
				+	/* The enqueue only queues the kernel: block until it has run, and do not
			
 
				+	 * ignore a failure reported at completion time. */
			
 
				+	err = clFinish(queue);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	starpu_opencl_release(kernel);
			
 
				+}
			
--- a/examples/variable/variable_kernels_opencl_codelet.cl
+++ b/examples/variable/variable_kernels_opencl_codelet.cl
@@ -0,0 +1,23 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* Adds 4 to the first element of input; only work-item 0 performs the update. */
			
 
				+__kernel void variable(__global float* input)
			
 
				+{
			
 
				+	const int i = get_global_id(0);
			
 
				+	if (i == 0)
			
 
				+		input[i] = input[i] + 4.0f; /* float literal: avoids double arithmetic and any fp64 requirement */
			
 
				+}
			
 
				+
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -46,14 +46,19 @@ struct starpu_conf {
 
				 	int ncpus;
			
 
				 	/* maximum number of CUDA GPUs (-1 for default) */
			
 
				 	int ncuda;
			
 
				+	/* maximum number of OpenCL GPUs (-1 for default) */
			
 
				+	int nopencl;
			
 
				 	/* maximum number of Cell's SPUs (-1 for default) */
			
 
				 	int nspus;
			
 
				 
			
 
				 	unsigned use_explicit_workers_bindid;
			
 
				 	unsigned workers_bindid[STARPU_NMAXWORKERS];
			
 
				 
			
 
				-	unsigned use_explicit_workers_gpuid;
			
 
				-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
			
 
				+	unsigned use_explicit_workers_cuda_gpuid;
			
 
				+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+	unsigned use_explicit_workers_opencl_gpuid;
			
 
				+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
			
 
				 
			
 
				 	/* calibrate performance models, if any */
			
 
				 	unsigned calibrate;
			
@@ -75,6 +80,7 @@ unsigned starpu_get_worker_count(void);
 
				 unsigned starpu_get_cpu_worker_count(void);
			
 
				 unsigned starpu_get_cuda_worker_count(void);
			
 
				 unsigned starpu_get_spu_worker_count(void);
			
 
				+unsigned starpu_get_opencl_worker_count(void);
			
 
				 
			
 
				 /* Return the identifier of the thread in case this is associated to a worker.
			
 
				  * This will return -1 if this function is called directly from the application
			
@@ -84,6 +90,7 @@ int starpu_get_worker_id(void);
 
				 enum starpu_archtype {
			
 
				 	STARPU_CPU_WORKER, /* CPU core */
			
 
				 	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
			
 
				+	STARPU_OPENCL_WORKER, /* OpenCL device */
			
 
				 	STARPU_GORDON_WORKER /* Cell SPU */
			
 
				 };
			
 
				 
			
@@ -103,6 +110,11 @@ enum starpu_archtype starpu_get_worker_type(int id);
 
				  * behaviour. */
			
 
				 void starpu_get_worker_name(int id, char *dst, size_t maxlen);
			
 
				 
			
 
				+/* This function returns the device id of the worker associated with an
			
 
				+ *  identifier (as returned by the starpu_get_worker_id() function).
			
 
				+ */
			
 
				+int starpu_get_worker_devid(int id);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -3,16 +3,17 @@
 
				 
			
 
				 #undef STARPU_USE_CPU
			
 
				 #undef STARPU_USE_CUDA
			
 
				+#undef STARPU_USE_OPENCL
			
 
				 #undef STARPU_USE_GORDON
			
 
				 
			
 
				 #undef STARPU_ATLAS
			
 
				 #undef STARPU_GOTO
			
 
				 #undef STARPU_SYSTEM_BLAS
			
 
				 
			
 
				+#undef STARPU_BUILD_DIR
			
 
				+#undef STARPU_OPENCL_DATADIR
			
 
				 #undef STARPU_HAVE_MAGMA
			
 
				 
			
 
				-#undef STARPU_DIR
			
 
				-
			
 
				 #undef STARPU_OPENGL_RENDER
			
 
				 
			
 
				 #undef STARPU_USE_GTK
			
@@ -37,6 +38,7 @@
 
				 
			
 
				 #undef STARPU_NMAXBUFS
			
 
				 #undef STARPU_MAXCUDADEVS
			
 
				+#undef STARPU_MAXOPENCLDEVS
			
 
				 
			
 
				 #undef STARPU_HAVE_LIBNUMA
			
 
				 
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -36,6 +36,8 @@ void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memo
 
				 /* Matrix interface for dense matrices */
			
 
				 typedef struct starpu_matrix_interface_s {
			
 
				 	uintptr_t ptr;
			
 
				+        uintptr_t dev_handle;
			
 
				+        size_t offset;
			
 
				 	uint32_t nx;
			
 
				 	uint32_t ny;
			
 
				 	uint32_t ld;
			
@@ -62,6 +64,8 @@ size_t starpu_get_matrix_elemsize(starpu_data_handle handle);
 
				 /* BLOCK interface for 3D dense blocks */
			
 
				 typedef struct starpu_block_interface_s {
			
 
				 	uintptr_t ptr;
			
 
				+        uintptr_t dev_handle;
			
 
				+        size_t offset;
			
 
				 	uint32_t nx;
			
 
				 	uint32_t ny;
			
 
				 	uint32_t nz;
			
@@ -93,6 +97,8 @@ size_t starpu_get_block_elemsize(starpu_data_handle handle);
 
				 /* vector interface for contiguous (non-strided) buffers */
			
 
				 typedef struct starpu_vector_interface_s {
			
 
				 	uintptr_t ptr;
			
 
				+        uintptr_t dev_handle;
			
 
				+        size_t offset;
			
 
				 	uint32_t nx;
			
 
				 	size_t elemsize;
			
 
				 } starpu_vector_interface_t;
			
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -0,0 +1,192 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_OPENCL_H__
			
 
				+#define __STARPU_OPENCL_H__
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <CL/cl.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+/* Report a failed OpenCL call: prints the name of the calling function and a
			
 
				+ * textual description of the OpenCL error code `status` on stderr, then stops
			
 
				+ * through assert(0) (which is a no-op when NDEBUG is defined).
			
 
				+ * The message buffer is a const char * so the macro is also valid in C++.
			
 
				+ * The including file must provide <stdio.h> and <assert.h>. */
			
 
				+#define STARPU_OPENCL_REPORT_ERROR(status)                                     \
			
 
				+	do {                                                            \
			
 
				+		const char *errormsg;                                   \
			
 
				+		switch (status) {                                       \
			
 
				+		case CL_SUCCESS: errormsg = "success"; break; \
			
 
				+		case CL_DEVICE_NOT_FOUND: errormsg = "Device not found"; break; \
			
 
				+		case CL_DEVICE_NOT_AVAILABLE: errormsg = "Device not available"; break; \
			
 
				+		case CL_COMPILER_NOT_AVAILABLE: errormsg = "Compiler not available"; break; \
			
 
				+		case CL_MEM_OBJECT_ALLOCATION_FAILURE: errormsg = "Memory object allocation failure"; break; \
			
 
				+		case CL_OUT_OF_RESOURCES: errormsg = "Out of resources"; break; \
			
 
				+		case CL_OUT_OF_HOST_MEMORY: errormsg = "Out of host memory"; break; \
			
 
				+		case CL_PROFILING_INFO_NOT_AVAILABLE: errormsg = "Profiling info not available"; break; \
			
 
				+		case CL_MEM_COPY_OVERLAP: errormsg = "Memory copy overlap"; break; \
			
 
				+		case CL_IMAGE_FORMAT_MISMATCH: errormsg = "Image format mismatch"; break; \
			
 
				+		case CL_IMAGE_FORMAT_NOT_SUPPORTED: errormsg = "Image format not supported"; break; \
			
 
				+		case CL_BUILD_PROGRAM_FAILURE: errormsg = "Build program failure"; break; \
			
 
				+		case CL_MAP_FAILURE: errormsg = "Map failure"; break; \
			
 
				+		case CL_INVALID_VALUE: errormsg = "Invalid value"; break; \
			
 
				+		case CL_INVALID_DEVICE_TYPE: errormsg = "Invalid device type"; break; \
			
 
				+		case CL_INVALID_PLATFORM: errormsg = "Invalid platform"; break; \
			
 
				+		case CL_INVALID_DEVICE: errormsg = "Invalid device"; break; \
			
 
				+		case CL_INVALID_CONTEXT: errormsg = "Invalid context"; break; \
			
 
				+		case CL_INVALID_QUEUE_PROPERTIES: errormsg = "Invalid queue properties"; break; \
			
 
				+		case CL_INVALID_COMMAND_QUEUE: errormsg = "Invalid command queue"; break; \
			
 
				+		case CL_INVALID_HOST_PTR: errormsg = "Invalid host pointer"; break; \
			
 
				+		case CL_INVALID_MEM_OBJECT: errormsg = "Invalid memory object"; break; \
			
 
				+		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: errormsg = "Invalid image format descriptor"; break; \
			
 
				+		case CL_INVALID_IMAGE_SIZE: errormsg = "Invalid image size"; break; \
			
 
				+		case CL_INVALID_SAMPLER: errormsg = "Invalid sampler"; break; \
			
 
				+		case CL_INVALID_BINARY: errormsg = "Invalid binary"; break; \
			
 
				+		case CL_INVALID_BUILD_OPTIONS: errormsg = "Invalid build options"; break; \
			
 
				+		case CL_INVALID_PROGRAM: errormsg = "Invalid program"; break; \
			
 
				+		case CL_INVALID_PROGRAM_EXECUTABLE: errormsg = "Invalid program executable"; break; \
			
 
				+		case CL_INVALID_KERNEL_NAME: errormsg = "Invalid kernel name"; break; \
			
 
				+		case CL_INVALID_KERNEL_DEFINITION: errormsg = "Invalid kernel definition"; break; \
			
 
				+		case CL_INVALID_KERNEL: errormsg = "Invalid kernel"; break; \
			
 
				+		case CL_INVALID_ARG_INDEX: errormsg = "Invalid argument index"; break; \
			
 
				+		case CL_INVALID_ARG_VALUE: errormsg = "Invalid argument value"; break; \
			
 
				+		case CL_INVALID_ARG_SIZE: errormsg = "Invalid argument size"; break; \
			
 
				+		case CL_INVALID_KERNEL_ARGS: errormsg = "Invalid kernel arguments"; break; \
			
 
				+		case CL_INVALID_WORK_DIMENSION: errormsg = "Invalid work dimension"; break; \
			
 
				+		case CL_INVALID_WORK_GROUP_SIZE: errormsg = "Invalid work group size"; break; \
			
 
				+		case CL_INVALID_WORK_ITEM_SIZE: errormsg = "Invalid work item size"; break; \
			
 
				+		case CL_INVALID_GLOBAL_OFFSET: errormsg = "Invalid global offset"; break; \
			
 
				+		case CL_INVALID_EVENT_WAIT_LIST: errormsg = "Invalid event wait list"; break; \
			
 
				+		case CL_INVALID_EVENT: errormsg = "Invalid event"; break; \
			
 
				+		case CL_INVALID_OPERATION: errormsg = "Invalid operation"; break; \
			
 
				+		case CL_INVALID_GL_OBJECT: errormsg = "Invalid GL object"; break; \
			
 
				+		case CL_INVALID_BUFFER_SIZE: errormsg = "Invalid buffer size"; break; \
			
 
				+		case CL_INVALID_MIP_LEVEL: errormsg = "Invalid MIP level"; break; \
			
 
				+		case CL_INVALID_GLOBAL_WORK_SIZE: errormsg = "Invalid global work size"; break; \
			
 
				+		default: errormsg = "unknown error"; break; \
			
 
				+		}                                                       \
			
 
				+		fprintf(stderr, "oops in %s ... <%s> \n", __func__, errormsg);	\
			
 
				+		assert(0);	                                        \
			
 
				+	} while (0)
			
 
				+
			
 
				+void starpu_opencl_get_context(int devid, cl_context *context);
			
 
				+void starpu_opencl_get_device(int devid, cl_device_id *device);
			
 
				+void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
			
 
				+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid);
			
 
				+int starpu_opencl_release(cl_kernel kernel);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif // STARPU_USE_OPENCL
			
 
				+#endif // __STARPU_OPENCL_H__
			
 
				+
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -21,6 +21,7 @@
 
				 #include <pthread.h>
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_config.h>
			
 
				+#include <starpu_task.h>
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 extern "C" {
			
@@ -39,8 +40,9 @@ struct starpu_buffer_descr_t;
 
				 enum starpu_perf_archtype {
			
 
				 	STARPU_CPU_DEFAULT = 0,
			
 
				 	STARPU_CUDA_DEFAULT = 1,
			
 
				-	/* STARPU_CUDA_DEFAULT + devid */
			
 
				-	STARPU_GORDON_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS
			
 
				+	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
			
 
				+	/* STARPU_OPENCL_DEFAULT + devid */
			
 
				+	STARPU_GORDON_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS
			
 
				 };
			
 
				 
			
 
				 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -31,6 +31,7 @@
 
				 #define STARPU_CUDA	((1ULL)<<3)
			
 
				 #define STARPU_SPU	((1ULL)<<4)
			
 
				 #define STARPU_GORDON	((1ULL)<<5)
			
 
				+#define STARPU_OPENCL	((1ULL)<<6)
			
 
				 
			
 
				 #define STARPU_MIN_PRIO        (-4)
			
 
				 #define STARPU_MAX_PRIO        5
			
@@ -53,6 +54,7 @@ typedef struct starpu_codelet_t {
 
				 	/* the different implementations of the codelet */
			
 
				 	void (*cuda_func)(void **, void *);
			
 
				 	void (*cpu_func)(void **, void *);
			
 
				+	void (*opencl_func)(void **, void *);
			
 
				 	uint8_t gordon_func;
			
 
				 
			
 
				 	/* how many buffers do the codelet takes as argument ? */
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -157,3 +157,8 @@ endif
 
				 if STARPU_USE_GORDON
			
 
				 libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
			
 
				 endif
			
 
				+
			
 
				+if STARPU_USE_OPENCL
			
 
				+libstarpu_la_SOURCES += drivers/opencl/driver_opencl.c
			
 
				+libstarpu_la_SOURCES += drivers/opencl/driver_opencl_utils.c
			
 
				+endif
			
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -49,6 +49,7 @@ typedef void (*callback)(void *);
 
				 #define STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
			
 
				 #define STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
			
 
				 #define STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
			
 
				+#define STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
			
 
				 
			
 
				 /* a job is the internal representation of a task */
			
 
				 LIST_TYPE(starpu_job,
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -82,6 +82,9 @@ static double common_task_expected_length(struct starpu_perfmodel_t *model, uint
 
				 			case STARPU_CUDA:
			
 
				 				alpha = STARPU_CUDA_ALPHA;
			
 
				 				break;
			
 
				+  		        case STARPU_OPENCL:
			
 
				+	                        alpha = STARPU_OPENCL_ALPHA;
			
 
				+                                break;
			
 
				 			default:
			
 
				 				/* perhaps there are various worker types on that queue */
			
 
				 				alpha = 1.0; // this value is not significant ...
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -106,8 +106,11 @@ double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_
 
				 void _starpu_set_calibrate_flag(unsigned val);
			
 
				 unsigned _starpu_get_calibrate_flag(void);
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-int *_starpu_get_gpu_affinity_vector(unsigned gpuid);
			
 
				+#if defined(STARPU_USE_CUDA)
			
 
				+int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
			
 
				 #endif
			
 
				- 
			
 
				+#if defined(STARPU_USE_OPENCL)
			
 
				+int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
			
 
				+#endif
			
 
				+
			
 
				 #endif // __PERFMODEL_H__
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -23,8 +23,10 @@
 
				 #include <unistd.h>
			
 
				 #include <sys/time.h>
			
 
				 #include <stdlib.h>
			
 
				+#include <math.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				 #include <common/config.h>
			
 
				 #include <core/workers.h>
			
 
				 #include <core/perfmodel/perfmodel.h>
			
@@ -34,7 +36,7 @@
 
				 
			
 
				 #define MAXCPUS	32
			
 
				 
			
 
				-struct cudadev_timing {
			
 
				+struct dev_timing {
			
 
				 	int cpu_id;
			
 
				 	double timing_htod;
			
 
				 	double timing_dtoh;
			
@@ -43,19 +45,29 @@ struct cudadev_timing {
 
				 static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
			
 
				 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
			
 
				 static unsigned was_benchmarked = 0;
			
 
				+static unsigned ncpus = 0;
			
 
				 static int ncuda = 0;
			
 
				-
			
 
				-static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
			
 
				+static int nopencl = 0;
			
 
				 
			
 
				 /* Benchmarking the performance of the bus */
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
			
 
				 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
			
 
				 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
			
 
				+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
			
 
				+static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
			
 
				+static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
			
 
				+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
			
 
				+#endif
			
 
				 
			
 
				-static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				 
			
 
				-static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
			
 
				 {
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
@@ -84,7 +96,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 
			
 
				 	/* Allocate a buffer on the host */
			
 
				 	unsigned char *h_buffer;
			
 
				-	cudaHostAlloc((void **)&h_buffer, SIZE, 0); 
			
 
				+	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
			
 
				 	assert(h_buffer);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
@@ -104,7 +116,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
 
				 
			
 
				-	cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
			
 
				+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
			
 
				 
			
 
				 	/* Measure upload bandwidth */
			
 
				 	gettimeofday(&start, NULL);
			
@@ -116,7 +128,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 	gettimeofday(&end, NULL);
			
 
				 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
			
 
				+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
			
 
				 
			
 
				 	/* Measure download bandwidth */
			
 
				 	gettimeofday(&start, NULL);
			
@@ -128,7 +140,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 	gettimeofday(&end, NULL);
			
 
				 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh = timing/NITER;
			
 
				+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
			
 
				 
			
 
				 	/* Free buffers */
			
 
				 	cudaFreeHost(h_buffer);
			
@@ -137,18 +149,101 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 	cudaThreadExit();
			
 
				 
			
 
				 }
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
			
 
				+{
			
 
				+        cl_context context;
			
 
				+        cl_command_queue queue;
			
 
				+
			
 
				+        struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				+
			
 
				+	/* This function times host<->device transfers for OpenCL device `dev`
			
 
				+	 * with the calling thread bound to `cpu`, and stores the averaged
			
 
				+	 * per-transfer timings in dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].
			
 
				+	 *
			
 
				+	 * Initialize OpenCL context on the device, then fetch the context and
			
 
				+	 * the command queue used by every call below. */
			
 
				+        _starpu_opencl_init_context(dev);
			
 
				+        starpu_opencl_get_context(dev, &context);
			
 
				+        starpu_opencl_get_queue(dev, &queue);
			
 
				+
			
 
				+	/* hack to avoid third party libs to rebind threads */
			
 
				+	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				+
			
 
				+	/* Allocate a buffer of SIZE bytes on the device */
			
 
				+        int err;
			
 
				+	cl_mem d_buffer;
			
 
				+	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, SIZE, NULL, &err);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	/* hack to avoid third party libs to rebind threads */
			
 
				+	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				+
			
 
				+        /* Allocate a buffer of SIZE bytes on the host */
			
 
				+	unsigned char *h_buffer;
			
 
				+        h_buffer = malloc(SIZE);
			
 
				+	assert(h_buffer);
			
 
				+
			
 
				+	/* hack to avoid third party libs to rebind threads */
			
 
				+	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				+
			
 
				+        /* Fill them: zero the host buffer and copy it once to the device
			
 
				+         * before the timed loops start */
			
 
				+	memset(h_buffer, 0, SIZE);
			
 
				+        err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	/* hack to avoid third party libs to rebind threads */
			
 
				+	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				+
			
 
				+        unsigned iter;
			
 
				+	double timing;
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
			
 
				+
			
 
				+	/* Measure upload bandwidth: NITER blocking (CL_TRUE) writes of SIZE
			
 
				+	 * bytes; the elapsed time is computed in microseconds and the average
			
 
				+	 * per transfer is stored in timing_htod */
			
 
				+	gettimeofday(&start, NULL);
			
 
				+	for (iter = 0; iter < NITER; iter++)
			
 
				+	{
			
 
				+                err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
			
 
				+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+	gettimeofday(&end, NULL);
			
 
				+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+
			
 
				+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
			
 
				+
			
 
				+	/* Measure download bandwidth: same scheme with NITER blocking reads,
			
 
				+	 * the average per transfer is stored in timing_dtoh */
			
 
				+	gettimeofday(&start, NULL);
			
 
				+	for (iter = 0; iter < NITER; iter++)
			
 
				+	{
			
 
				+                err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
			
 
				+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+	gettimeofday(&end, NULL);
			
 
				+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+
			
 
				+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
			
 
				+
			
 
				+	/* Free buffers */
			
 
				+	clReleaseMemObject(d_buffer);
			
 
				+	free(h_buffer);
			
 
				+
			
 
				+	/* Uninitialize OpenCL context on the device */
			
 
				+        _starpu_opencl_deinit_context(dev);
			
 
				+}
			
 
				+#endif
			
 
				 
			
 
				 /* NB: we want to sort the bandwidth by DECREASING order */
			
 
				-static int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
			
 
				+static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
			
 
				 {
			
 
				-	const struct cudadev_timing *left = left_cudadev_timing;
			
 
				-	const struct cudadev_timing *right = right_cudadev_timing;
			
 
				-	
			
 
				+	const struct dev_timing *left = left_dev_timing;
			
 
				+	const struct dev_timing *right = right_dev_timing;
			
 
				+
			
 
				 	double left_dtoh = left->timing_dtoh;
			
 
				 	double left_htod = left->timing_htod;
			
 
				 	double right_dtoh = right->timing_dtoh;
			
 
				 	double right_htod = right->timing_htod;
			
 
				-	
			
 
				+
			
 
				 	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
			
 
				 	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
			
 
				 
			
@@ -156,47 +251,55 @@ static int compar_cudadev_timing(const void *left_cudadev_timing, const void *ri
 
				 	return (bandwidth_sum2_left < bandwidth_sum2_right);
			
 
				 }
			
 
				 
			
 
				-static void measure_bandwidth_between_host_and_dev(int dev, unsigned ncpus)
			
 
				+static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
			
 
				+                                                   struct dev_timing *dev_timing_per_cpu, char type)
			
 
				 {
			
 
				 	unsigned cpu;
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 	{
			
 
				-		measure_bandwidth_between_host_and_dev_on_cpu(dev, cpu);
			
 
				-	}
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+                if (type == 'C')
			
 
				+                        measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                if (type == 'O')
			
 
				+                        measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
			
 
				+#endif
			
 
				+        }
			
 
				 
			
 
				 	/* sort the results */
			
 
				-	qsort(cudadev_timing_per_cpu[dev+1], ncpus,
			
 
				-			sizeof(struct cudadev_timing),
			
 
				-			compar_cudadev_timing);
			
 
				-	
			
 
				+	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
			
 
				+              sizeof(struct dev_timing),
			
 
				+			compar_dev_timing);
			
 
				+
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 	{
			
 
				-		unsigned current_cpu = cudadev_timing_per_cpu[dev+1][cpu].cpu_id;
			
 
				-		double bandwidth_dtoh = cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh;
			
 
				-		double bandwidth_htod = cudadev_timing_per_cpu[dev+1][cpu].timing_htod;
			
 
				+		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
			
 
				+		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
			
 
				+		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
			
 
				 
			
 
				 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
			
 
				 
			
 
				 		fprintf(stderr, "BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
			
 
				 	}
			
 
				 
			
 
				-	unsigned best_cpu = cudadev_timing_per_cpu[dev+1][0].cpu_id;
			
 
				+	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
			
 
				 
			
 
				 	fprintf(stderr, "BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
			
 
				 #endif
			
 
				 
			
 
				 	/* The results are sorted in a decreasing order, so that the best
			
 
				 	 * measurement is currently the first entry. */
			
 
				-	cudadev_timing_dtoh[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_dtoh;
			
 
				-	cudadev_timing_htod[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_htod;
			
 
				+	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
			
 
				+	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
			
 
				 }
			
 
				-#endif
			
 
				+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
			
 
				 
			
 
				-static void benchmark_all_cuda_devices(void)
			
 
				+static void benchmark_all_gpu_devices(void)
			
 
				 {
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	int ret;
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+	int i, ret;
			
 
				 
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 	fprintf(stderr, "Benchmarking the speed of the bus\n");
			
@@ -213,15 +316,24 @@ static void benchmark_all_cuda_devices(void)
 
				 	}
			
 
				 
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+	ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				         cudaGetDeviceCount(&ncuda);
			
 
				-	int i;
			
 
				 	for (i = 0; i < ncuda; i++)
			
 
				 	{
			
 
				 		/* measure bandwidth between Host and Device i */
			
 
				-		measure_bandwidth_between_host_and_dev(i, ncpus);
			
 
				+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
			
 
				+	}
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        nopencl = _starpu_opencl_get_device_count();
			
 
				+	for (i = 0; i < nopencl; i++)
			
 
				+	{
			
 
				+		/* measure bandwidth between Host and Device i */
			
 
				+		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
			
 
				 	}
			
 
				+#endif
			
 
				 
			
 
				 	/* FIXME: use hwloc */
			
 
				 	/* Restore the former affinity */
			
@@ -235,7 +347,7 @@ static void benchmark_all_cuda_devices(void)
 
				 #ifdef STARPU_VERBOSE
			
 
				 	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
			
 
				 #endif
			
 
				-#endif
			
 
				+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
			
 
				 
			
 
				 	was_benchmarked = 1;
			
 
				 }
			
@@ -244,7 +356,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 
				 {
			
 
				 	_starpu_get_perf_model_dir_bus(path, maxlen);
			
 
				 	strncat(path, type, maxlen);
			
 
				-	
			
 
				+
			
 
				 	char hostname[32];
			
 
				 	gethostname(hostname, 32);
			
 
				 	strncat(path, ".", maxlen);
			
@@ -270,13 +382,13 @@ static void load_bus_affinity_file_content(void)
 
				 	f = fopen(path, "r");
			
 
				 	STARPU_ASSERT(f);
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+	ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+        int gpu;
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				         cudaGetDeviceCount(&ncuda);
			
 
				-
			
 
				-	int gpu;
			
 
				 	for (gpu = 0; gpu < ncuda; gpu++)
			
 
				 	{
			
 
				 		int ret;
			
@@ -292,7 +404,7 @@ static void load_bus_affinity_file_content(void)
 
				 		unsigned cpu;
			
 
				 		for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 		{
			
 
				-			ret = fscanf(f, "%d\t", &affinity_matrix[gpu][cpu]);
			
 
				+			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
			
 
				 			STARPU_ASSERT(ret == 1);
			
 
				 		}
			
 
				 
			
@@ -300,6 +412,32 @@ static void load_bus_affinity_file_content(void)
 
				 		STARPU_ASSERT(ret == 0);
			
 
				 	}
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        nopencl = _starpu_opencl_get_device_count();
			
 
				+	for (gpu = 0; gpu < nopencl; gpu++)
			
 
				+	{
			
 
				+		int ret;
			
 
				+
			
 
				+		int dummy;
			
 
				+
			
 
				+		starpu_drop_comments(f);
			
 
				+		ret = fscanf(f, "%d\t", &dummy);
			
 
				+		STARPU_ASSERT(ret == 1);
			
 
				+
			
 
				+		STARPU_ASSERT(dummy == gpu);
			
 
				+
			
 
				+		unsigned cpu;
			
 
				+		for (cpu = 0; cpu < ncpus; cpu++)
			
 
				+		{
			
 
				+			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
			
 
				+			STARPU_ASSERT(ret == 1);
			
 
				+		}
			
 
				+
			
 
				+		ret = fscanf(f, "\n");
			
 
				+		STARPU_ASSERT(ret == 0);
			
 
				+	}
			
 
				+#endif
			
 
				+#endif
			
 
				 
			
 
				 	fclose(f);
			
 
				 }
			
@@ -320,24 +458,36 @@ static void write_bus_affinity_file_content(void)
 
				 		STARPU_ABORT();
			
 
				 	}
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				 	unsigned cpu;
			
 
				+        int gpu;
			
 
				 
			
 
				-	fprintf(f, "# GPU\t");
			
 
				+        fprintf(f, "# GPU\t");
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 		fprintf(f, "CPU%d\t", cpu);
			
 
				 	fprintf(f, "\n");
			
 
				 
			
 
				-	int gpu;
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				 	for (gpu = 0; gpu < ncuda; gpu++)
			
 
				 	{
			
 
				 		fprintf(f, "%d\t", gpu);
			
 
				 
			
 
				 		for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 		{
			
 
				-			fprintf(f, "%d\t", cudadev_timing_per_cpu[gpu+1][cpu].cpu_id);
			
 
				+			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
			
 
				+		}
			
 
				+
			
 
				+		fprintf(f, "\n");
			
 
				+	}
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	for (gpu = 0; gpu < nopencl; gpu++)
			
 
				+	{
			
 
				+		fprintf(f, "%d\t", gpu);
			
 
				+
			
 
				+		for (cpu = 0; cpu < ncpus; cpu++)
			
 
				+		{
			
 
				+                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
			
 
				 		}
			
 
				 
			
 
				 		fprintf(f, "\n");
			
@@ -345,12 +495,13 @@ static void write_bus_affinity_file_content(void)
 
				 #endif
			
 
				 
			
 
				 	fclose(f);
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 static void generate_bus_affinity_file(void)
			
 
				 {
			
 
				 	if (!was_benchmarked)
			
 
				-		benchmark_all_cuda_devices();
			
 
				+		benchmark_all_gpu_devices();
			
 
				 
			
 
				 	write_bus_affinity_file_content();
			
 
				 }
			
@@ -372,10 +523,19 @@ static void load_bus_affinity_file(void)
 
				 	load_bus_affinity_file_content();
			
 
				 }
			
 
				 
			
 
				-int *_starpu_get_gpu_affinity_vector(unsigned gpuid)
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
			
 
				+{
			
 
				+        return cuda_affinity_matrix[gpuid];
			
 
				+}
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
			
 
				 {
			
 
				-	return affinity_matrix[gpuid];
			
 
				+        return opencl_affinity_matrix[gpuid];
			
 
				 }
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				 
			
 
				 /*
			
 
				  *	Latency
			
@@ -420,7 +580,7 @@ static void load_bus_latency_file_content(void)
 
				 
			
 
				 static void write_bus_latency_file_content(void)
			
 
				 {
			
 
				-	int src, dst;
			
 
				+        int src, dst, maxnode;
			
 
				 	FILE *f;
			
 
				 
			
 
				 	STARPU_ASSERT(was_benchmarked);
			
@@ -440,13 +600,17 @@ static void write_bus_latency_file_content(void)
 
				 		fprintf(f, "to %d\t\t", dst);
			
 
				 	fprintf(f, "\n");
			
 
				 
			
 
				-	for (src = 0; src < STARPU_MAXNODES; src++)
			
 
				+        maxnode = ncuda;
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        maxnode += nopencl;
			
 
				+#endif
			
 
				+        for (src = 0; src < STARPU_MAXNODES; src++)
			
 
				 	{
			
 
				 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
			
 
				 		{
			
 
				 			double latency;
			
 
				 
			
 
				-			if ((src > ncuda) || (dst > ncuda))
			
 
				+			if ((src > maxnode) || (dst > maxnode))
			
 
				 			{
			
 
				 				/* convention */
			
 
				 				latency = -1.0;
			
@@ -471,7 +635,7 @@ static void write_bus_latency_file_content(void)
 
				 static void generate_bus_latency_file(void)
			
 
				 {
			
 
				 	if (!was_benchmarked)
			
 
				-		benchmark_all_cuda_devices();
			
 
				+		benchmark_all_gpu_devices();
			
 
				 
			
 
				 	write_bus_latency_file_content();
			
 
				 }
			
@@ -494,7 +658,7 @@ static void load_bus_latency_file(void)
 
				 }
			
 
				 
			
 
				 
			
 
				-/* 
			
 
				+/*
			
 
				  *	Bandwidth
			
 
				  */
			
 
				 static void get_bandwidth_path(char *path, size_t maxlen)
			
@@ -540,7 +704,7 @@ static void load_bus_bandwidth_file_content(void)
 
				 
			
 
				 static void write_bus_bandwidth_file_content(void)
			
 
				 {
			
 
				-	int src, dst;
			
 
				+	int src, dst, maxnode;
			
 
				 	FILE *f;
			
 
				 
			
 
				 	STARPU_ASSERT(was_benchmarked);
			
@@ -556,25 +720,38 @@ static void write_bus_bandwidth_file_content(void)
 
				 		fprintf(f, "to %d\t\t", dst);
			
 
				 	fprintf(f, "\n");
			
 
				 
			
 
				+        maxnode = ncuda;
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        maxnode += nopencl;
			
 
				+#endif
			
 
				 	for (src = 0; src < STARPU_MAXNODES; src++)
			
 
				 	{
			
 
				 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
			
 
				 		{
			
 
				 			double bandwidth;
			
 
				-			
			
 
				-			if ((src > ncuda) || (dst > ncuda))
			
 
				+
			
 
				+			if ((src > maxnode) || (dst > maxnode))
			
 
				 			{
			
 
				 				bandwidth = -1.0;
			
 
				 			}
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				 			else if (src != dst)
			
 
				 			{
			
 
				-			/* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
			
 
				-				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
			
 
				-				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
			
 
				-				
			
 
				+                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
			
 
				+                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
			
 
				+                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                                if (src > ncuda)
			
 
				+                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
			
 
				+                                if (dst > ncuda)
			
 
				+                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
			
 
				+#endif
			
 
				+
			
 
				 				double timing =time_src_to_ram + time_ram_to_dst;
			
 
				-				
			
 
				+
			
 
				 				bandwidth = 1.0*SIZE/timing;
			
 
				 			}
			
 
				 #endif
			
@@ -582,7 +759,7 @@ static void write_bus_bandwidth_file_content(void)
 
				 			        /* convention */
			
 
				 			        bandwidth = 0.0;
			
 
				 			}
			
 
				-			
			
 
				+
			
 
				 			fprintf(f, "%lf\t", bandwidth);
			
 
				 		}
			
 
				 
			
@@ -595,7 +772,7 @@ static void write_bus_bandwidth_file_content(void)
 
				 static void generate_bus_bandwidth_file(void)
			
 
				 {
			
 
				 	if (!was_benchmarked)
			
 
				-		benchmark_all_cuda_devices();
			
 
				+		benchmark_all_gpu_devices();
			
 
				 
			
 
				 	write_bus_bandwidth_file_content();
			
 
				 }
			
@@ -618,6 +795,96 @@ static void load_bus_bandwidth_file(void)
 
				 }
			
 
				 
			
 
				 /*
			
 
				+ *	Config
			
 
				+ */
			
 
				+static void get_config_path(char *path, size_t maxlen)
			
 
				+{
			
 
				+	get_bus_path("config", path, maxlen);
			
 
				+}
			
 
				+/* Re-run bus sampling when the saved config file is missing or its CPU/CUDA/OpenCL counts differ from the current machine. */
			
 
				+static void check_bus_config_file(void)
			
 
				+{
			
 
				+        int res;
			
 
				+        char path[256];
			
 
				+        /* A non-zero access() result means there is no config file: sample from scratch. */
			
 
				+        get_config_path(path, 256);
			
 
				+        res = access(path, F_OK);
			
 
				+        if (res) {
			
 
				+                starpu_force_bus_sampling();
			
 
				+        }
			
 
				+        else {
			
 
				+                FILE *f;
			
 
				+                int ret, read_cuda, read_opencl;
			
 
				+                unsigned read_cpus;
			
 
				+                struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+
			
 
				+                // Load the counts saved in the file (read_cpus is unsigned, hence %u)
			
 
				+                f = fopen(path, "r");
			
 
				+                STARPU_ASSERT(f);
			
 
				+                starpu_drop_comments(f);
			
 
				+                ret = fscanf(f, "%u\t", &read_cpus);
			
 
				+                STARPU_ASSERT(ret == 1);
			
 
				+                starpu_drop_comments(f);
			
 
				+                ret = fscanf(f, "%d\t", &read_cuda);
			
 
				+                STARPU_ASSERT(ret == 1);
			
 
				+                starpu_drop_comments(f);
			
 
				+                ret = fscanf(f, "%d\t", &read_opencl);
			
 
				+                STARPU_ASSERT(ret == 1);
			
 
				+                starpu_drop_comments(f);
			
 
				+                fclose(f);
			
 
				+
			
 
				+                // Query the current counts; a count whose backend is compiled out keeps its previous value
			
 
				+                ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+                cudaGetDeviceCount(&ncuda);
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                nopencl = _starpu_opencl_get_device_count();
			
 
				+#endif
			
 
				+
			
 
				+                // Resample on the first mismatch found (CPUs, then CUDA, then OpenCL)
			
 
				+                if (read_cpus != ncpus) {
			
 
				+                        fprintf(stderr, "Current configuration does not match the performance model (CPUS: %u != %u)\n", read_cpus, (unsigned)ncpus);
			
 
				+                        starpu_force_bus_sampling();
			
 
				+                }
			
 
				+                else if (read_cuda != ncuda) {
			
 
				+                        fprintf(stderr, "Current configuration does not match the performance model (CUDA: %d != %d)\n", read_cuda, ncuda);
			
 
				+                        starpu_force_bus_sampling();
			
 
				+                }
			
 
				+                else if (read_opencl != nopencl) {
			
 
				+                        fprintf(stderr, "Current configuration does not match the performance model (OpenCL: %d != %d)\n", read_opencl, nopencl);
			
 
				+                        starpu_force_bus_sampling();
			
 
				+                }
			
 
				+        }
			
 
				+}
			
 
				+/* Write the current CPU, CUDA and OpenCL device counts to the bus config file, one value per line. */
			
 
				+static void write_bus_config_file_content(void)
			
 
				+{
			
 
				+	FILE *f;
			
 
				+	char path[256];
			
 
				+	/* Must only be called after the bus benchmark has run (asserted below). */
			
 
				+	STARPU_ASSERT(was_benchmarked);
			
 
				+        get_config_path(path, 256);
			
 
				+        f = fopen(path, "w+");
			
 
				+	STARPU_ASSERT(f);
			
 
				+	/* Each value is followed by a '#' annotation; presumably skipped by starpu_drop_comments() on load -- confirm. */
			
 
				+        fprintf(f, "# Current configuration\n");
			
 
				+        fprintf(f, "%d # Number of CPUs\n", ncpus);
			
 
				+        fprintf(f, "%d # Number of CUDA devices\n", ncuda);
			
 
				+        fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
			
 
				+
			
 
				+        fclose(f);
			
 
				+}
			
 
				+/* Benchmark the bus if that has not been done yet, then write the config file. */
			
 
				+static void generate_bus_config_file(void)
			
 
				+{
			
 
				+	if (!was_benchmarked)
			
 
				+		benchmark_all_gpu_devices();
			
 
				+
			
 
				+	write_bus_config_file_content();
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				  *	Generic
			
 
				  */
			
 
				 
			
@@ -628,12 +895,14 @@ void starpu_force_bus_sampling(void)
 
				 	generate_bus_affinity_file();
			
 
				 	generate_bus_latency_file();
			
 
				 	generate_bus_bandwidth_file();
			
 
				+        generate_bus_config_file();
			
 
				 }
			
 
				 
			
 
				 void _starpu_load_bus_performance_files(void)
			
 
				 {
			
 
				 	_starpu_create_sampling_directory_if_needed();
			
 
				 
			
 
				+        check_bus_config_file();
			
 
				 	load_bus_affinity_file();
			
 
				 	load_bus_latency_file();
			
 
				 	load_bus_bandwidth_file();
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -21,6 +21,7 @@
 
				 #include <core/debug.h>
			
 
				 #include <core/topology.h>
			
 
				 #include <drivers/cuda/driver_cuda.h>
			
 
				+#include <common/hash.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 #include <hwloc.h>
			
@@ -37,8 +38,16 @@ static unsigned topology_is_initialized = 0;
 
				 
			
 
				 static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config);
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config);
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+#  ifdef STARPU_USE_CUDA
			
 
				+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config);
			
 
				+static struct starpu_htbl32_node_s *devices_using_cuda = NULL;
			
 
				+#  endif
			
 
				+#  ifdef STARPU_USE_OPENCL
			
 
				+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config);
			
 
				+#  endif
			
 
				+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
			
 
				+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus);
			
 
				 static unsigned may_bind_automatically = 0;
			
 
				 #endif
			
 
				 
			
@@ -47,12 +56,70 @@ static unsigned may_bind_automatically = 0;
 
				  */
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config)
			
 
				+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config)
			
 
				+{
			
 
				+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_cuda_gpuid,
			
 
				+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_cuda_gpuid,
			
 
				+                                         &(config->current_cuda_gpuid), (int *)config->workers_cuda_gpuid, "STARPU_WORKERS_CUDAID",
			
 
				+                                         config->nhwcudagpus);
			
 
				+}
			
 
				+#endif
			
 
				+/* Fill config->workers_opencl_gpuid through _starpu_initialize_workers_gpuid(), then drop ids found in devices_using_cuda and duplicate ids; trailing slots are set to -1. */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config)
			
 
				+{
			
 
				+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_opencl_gpuid,
			
 
				+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_opencl_gpuid,
			
 
				+                                         &(config->current_opencl_gpuid), (int *)config->workers_opencl_gpuid, "STARPU_WORKERS_OPENCLID",
			
 
				+                                         config->nhwopenclgpus);
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+        // Keep only the ids that are not registered in devices_using_cuda, packed at the front of the array
			
 
				+        {
			
 
				+                unsigned tmp[STARPU_NMAXWORKERS];
			
 
				+                unsigned nb=0;
			
 
				+                int i;
			
 
				+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
			
 
				+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
			
 
				+                        if (_starpu_htbl_search_32(devices_using_cuda, key) == NULL) {
			
 
				+                                tmp[nb] = config->workers_opencl_gpuid[i];
			
 
				+                                nb++;
			
 
				+                        }
			
 
				+                }
			
 
				+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
			
 
				+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
			
 
				+        }
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				+        {
			
 
				+                // Keep only the first occurrence of each id, packed at the front of the array
			
 
				+                struct starpu_htbl32_node_s *devices_already_used = NULL;
			
 
				+                unsigned tmp[STARPU_NMAXWORKERS];
			
 
				+                unsigned nb=0;
			
 
				+                int i;
			
 
				+                // NOTE(review): devices_already_used is not released in this block -- confirm whether it leaks
			
 
				+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
			
 
				+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
			
 
				+                        if (_starpu_htbl_search_32(devices_already_used, key) == NULL) {
			
 
				+                                _starpu_htbl_insert_32(&devices_already_used, key, config);
			
 
				+                                tmp[nb] = config->workers_opencl_gpuid[i];
			
 
				+                                nb ++;
			
 
				+                        }
			
 
				+                }
			
 
				+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
			
 
				+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
			
 
				+        }
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
			
 
				+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus)
			
 
				 {
			
 
				 	char *strval;
			
 
				 	unsigned i;
			
 
				 
			
 
				-	config->current_gpuid = 0;
			
 
				+	*current = 0;
			
 
				 
			
 
				 	/* conf->workers_bindid indicates the successive cpu identifier that
			
 
				 	 * should be used to bind the workers. It should be either filled
			
@@ -62,14 +129,14 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
				 	 * cpus. */
			
 
				 
			
 
				 	/* what do we use, explicit value, env. variable, or round-robin ? */
			
 
				-	if (config->user_conf && config->user_conf->use_explicit_workers_gpuid)
			
 
				+	if (use_explicit_workers_gpuid)
			
 
				 	{
			
 
				 		/* we use the explicit value from the user */
			
 
				-		memcpy(config->workers_gpuid,
			
 
				-			config->user_conf->workers_gpuid,
			
 
				-			STARPU_NMAXWORKERS*sizeof(unsigned));
			
 
				+		memcpy(workers_gpuid,
			
 
				+                       explicit_workers_gpuid,
			
 
				+                       STARPU_NMAXWORKERS*sizeof(unsigned));
			
 
				 	}
			
 
				-	else if ((strval = getenv("STARPU_WORKERS_CUDAID")))
			
 
				+	else if ((strval = getenv(varname)))
			
 
				 	{
			
 
				 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
			
 
				 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
			
@@ -86,7 +153,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
				 				val = strtol(strval, &endptr, 10);
			
 
				 				if (endptr != strval)
			
 
				 				{
			
 
				-					config->workers_gpuid[i] = (unsigned)val;
			
 
				+					workers_gpuid[i] = (unsigned)val;
			
 
				 					strval = endptr;
			
 
				 				}
			
 
				 				else {
			
@@ -97,11 +164,11 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
				 					/* there is no more values in the string */
			
 
				 					wrap = 1;
			
 
				 
			
 
				-					config->workers_gpuid[i] = config->workers_gpuid[0];
			
 
				+					workers_gpuid[i] = workers_gpuid[0];
			
 
				 				}
			
 
				 			}
			
 
				 			else {
			
 
				-				config->workers_gpuid[i] = config->workers_gpuid[i % number_of_entries];
			
 
				+				workers_gpuid[i] = workers_gpuid[i % number_of_entries];
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -109,7 +176,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
				 	{
			
 
				 		/* by default, we take a round robin policy */
			
 
				 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				-			config->workers_gpuid[i] = (unsigned)i;
			
 
				+			workers_gpuid[i] = (unsigned)(i % nhwgpus);
			
 
				 
			
 
				 		/* StarPU can use sampling techniques to bind threads correctly */
			
 
				 		may_bind_automatically = 1;
			
@@ -117,11 +184,18 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static inline int _starpu_get_next_gpuid(struct starpu_machine_config_s *config)
			
 
				+static inline int _starpu_get_next_cuda_gpuid(struct starpu_machine_config_s *config)
			
 
				 {
			
 
				-	unsigned i = ((config->current_gpuid++) % config->ncudagpus);
			
 
				+	unsigned i = ((config->current_cuda_gpuid++) % config->ncudagpus);
			
 
				 
			
 
				-	return (int)config->workers_gpuid[i];
			
 
				+	return (int)config->workers_cuda_gpuid[i];
			
 
				+}
			
 
				+
			
 
				+static inline int _starpu_get_next_opencl_gpuid(struct starpu_machine_config_s *config)
			
 
				+{
			
 
				+	unsigned i = ((config->current_opencl_gpuid++) % config->nopenclgpus);
			
 
				+
			
 
				+	return (int)config->workers_opencl_gpuid[i];
			
 
				 }
			
 
				 
			
 
				 static void _starpu_init_topology(struct starpu_machine_config_s *config)
			
@@ -152,7 +226,14 @@ static void _starpu_init_topology(struct starpu_machine_config_s *config)
 
				 #warning no way to know number of cores, assuming 1
			
 
				 		config->nhwcpus = 1;
			
 
				 #endif
			
 
				-	
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+                config->nhwcudagpus = _starpu_get_cuda_device_count();
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                config->nhwopenclgpus = _starpu_opencl_get_device_count();
			
 
				+#endif
			
 
				+
			
 
				 		topology_is_initialized = 1;
			
 
				 	}
			
 
				 }
			
@@ -208,22 +289,79 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 
				 	if (config->ncudagpus > 0)
			
 
				 		use_accelerator = 1;
			
 
				 
			
 
				-	_starpu_initialize_workers_gpuid(config);
			
 
				+	_starpu_initialize_workers_cuda_gpuid(config);
			
 
				 
			
 
				 	unsigned cudagpu;
			
 
				 	for (cudagpu = 0; cudagpu < config->ncudagpus; cudagpu++)
			
 
				 	{
			
 
				 		config->workers[config->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
			
 
				-		int devid = _starpu_get_next_gpuid(config);
			
 
				+		int devid = _starpu_get_next_cuda_gpuid(config);
			
 
				 		enum starpu_perf_archtype arch = STARPU_CUDA_DEFAULT + devid;
			
 
				 		config->workers[config->nworkers + cudagpu].devid = devid;
			
 
				 		config->workers[config->nworkers + cudagpu].perf_arch = arch; 
			
 
				 		config->workers[config->nworkers + cudagpu].worker_mask = STARPU_CUDA;
			
 
				 		config->worker_mask |= STARPU_CUDA;
			
 
				-	}
			
 
				+
			
 
				+                uint32_t key = _starpu_crc32_be(devid, 0);
			
 
				+                _starpu_htbl_insert_32(&devices_using_cuda, key, config);
			
 
				+        }
			
 
				 
			
 
				 	config->nworkers += config->ncudagpus;
			
 
				 #endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	if (user_conf && (user_conf->nopencl == 0))
			
 
				+	{
			
 
				+		/* the user explicitly disabled OpenCL */
			
 
				+		config->nopenclgpus = 0;
			
 
				+	}
			
 
				+	else {
			
 
				+		/* we need to initialize OpenCL early to count the number of devices */
			
 
				+		_starpu_opencl_init();
			
 
				+
			
 
				+		if (user_conf && (user_conf->nopencl != -1))
			
 
				+		{
			
 
				+			explicitval = user_conf->nopencl;
			
 
				+		}
			
 
				+		else {
			
 
				+			explicitval = starpu_get_env_number("STARPU_NOPENCL");
			
 
				+		}
			
 
				+
			
 
				+		if (explicitval < 0) {
			
 
				+			config->nopenclgpus =
			
 
				+				STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
			
 
				+		} else {
			
 
				+			/* use the specified value */
			
 
				+			config->nopenclgpus = (unsigned)explicitval;
			
 
				+			STARPU_ASSERT(config->nopenclgpus <= STARPU_MAXOPENCLDEVS);
			
 
				+		}
			
 
				+		STARPU_ASSERT(config->nopenclgpus + config->nworkers <= STARPU_NMAXWORKERS);
			
 
				+	}
			
 
				+
			
 
				+	if (config->nopenclgpus > 0)
			
 
				+		use_accelerator = 1;
			
 
				+	// TODO: use_accelerator for the OpenCL devices?
			
 
				+
			
 
				+	_starpu_initialize_workers_opencl_gpuid(config);
			
 
				+
			
 
				+	unsigned openclgpu;
			
 
				+	for (openclgpu = 0; openclgpu < config->nopenclgpus; openclgpu++)
			
 
				+	{
			
 
				+		int devid = _starpu_get_next_opencl_gpuid(config);
			
 
				+                if (devid == -1) { // There is no more devices left
			
 
				+                  config->nopenclgpus = openclgpu;
			
 
				+                  break;
			
 
				+                }
			
 
				+		config->workers[config->nworkers + openclgpu].arch = STARPU_OPENCL_WORKER;
			
 
				+		enum starpu_perf_archtype arch = STARPU_OPENCL_DEFAULT + devid;
			
 
				+		config->workers[config->nworkers + openclgpu].devid = devid;
			
 
				+		config->workers[config->nworkers + openclgpu].perf_arch = arch; 
			
 
				+		config->workers[config->nworkers + openclgpu].worker_mask = STARPU_OPENCL;
			
 
				+		config->worker_mask |= STARPU_OPENCL;
			
 
				+	}
			
 
				+
			
 
				+	config->nworkers += config->nopenclgpus;
			
 
				+#endif
			
 
				 	
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 	if (user_conf && (user_conf->ncuda != -1)) {
			
@@ -500,13 +638,27 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
				 				if (may_bind_automatically)
			
 
				 				{
			
 
				 					/* StarPU is allowed to bind threads automatically */
			
 
				-					preferred_binding = _starpu_get_gpu_affinity_vector(workerarg->devid);
			
 
				+					preferred_binding = _starpu_get_cuda_affinity_vector(workerarg->devid);
			
 
				 					npreferred = config->nhwcpus;
			
 
				 				}
			
 
				 				is_a_set_of_accelerators = 0;
			
 
				 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
			
 
				 				break;
			
 
				 #endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		        case STARPU_OPENCL_WORKER:
			
 
				+				if (may_bind_automatically)
			
 
				+				{
			
 
				+					/* StarPU is allowed to bind threads automatically */
			
 
				+					preferred_binding = _starpu_get_opencl_affinity_vector(workerarg->devid);
			
 
				+					npreferred = config->nhwcpus;
			
 
				+				}
			
 
				+				is_a_set_of_accelerators = 0;
			
 
				+				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
			
 
				+				break;
			
 
				+#endif
			
 
				+
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
 
				 		}
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -59,6 +59,11 @@ inline uint32_t _starpu_may_submit_cpu_task(void)
 
				 	return (STARPU_CPU & config.worker_mask);
			
 
				 }
			
 
				 
			
 
				+inline uint32_t _starpu_may_submit_opencl_task(void)
			
 
				+{
			
 
				+	return (STARPU_OPENCL & config.worker_mask);
			
 
				+}
			
 
				+
			
 
				 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where)
			
 
				 {
			
 
				 	return (where & config.workers[workerid].worker_mask);
			
@@ -90,6 +95,9 @@ static void _starpu_init_worker_queue(struct starpu_worker_s *workerarg)
 
				 		case STARPU_CUDA_WORKER:
			
 
				 			jobq->alpha = STARPU_CUDA_ALPHA;
			
 
				 			break;
			
 
				+		case STARPU_OPENCL_WORKER:
			
 
				+			jobq->alpha = STARPU_OPENCL_ALPHA;
			
 
				+			break;
			
 
				 		case STARPU_GORDON_WORKER:
			
 
				 			jobq->alpha = STARPU_GORDON_ALPHA;
			
 
				 			break;
			
@@ -151,6 +159,15 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 
				 
			
 
				 				break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+			case STARPU_OPENCL_WORKER:
			
 
				+				workerarg->set = NULL;
			
 
				+				workerarg->worker_is_initialized = 0;
			
 
				+				pthread_create(&workerarg->worker_thread, 
			
 
				+						NULL, _starpu_opencl_worker, workerarg);
			
 
				+
			
 
				+				break;
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 			case STARPU_GORDON_WORKER:
			
 
				 				/* we will only launch gordon once, but it will handle 
			
@@ -192,6 +209,7 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 
				 		switch (workerarg->arch) {
			
 
				 			case STARPU_CPU_WORKER:
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				+			case STARPU_OPENCL_WORKER:			  
			
 
				 				PTHREAD_MUTEX_LOCK(&workerarg->mutex);
			
 
				 				while (!workerarg->worker_is_initialized)
			
 
				 					PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
			
@@ -523,6 +541,11 @@ unsigned starpu_get_cuda_worker_count(void)
 
				 	return config.ncudagpus;
			
 
				 }
			
 
				 
			
 
				+unsigned starpu_get_opencl_worker_count(void)
			
 
				+{
			
 
				+	return config.nopenclgpus;
			
 
				+}
			
 
				+
			
 
				 unsigned starpu_get_spu_worker_count(void)
			
 
				 {
			
 
				 	return config.ngordon_spus;
			
@@ -549,6 +572,11 @@ int starpu_get_worker_id(void)
 
				 	}
			
 
				 }
			
 
				 
			
 
				+int starpu_get_worker_devid(int id)
			
 
				+{
			
 
				+	return config.workers[id].devid;
			
 
				+}
			
 
				+
			
 
				 struct starpu_worker_s *_starpu_get_worker_struct(unsigned id)
			
 
				 {
			
 
				 	return &config.workers[id];
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -42,6 +42,10 @@
 
				 #include <drivers/cuda/driver_cuda.h>
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #include <drivers/gordon/driver_gordon.h>
			
 
				 #endif
			
@@ -52,6 +56,7 @@
 
				 
			
 
				 #define STARPU_CPU_ALPHA	1.0f
			
 
				 #define STARPU_CUDA_ALPHA	13.33f
			
 
				+#define STARPU_OPENCL_ALPHA	12.22f
			
 
				 #define STARPU_GORDON_ALPHA	6.0f /* XXX this is a random value ... */
			
 
				 
			
 
				 #ifdef STARPU_DATA_STATS
			
@@ -105,18 +110,25 @@ struct starpu_machine_config_s {
 
				 #endif
			
 
				 
			
 
				 	unsigned nhwcpus;
			
 
				+        unsigned nhwcudagpus;
			
 
				+        unsigned nhwopenclgpus;
			
 
				 
			
 
				 	unsigned ncpus;
			
 
				 	unsigned ncudagpus;
			
 
				+	unsigned nopenclgpus;
			
 
				 	unsigned ngordon_spus;
			
 
				 
			
 
				 	/* Where to bind workers ? */
			
 
				 	int current_bindid;
			
 
				 	unsigned workers_bindid[STARPU_NMAXWORKERS];
			
 
				 	
			
 
				-	/* Which GPU(s) do we use ? */
			
 
				-	int current_gpuid;
			
 
				-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
			
 
				+	/* Which GPU(s) do we use for CUDA ? */
			
 
				+	int current_cuda_gpuid;
			
 
				+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+	/* Which GPU(s) do we use for OpenCL ? */
			
 
				+	int current_opencl_gpuid;
			
 
				+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
			
 
				 	
			
 
				 	struct starpu_worker_s workers[STARPU_NMAXWORKERS];
			
 
				 	uint32_t worker_mask;
			
@@ -138,6 +150,7 @@ unsigned _starpu_machine_is_running(void);
 
				 inline uint32_t _starpu_worker_exists(uint32_t task_mask);
			
 
				 inline uint32_t _starpu_may_submit_cuda_task(void);
			
 
				 inline uint32_t _starpu_may_submit_cpu_task(void);
			
 
				+inline uint32_t _starpu_may_submit_opencl_task(void);
			
 
				 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where);
			
 
				 unsigned _starpu_worker_can_block(unsigned memnode);
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -25,8 +25,8 @@ uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_n
 
				 	/* in case one of the node is a GPU, it needs to perform the transfer,
			
 
				 	 * if both of them are GPU, it's a bit more complicated (TODO !) */
			
 
				 
			
 
				-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
			
 
				-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM);
			
 
				+	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
			
 
				+	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
			
 
				 
			
 
				 	/* we do not handle GPU->GPU transfers yet ! */
			
 
				 	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
			
@@ -77,6 +77,8 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 
				 			 * 	other should be ok */
			
 
				 			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
			
 
				 				break;
			
 
				+			if (_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
			
 
				+				break;
			
 
				 
			
 
				 			/* XXX do a better algorithm to distribute the memory copies */
			
 
				 			/* TODO : use the "requesting_node" as an argument to do so */
			
@@ -181,8 +183,8 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, uint32_t requesting_no
 
				 			STARPU_ASSERT(src_node != requesting_node);
			
 
				 		}
			
 
				 	
			
 
				-		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
			
 
				-		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM);
			
 
				+		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
			
 
				+		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(requesting_node) == STARPU_OPENCL_RAM);
			
 
				 
			
 
				 		/* we have to perform 2 successive requests for GPU->GPU transfers */
			
 
				 		if (read && (src_is_a_gpu && dst_is_a_gpu)) {
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -135,6 +135,28 @@ cudaStream_t *stream;
 
				 				}
			
 
				 				break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+    		        case STARPU_OPENCL_RAM:
			
 
				+				/* OpenCL -> RAM */
			
 
				+				if (_starpu_get_local_memory_node() == src_node)
			
 
				+				{
			
 
				+					STARPU_ASSERT(copy_methods->opencl_to_ram);
			
 
				+					if (!req || !copy_methods->opencl_to_ram_async)
			
 
				+					{
			
 
				+						/* this is not associated to a request so it's synchronous */
			
 
				+                                                copy_methods->opencl_to_ram(handle, src_node, dst_node);
			
 
				+                                        }
			
 
				+                                        else {
			
 
				+                                                ret = copy_methods->opencl_to_ram_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
			
 
				+                                        }
			
 
				+				}
			
 
				+				else
			
 
				+				{
			
 
				+					/* we should not have a blocking call ! */
			
 
				+					STARPU_ABORT();
			
 
				+				}
			
 
				+				break;
			
 
				+#endif
			
 
				 			case STARPU_SPU_LS:
			
 
				 				STARPU_ABORT(); // TODO
			
 
				 				break;
			
@@ -180,6 +202,34 @@ cudaStream_t *stream;
 
				 		}
			
 
				 		break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	case STARPU_OPENCL_RAM:
			
 
				+		switch (src_kind) {
			
 
				+		        case STARPU_RAM:
			
 
				+				/* STARPU_RAM -> STARPU_OPENCL_RAM */
			
 
				+				STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
			
 
				+				STARPU_ASSERT(copy_methods->ram_to_opencl);
			
 
				+				if (!req || !copy_methods->ram_to_opencl_async)
			
 
				+				{
			
 
				+					/* this is not associated to a request so it's synchronous */
			
 
				+					copy_methods->ram_to_opencl(handle, src_node, dst_node);
			
 
				+				}
			
 
				+				else {
			
 
				+                                        ret = copy_methods->ram_to_opencl_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
			
 
				+				}
			
 
				+				break;
			
 
				+			case STARPU_CUDA_RAM:
			
 
				+			case STARPU_OPENCL_RAM:
			
 
				+			case STARPU_SPU_LS:
			
 
				+				STARPU_ABORT(); // TODO 
			
 
				+				break;
			
 
				+			case STARPU_UNUSED:
			
 
				+			default:
			
 
				+				STARPU_ABORT();
			
 
				+				break;
			
 
				+		}
			
 
				+		break;
			
 
				+#endif
			
 
				 	case STARPU_SPU_LS:
			
 
				 		STARPU_ABORT(); // TODO
			
 
				 		break;
			
@@ -275,6 +325,12 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                case STARPU_OPENCL_RAM:
			
 
				+			if (clWaitForEvents(1, &(*async_channel).opencl_event) != CL_SUCCESS) /* block until the transfer's event completes */
			
 
				+				STARPU_ABORT();
			
 
				+                        break;
			
 
				+#endif
			
 
				 		case STARPU_RAM:
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
@@ -301,6 +357,17 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                case STARPU_OPENCL_RAM:
			
 
				+                        {
			
 
				+                                cl_int event_status;
			
 
				+                                cl_event opencl_event = (*async_channel).opencl_event;
			
 
				+                                if (opencl_event == NULL) STARPU_ABORT();
			
 
				+                                clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
			
 
				+                                success = (event_status == CL_COMPLETE);
			
 
				+                                break;
			
 
				+                        }
			
 
				+#endif
			
 
				 		case STARPU_RAM:
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -28,6 +28,10 @@
 
				 #include <cublas.h>
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <CL/cl.h>
			
 
				+#endif
			
 
				+
			
 
				 struct starpu_data_request_s;
			
 
				 
			
 
				 /* this is a structure that can be queried to see whether an asynchronous
			
@@ -37,24 +41,36 @@ typedef union {
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaEvent_t cuda_event;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        cl_event opencl_event;
			
 
				+#endif
			
 
				 } starpu_async_channel;
			
 
				 
			
 
				 struct starpu_copy_data_methods_s {
			
 
				 	/* src type is ram */
			
 
				 	int (*ram_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 	int (*ram_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+	int (*ram_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 	int (*ram_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 
			
 
				 	/* src type is cuda */
			
 
				 	int (*cuda_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 	int (*cuda_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+	int (*cuda_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 	int (*cuda_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 
			
 
				 	/* src type is spu */
			
 
				 	int (*spu_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 	int (*spu_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+	int (*spu_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 	int (*spu_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				 
			
 
				+	/* src type is opencl */
			
 
				+	int (*opencl_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+	int (*opencl_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+	int (*opencl_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+	int (*opencl_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	/* for asynchronous CUDA transfers */
			
 
				 	int (*ram_to_cuda_async)(starpu_data_handle handle, uint32_t src,
			
@@ -64,6 +80,13 @@ struct starpu_copy_data_methods_s {
 
				 	int (*cuda_to_cuda_async)(starpu_data_handle handle, uint32_t src,
			
 
				 					uint32_t dst, cudaStream_t *stream);
			
 
				 #endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	/* for asynchronous OpenCL transfers */
			
 
				+        int (*ram_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
			
 
				+	int (*opencl_to_ram_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
			
 
				+	int (*opencl_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
			
 
				+#endif
			
 
				 };
			
 
				 
			
 
				 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -22,6 +22,11 @@
 
				 #include <datawizard/filters.h>
			
 
				 #include <common/hash.h>
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  * BCSR : blocked CSR, we use blocks of size (r x c)
			
 
				  */
			
@@ -31,6 +36,10 @@ static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, u
 
				 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				 
			
 
				 static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
			
 
				 	.ram_to_ram = dummy_copy_ram_to_ram,
			
@@ -39,6 +48,10 @@ static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 
				 	.ram_to_cuda = copy_ram_to_cuda,
			
 
				 	.cuda_to_ram = copy_cuda_to_ram,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.ram_to_opencl = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram = copy_opencl_to_ram,
			
 
				+#endif
			
 
				 	.cuda_to_cuda = NULL,
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram = NULL,
			
@@ -274,6 +287,27 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+                        {
			
 
				+                                int ret;
			
 
				+                                void *ptr;
			
 
				+
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
			
 
				+                                addr_nzval = (uintptr_t)ptr;
			
 
				+                                if (ret) goto fail_nzval;
			
 
				+
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
			
 
				+                                addr_colind = ptr;
			
 
				+				if (ret) goto fail_colind;
			
 
				+
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
			
 
				+                                addr_rowptr = ptr;
			
 
				+				if (ret) goto fail_rowptr;
			
 
				+
			
 
				+                                break;
			
 
				+                        }
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -298,6 +332,11 @@ fail_rowptr:
 
				 			cudaFree((void*)addr_colind);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			clReleaseMemObject((void*)addr_colind);
			
 
				+			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -311,6 +350,11 @@ fail_colind:
 
				 			cudaFree((void*)addr_nzval);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			clReleaseMemObject((void*)addr_nzval);
			
 
				+			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -341,6 +385,13 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 
				 			cudaFree((void*)bcsr_interface->rowptr);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			clReleaseMemObject((void*)bcsr_interface->nzval);
			
 
				+			clReleaseMemObject((void*)bcsr_interface->colind);
			
 
				+			clReleaseMemObject((void*)bcsr_interface->rowptr);
			
 
				+			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -420,6 +471,76 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
				 }
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				+{
			
 
				+	starpu_bcsr_interface_t *src_bcsr;
			
 
				+	starpu_bcsr_interface_t *dst_bcsr;
			
 
				+
			
 
				+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	uint32_t nnz = src_bcsr->nnz;
			
 
				+	uint32_t nrow = src_bcsr->nrow;
			
 
				+	size_t elemsize = src_bcsr->elemsize;
			
 
				+
			
 
				+	uint32_t r = src_bcsr->r;
			
 
				+	uint32_t c = src_bcsr->c;
			
 
				+
			
 
				+        int err;
			
 
				+
			
 
				+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				+{
			
 
				+	starpu_bcsr_interface_t *src_bcsr;
			
 
				+	starpu_bcsr_interface_t *dst_bcsr;
			
 
				+
			
 
				+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	uint32_t nnz = src_bcsr->nnz;
			
 
				+	uint32_t nrow = src_bcsr->nrow;
			
 
				+	size_t elemsize = src_bcsr->elemsize;
			
 
				+
			
 
				+	uint32_t r = src_bcsr->r;
			
 
				+	uint32_t c = src_bcsr->c;
			
 
				+
			
 
				+        int err;
			
 
				+
			
 
				+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+#endif // STARPU_USE_OPENCL
			
 
				+
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -22,6 +22,11 @@
 
				 
			
 
				 #include <common/hash.h>
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
@@ -29,6 +34,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
				 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+#endif
			
 
				 
			
 
				 static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
			
 
				 	.ram_to_ram = dummy_copy_ram_to_ram,
			
@@ -39,6 +50,12 @@ static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.ram_to_opencl = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram = copy_opencl_to_ram,
			
 
				+        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				+	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				+#endif
			
 
				 	.cuda_to_cuda = NULL,
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram = NULL,
			
@@ -94,11 +111,15 @@ static void register_block_handle(starpu_data_handle handle, uint32_t home_node,
 
				 
			
 
				 		if (node == home_node) {
			
 
				 			local_interface->ptr = block_interface->ptr;
			
 
				+                        local_interface->dev_handle = block_interface->dev_handle;
			
 
				+                        local_interface->offset = block_interface->offset;
			
 
				 			local_interface->ldy  = block_interface->ldy;
			
 
				 			local_interface->ldz  = block_interface->ldz;
			
 
				 		}
			
 
				 		else {
			
 
				 			local_interface->ptr = 0;
			
 
				+                        local_interface->dev_handle = 0;
			
 
				+                        local_interface->offset = 0;
			
 
				 			local_interface->ldy  = 0;
			
 
				 			local_interface->ldz  = 0;
			
 
				 		}
			
@@ -117,6 +138,8 @@ void starpu_register_block_data(starpu_data_handle *handleptr, uint32_t home_nod
 
				 {
			
 
				 	starpu_block_interface_t interface = {
			
 
				 		.ptr = ptr,
			
 
				+                .dev_handle = ptr,
			
 
				+                .offset = 0,
			
 
				 		.ldy = ldy,
			
 
				 		.ldz = ldz,
			
 
				 		.nx = nx,
			
@@ -278,6 +301,19 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	        case STARPU_OPENCL_RAM:
			
 
				+			{
			
 
				+                                int ret;
			
 
				+                                void *ptr;
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
			
 
				+                                addr = (uintptr_t)ptr;
			
 
				+				if (ret) {
			
 
				+					fail = 1;
			
 
				+				}
			
 
				+				break;
			
 
				+			}
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -288,6 +324,8 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
				 
			
 
				 		/* update the data properly in consequence */
			
 
				 		dst_block->ptr = addr;
			
 
				+                dst_block->dev_handle = addr;
			
 
				+                dst_block->offset = 0;
			
 
				 		dst_block->ldy = nx;
			
 
				 		dst_block->ldz = nx*ny;
			
 
				 	} else {
			
@@ -319,6 +357,11 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                case STARPU_OPENCL_RAM:
			
 
				+                        clReleaseMemObject((void *)block_interface->ptr);
			
 
				+                        break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -660,6 +703,85 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
				 }
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
			
 
				+	starpu_block_interface_t *src_block;
			
 
				+	starpu_block_interface_t *dst_block;
			
 
				+
			
 
				+	src_block = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
			
 
				+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				+                                                dst_block->offset, event);
			
 
				+
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				+
			
 
				+	return EAGAIN;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
			
 
				+	starpu_block_interface_t *src_block;
			
 
				+	starpu_block_interface_t *dst_block;
			
 
				+
			
 
				+	src_block = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
			
 
				+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				+                                                  src_block->offset, event);
			
 
				+
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				+
			
 
				+	return EAGAIN;
			
 
				+}
			
 
				+
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
			
 
				+	starpu_block_interface_t *src_block;
			
 
				+	starpu_block_interface_t *dst_block;
			
 
				+
			
 
				+	src_block = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
			
 
				+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				+                                                dst_block->offset, NULL);
			
 
				+
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
			
 
				+	starpu_block_interface_t *src_block;
			
 
				+	starpu_block_interface_t *dst_block;
			
 
				+
			
 
				+	src_block = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
			
 
				+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				+                                                  src_block->offset, NULL);
			
 
				+
			
 
				+        if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -22,12 +22,20 @@
 
				 
			
 
				 #include <common/hash.h>
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				 
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				 
			
 
				 static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
			
 
				 	.ram_to_ram = dummy_copy_ram_to_ram,
			
@@ -36,6 +44,10 @@ static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 
				 	.ram_to_cuda = copy_ram_to_cuda,
			
 
				 	.cuda_to_ram = copy_cuda_to_ram,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.ram_to_opencl = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram = copy_opencl_to_ram,
			
 
				+#endif
			
 
				 	.cuda_to_cuda = NULL,
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram = NULL,
			
@@ -246,6 +258,27 @@ static size_t allocate_csr_buffer_on_node(starpu_data_handle handle, uint32_t ds
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	        case STARPU_OPENCL_RAM:
			
 
				+			{
			
 
				+                                int ret;
			
 
				+                                void *ptr;
			
 
				+
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
			
 
				+                                addr_nzval = (uintptr_t)ptr;
			
 
				+				if (ret) goto fail_nzval;
			
 
				+
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
			
 
				+                                addr_colind = ptr;
			
 
				+				if (ret) goto fail_colind;
			
 
				+
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
			
 
				+                                addr_rowptr = ptr;
			
 
				+				if (ret) goto fail_rowptr;
			
 
				+
			
 
				+				break;
			
 
				+			}
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -270,6 +303,11 @@ fail_rowptr:
 
				 			cudaFree((void*)addr_colind);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			clReleaseMemObject((void*)addr_colind);
			
 
				+			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -283,6 +321,11 @@ fail_colind:
 
				 			cudaFree((void*)addr_nzval);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			clReleaseMemObject((void*)addr_nzval);
			
 
				+			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -313,6 +356,13 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 
				 			cudaFree((void*)csr_interface->rowptr);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_RAM:
			
 
				+			clReleaseMemObject((void*)csr_interface->nzval);
			
 
				+			clReleaseMemObject((void*)csr_interface->colind);
			
 
				+			clReleaseMemObject((void*)csr_interface->rowptr);
			
 
				+			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -386,6 +436,70 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
				 }
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+/*
				+ * Copy a CSR matrix from its OpenCL replica on `src_node` to the host
				+ * replica on `dst_node`. The three arrays (nzval, colind, rowptr) are
				+ * transferred one after the other, each from offset 0 of its buffer and
				+ * with a NULL event.
				+ *
				+ * Always returns 0; a non-zero status from any copy is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
				+{
				+	starpu_csr_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_csr_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/* Dimensions are taken from the source replica. */
				+	uint32_t nnz = from->nnz;
				+	uint32_t nrow = from->nrow;
				+	size_t elemsize = from->elemsize;
				+
				+	int status;
				+
				+	/* Non-zero values: nnz elements of elemsize bytes. */
				+	status = _starpu_opencl_copy_from_opencl((cl_mem)from->nzval, (void *)to->nzval, nnz*elemsize, 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	/* Column indices: one uint32_t per non-zero value. */
				+	status = _starpu_opencl_copy_from_opencl((cl_mem)from->colind, (void *)to->colind, nnz*sizeof(uint32_t), 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	/* Row pointers: nrow+1 uint32_t entries. */
				+	status = _starpu_opencl_copy_from_opencl((cl_mem)from->rowptr, (void *)to->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	/* The trace records the combined size of the three arrays. */
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Copy a CSR matrix from the host replica on `src_node` to its OpenCL
				+ * replica on `dst_node`. The three arrays (nzval, colind, rowptr) are
				+ * transferred one after the other, each to offset 0 of its buffer and
				+ * with a NULL event.
				+ *
				+ * Always returns 0; a non-zero status from any copy is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
				+{
				+	starpu_csr_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_csr_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/* Dimensions are taken from the source replica. */
				+	uint32_t nnz = from->nnz;
				+	uint32_t nrow = from->nrow;
				+	size_t elemsize = from->elemsize;
				+
				+	int status;
				+
				+	/* Non-zero values: nnz elements of elemsize bytes. */
				+	status = _starpu_opencl_copy_to_opencl((void *)from->nzval, (cl_mem)to->nzval, nnz*elemsize, 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	/* Column indices: one uint32_t per non-zero value. */
				+	status = _starpu_opencl_copy_to_opencl((void *)from->colind, (cl_mem)to->colind, nnz*sizeof(uint32_t), 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	/* Row pointers: nrow+1 uint32_t entries. */
				+	status = _starpu_opencl_copy_to_opencl((void *)from->rowptr, (cl_mem)to->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	/* The trace records the combined size of the three arrays. */
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
				+
				+	return 0;
				+}
			
 
				+#endif // STARPU_USE_OPENCL
			
 
				+
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
--- a/src/datawizard/interfaces/matrix_filters.c
+++ b/src/datawizard/interfaces/matrix_filters.c
@@ -69,6 +69,8 @@ void starpu_block_filter_func(starpu_filter *f, starpu_data_handle root_handle)
 
				 
			
 
				 				local->ptr = local_root->ptr + offset;
			
 
				 				local->ld = local_root->ld;
			
 
				+                                local->dev_handle = local_root->dev_handle;
			
 
				+                                local->offset = local_root->offset + offset;
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -122,6 +124,8 @@ void starpu_vertical_block_filter_func(starpu_filter *f, starpu_data_handle root
 
				 					(size_t)chunk*chunk_size*local_root->ld*elemsize;
			
 
				 				local->ptr = local_root->ptr + offset;
			
 
				 				local->ld = local_root->ld;
			
 
				+                                local->dev_handle = local_root->dev_handle;
			
 
				+                                local->offset = local_root->offset + offset;
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -26,6 +26,10 @@
 
				 #include <cuda.h>
			
 
				 #include <cuda_runtime.h>
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				 
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
				 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+#endif
			
 
				 
			
 
				 static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
			
 
				 	.ram_to_ram = dummy_copy_ram_to_ram,
			
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.ram_to_opencl = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram = copy_opencl_to_ram,
			
 
				+        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				+	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				+#endif
			
 
				 	.cuda_to_cuda = NULL,
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram = NULL,
			
@@ -106,10 +122,14 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 
				 
			
 
				 		if (node == home_node) {
			
 
				 			local_interface->ptr = matrix_interface->ptr;
			
 
				+                        local_interface->dev_handle = matrix_interface->dev_handle;
			
 
				+                        local_interface->offset = matrix_interface->offset;
			
 
				 			local_interface->ld  = matrix_interface->ld;
			
 
				 		}
			
 
				 		else {
			
 
				 			local_interface->ptr = 0;
			
 
				+			local_interface->dev_handle = 0;
			
 
				+			local_interface->offset = 0;
			
 
				 			local_interface->ld  = 0;
			
 
				 		}
			
 
				 
			
@@ -129,7 +149,9 @@ void starpu_register_matrix_data(starpu_data_handle *handleptr, uint32_t home_no
 
				 		.ld = ld,
			
 
				 		.nx = nx,
			
 
				 		.ny = ny,
			
 
				-		.elemsize = elemsize
			
 
				+		.elemsize = elemsize,
			
 
				+                .dev_handle = ptr,
			
 
				+                .offset = 0
			
 
				 	};
			
 
				 
			
 
				 	_starpu_register_data_handle(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
			
@@ -256,6 +278,19 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	        case STARPU_OPENCL_RAM:
			
 
				+			{
			
 
				+                                int ret;
			
 
				+                                void *ptr;
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
			
 
				+                                addr = (uintptr_t)ptr;
			
 
				+				if (ret) {
			
 
				+					fail = 1;
			
 
				+				}
			
 
				+				break;
			
 
				+			}
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -266,6 +301,8 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
				 
			
 
				 		/* update the data properly in consequence */
			
 
				 		interface->ptr = addr;
			
 
				+                interface->dev_handle = addr;
			
 
				+                interface->offset = 0;
			
 
				 		interface->ld = ld;
			
 
				 	} else {
			
 
				 		/* allocation failed */
			
@@ -296,6 +333,11 @@ static void liberate_matrix_buffer_on_node(void *interface, uint32_t node)
 
				 
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                case STARPU_OPENCL_RAM:
			
 
				+                        clReleaseMemObject((void *)matrix_interface->ptr);
			
 
				+                        break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -423,6 +465,83 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
				 
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+/*
				+ * Start a host-to-OpenCL copy of the matrix held by `handle`, forwarding
				+ * `event` to the copy helper.
				+ *
				+ * Returns EAGAIN unconditionally (presumably telling the caller that the
				+ * transfer has to be waited for through `event` -- confirm against the
				+ * caller). A non-zero status from the helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event)
				+{
				+	starpu_matrix_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_matrix_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/*
				+	 * A single flat range of nx*ny elements is transferred; ld is not
				+	 * consulted. NOTE(review): this presumably requires ld == nx on both
				+	 * sides -- confirm.
				+	 */
				+	const size_t nbytes = from->nx*from->ny*from->elemsize;
				+
				+	/* Write into the destination cl_mem starting at its offset. */
				+	int status = _starpu_opencl_copy_to_opencl((void*)from->ptr, (cl_mem)to->dev_handle,
				+						   nbytes, to->offset, event);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nbytes);
				+
				+	return EAGAIN;
				+}
			
 
				+
			
 
				+/*
				+ * Start an OpenCL-to-host copy of the matrix held by `handle`, forwarding
				+ * `event` to the copy helper.
				+ *
				+ * Returns EAGAIN unconditionally (presumably telling the caller that the
				+ * transfer has to be waited for through `event` -- confirm against the
				+ * caller). A non-zero status from the helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event)
				+{
				+	starpu_matrix_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_matrix_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/*
				+	 * A single flat range of nx*ny elements is transferred; ld is not
				+	 * consulted. NOTE(review): this presumably requires ld == nx on both
				+	 * sides -- confirm.
				+	 */
				+	const size_t nbytes = from->nx*from->ny*from->elemsize;
				+
				+	/* Read from the source cl_mem starting at its offset. */
				+	int status = _starpu_opencl_copy_from_opencl((cl_mem)from->dev_handle, (void*)to->ptr,
				+						     nbytes, from->offset, event);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nbytes);
				+
				+	return EAGAIN;
				+}
			
 
				+
			
 
				+/*
				+ * Copy the matrix held by `handle` from the host replica on `src_node` to
				+ * its OpenCL replica on `dst_node`, passing NULL as the event.
				+ *
				+ * Always returns 0; a non-zero status from the copy helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
				+{
				+	starpu_matrix_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_matrix_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/*
				+	 * A single flat range of nx*ny elements is transferred; ld is not
				+	 * consulted. NOTE(review): this presumably requires ld == nx on both
				+	 * sides -- confirm.
				+	 */
				+	const size_t nbytes = from->nx*from->ny*from->elemsize;
				+
				+	/* Write into the destination cl_mem starting at its offset. */
				+	int status = _starpu_opencl_copy_to_opencl((void*)from->ptr, (cl_mem)to->dev_handle,
				+						   nbytes, to->offset, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nbytes);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Copy the matrix held by `handle` from its OpenCL replica on `src_node`
				+ * to the host replica on `dst_node`, passing NULL as the event.
				+ *
				+ * Always returns 0; a non-zero status from the copy helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
				+{
				+	starpu_matrix_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_matrix_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/*
				+	 * A single flat range of nx*ny elements is transferred; ld is not
				+	 * consulted. NOTE(review): this presumably requires ld == nx on both
				+	 * sides -- confirm.
				+	 */
				+	const size_t nbytes = from->nx*from->ny*from->elemsize;
				+
				+	/* Read from the source cl_mem starting at its offset. */
				+	int status = _starpu_opencl_copy_from_opencl((cl_mem)from->dev_handle, (void*)to->ptr,
				+						     nbytes, from->offset, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nbytes);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -26,6 +26,10 @@
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				 
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
				 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+#endif
			
 
				 
			
 
				 static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
			
 
				 	.ram_to_ram = dummy_copy_ram_to_ram,
			
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.ram_to_opencl = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram = copy_opencl_to_ram,
			
 
				+        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				+	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				+#endif
			
 
				 	.cuda_to_cuda = NULL,
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram = NULL,
			
@@ -192,6 +208,19 @@ static size_t allocate_variable_buffer_on_node(starpu_data_handle handle, uint32
 
				 			}
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	        case STARPU_OPENCL_RAM:
			
 
				+			{
			
 
				+                                int ret;
			
 
				+                                void *ptr;
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
			
 
				+                                addr = (uintptr_t)ptr;
			
 
				+				if (ret) {
			
 
				+					fail = 1;
			
 
				+				}
			
 
				+				break;
			
 
				+			}
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -220,6 +249,11 @@ static void liberate_variable_buffer_on_node(void *interface, uint32_t node)
 
				 			cudaFree((void*)STARPU_GET_VARIABLE_PTR(interface));
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                case STARPU_OPENCL_RAM:
			
 
				+                        clReleaseMemObject((void*)STARPU_GET_VARIABLE_PTR(interface));
			
 
				+                        break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -324,6 +358,81 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
				 
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+/*
				+ * Start a host-to-OpenCL copy of the variable held by `handle`,
				+ * forwarding `event` to the copy helper.
				+ *
				+ * Returns EAGAIN unconditionally (presumably telling the caller that the
				+ * transfer has to be waited for through `event` -- confirm against the
				+ * caller). A non-zero status from the helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event)
				+{
				+	starpu_variable_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_variable_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/* The destination `ptr` field is used as the cl_mem, written from offset 0. */
				+	int status = _starpu_opencl_copy_to_opencl((void*)from->ptr, (cl_mem)to->ptr,
				+						   from->elemsize, 0, event);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, from->elemsize);
				+
				+	return EAGAIN;
				+}
			
 
				+
			
 
				+/*
				+ * Start an OpenCL-to-host copy of the variable held by `handle`,
				+ * forwarding `event` to the copy helper.
				+ *
				+ * Returns EAGAIN unconditionally (presumably telling the caller that the
				+ * transfer has to be waited for through `event` -- confirm against the
				+ * caller). A non-zero status from the helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event)
				+{
				+	starpu_variable_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_variable_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/* The source `ptr` field is used as the cl_mem, read from offset 0. */
				+	int status = _starpu_opencl_copy_from_opencl((cl_mem)from->ptr, (void*)to->ptr,
				+						     from->elemsize, 0, event);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, from->elemsize);
				+
				+	return EAGAIN;
				+}
			
 
				+
			
 
				+/*
				+ * Copy the variable held by `handle` from the host replica on `src_node`
				+ * to its OpenCL replica on `dst_node`, passing NULL as the event.
				+ *
				+ * Always returns 0; a non-zero status from the copy helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
				+{
				+	starpu_variable_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_variable_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/* The destination `ptr` field is used as the cl_mem, written from offset 0. */
				+	int status = _starpu_opencl_copy_to_opencl((void*)from->ptr, (cl_mem)to->ptr,
				+						   from->elemsize, 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, from->elemsize);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Copy the variable held by `handle` from its OpenCL replica on
				+ * `src_node` to the host replica on `dst_node`, passing NULL as the event.
				+ *
				+ * Always returns 0; a non-zero status from the copy helper is handed to
				+ * STARPU_OPENCL_REPORT_ERROR.
				+ */
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
				+{
				+	starpu_variable_interface_t *from = starpu_data_get_interface_on_node(handle, src_node);
				+	starpu_variable_interface_t *to = starpu_data_get_interface_on_node(handle, dst_node);
				+
				+	/* The source `ptr` field is used as the cl_mem, read from offset 0. */
				+	int status = _starpu_opencl_copy_from_opencl((cl_mem)from->ptr, (void*)to->ptr,
				+						     from->elemsize, 0, NULL);
				+	if (STARPU_UNLIKELY(status)) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, from->elemsize);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable;
			
--- a/src/datawizard/interfaces/vector_filters.c
+++ b/src/datawizard/interfaces/vector_filters.c
@@ -62,6 +62,8 @@ void starpu_block_filter_func_vector(starpu_filter *f, starpu_data_handle root_h
 
				 					starpu_data_get_interface_on_node(root_handle, node);
			
 
				 
			
 
				 				local->ptr = local_root->ptr + offset;
			
 
				+                                local->dev_handle = local_root->dev_handle;
			
 
				+                                local->offset = local_root->offset + offset;
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -100,6 +102,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 
				 				starpu_data_get_interface_on_node(root_handle, node);
			
 
				 
			
 
				 			local->ptr = local_root->ptr;
			
 
				+                        local->offset = local_root->offset;
			
 
				+                        local->dev_handle = local_root->dev_handle;
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -119,6 +123,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 
				 				starpu_data_get_interface_on_node(root_handle, node);
			
 
				 
			
 
				 			local->ptr = local_root->ptr + length_first*elemsize;
			
 
				+                        local->offset = local_root->offset + length_first*elemsize;
			
 
				+                        local->dev_handle = local_root->dev_handle;
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -161,6 +167,8 @@ void starpu_list_filter_func_vector(starpu_filter *f, starpu_data_handle root_ha
 
				 					starpu_data_get_interface_on_node(root_handle, node);
			
 
				 
			
 
				 				local->ptr = local_root->ptr + current_pos*elemsize;
			
 
				+                                local->offset = local_root->offset + current_pos*elemsize;
			
 
				+                                local->dev_handle = local_root->dev_handle;
			
 
				 			}
			
 
				 		}
			
 
				 
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -1,6 +1,6 @@
 
				 /*
			
 
				  * StarPU
			
 
				- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				  *
			
 
				  * This program is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,6 +25,10 @@
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				 
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -33,6 +37,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
				 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
			
 
				+#endif
			
 
				 
			
 
				 static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
			
 
				 	.ram_to_ram = dummy_copy_ram_to_ram,
			
@@ -43,6 +53,12 @@ static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.ram_to_opencl = copy_ram_to_opencl,
			
 
				+	.opencl_to_ram = copy_opencl_to_ram,
			
 
				+        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				+	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				+#endif
			
 
				 	.cuda_to_cuda = NULL,
			
 
				 	.cuda_to_spu = NULL,
			
 
				 	.spu_to_ram = NULL,
			
@@ -87,9 +103,13 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 
				 
			
 
				 		if (node == home_node) {
			
 
				 			local_interface->ptr = vector_interface->ptr;
			
 
				+                        local_interface->dev_handle = vector_interface->dev_handle;
			
 
				+                        local_interface->offset = vector_interface->offset;
			
 
				 		}
			
 
				 		else {
			
 
				 			local_interface->ptr = 0;
			
 
				+                        local_interface->dev_handle = 0;
			
 
				+                        local_interface->offset = 0;
			
 
				 		}
			
 
				 
			
 
				 		local_interface->nx = vector_interface->nx;
			
@@ -116,7 +136,9 @@ void starpu_register_vector_data(starpu_data_handle *handleptr, uint32_t home_no
 
				 	starpu_vector_interface_t vector = {
			
 
				 		.ptr = ptr,
			
 
				 		.nx = nx,
			
 
				-		.elemsize = elemsize
			
 
				+		.elemsize = elemsize,
			
 
				+                .dev_handle = ptr,
			
 
				+                .offset = 0
			
 
				 	};	
			
 
				 
			
 
				 	_starpu_register_data_handle(handleptr, home_node, &vector, &interface_vector_ops); 
			
@@ -216,6 +238,19 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
				 			}
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	        case STARPU_OPENCL_RAM:
			
 
				+			{
			
 
				+                                int ret;
			
 
				+                                void *ptr;
			
 
				+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
			
 
				+                                addr = (uintptr_t)ptr;
			
 
				+				if (ret) {
			
 
				+					fail = 1;
			
 
				+				}
			
 
				+				break;
			
 
				+			}
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -228,6 +263,8 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
				 
			
 
				 	/* update the data properly in consequence */
			
 
				 	interface->ptr = addr;
			
 
				+        interface->dev_handle = addr;
			
 
				+        interface->offset = 0;
			
 
				 	
			
 
				 	return allocated_memory;
			
 
				 }
			
@@ -246,6 +283,11 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 
				 			cudaFree((void*)vector_interface->ptr);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+                case STARPU_OPENCL_RAM:
			
 
				+                        clReleaseMemObject((void *)vector_interface->ptr);
			
 
				+                        break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			assert(0);
			
 
				 	}
			
@@ -349,6 +391,80 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
				 
			
 
				 
			
 
				 #endif // STARPU_USE_CUDA
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
			
 
				+	starpu_vector_interface_t *src_vector;
			
 
				+	starpu_vector_interface_t *dst_vector;
			
 
				+
			
 
				+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
			
 
				+                                                dst_vector->offset, event);
			
 
				+
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				+
			
 
				+	return EAGAIN;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
			
 
				+	starpu_vector_interface_t *src_vector;
			
 
				+	starpu_vector_interface_t *dst_vector;
			
 
				+
			
 
				+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
			
 
				+                                                  src_vector->offset, event);
			
 
				+
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				+
			
 
				+	return EAGAIN;
			
 
				+}
			
 
				+
			
 
				+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
			
 
				+	starpu_vector_interface_t *src_vector;
			
 
				+	starpu_vector_interface_t *dst_vector;
			
 
				+
			
 
				+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
			
 
				+                                                dst_vector->offset, NULL);
			
 
				+
			
 
				+	if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
			
 
				+	starpu_vector_interface_t *src_vector;
			
 
				+	starpu_vector_interface_t *dst_vector;
			
 
				+
			
 
				+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
			
 
				+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
			
 
				+
			
 
				+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
			
 
				+                                                  src_vector->offset, NULL);
			
 
				+
			
 
				+        if (STARPU_UNLIKELY(err))
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				 
			
 
				 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
--- a/src/datawizard/memory_nodes.h
+++ b/src/datawizard/memory_nodes.h
@@ -26,7 +26,8 @@ typedef enum {
 
				 	STARPU_UNUSED,
			
 
				 	STARPU_SPU_LS,
			
 
				 	STARPU_RAM,
			
 
				-	STARPU_CUDA_RAM
			
 
				+	STARPU_CUDA_RAM,
			
 
				+        STARPU_OPENCL_RAM,
			
 
				 } starpu_node_kind;
			
 
				 
			
 
				 typedef struct {
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -0,0 +1,422 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <math.h>
			
 
				+#include <common/config.h>
			
 
				+#include <common/utils.h>
			
 
				+#include <core/debug.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include "driver_opencl.h"
			
 
				+#include "driver_opencl_utils.h"
			
 
				+#include <common/utils.h>
			
 
				+
			
 
				+static cl_context contexts[STARPU_MAXOPENCLDEVS];
			
 
				+static cl_device_id devices[STARPU_MAXOPENCLDEVS];
			
 
				+static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
			
 
				+static cl_uint nb_devices = -1;
			
 
				+static int init_done = 0;
			
 
				+extern char *_starpu_opencl_codelet_dir;
			
 
				+
			
 
				+void starpu_opencl_get_context(int devid, cl_context *context)
			
 
				+{
			
 
				+        *context = contexts[devid];
			
 
				+}
			
 
				+
			
 
				+void starpu_opencl_get_device(int devid, cl_device_id *device)
			
 
				+{
			
 
				+        *device = devices[devid];
			
 
				+}
			
 
				+
			
 
				+void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
			
 
				+{
			
 
				+        *queue = queues[devid];
			
 
				+}
			
 
				+
			
 
				+int _starpu_opencl_init_context(int devid)
			
 
				+{
			
 
				+	cl_int err;
			
 
				+        cl_device_id device;
			
 
				+
			
 
				+        _STARPU_OPENCL_DEBUG("Initialising context for dev %d\n", devid);
			
 
				+
			
 
				+        // Create a compute context
			
 
				+        device = devices[devid];
			
 
				+        contexts[devid] = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+        // Create queue for the given device
			
 
				+        queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+        _starpu_opencl_init_programs(devid);
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int _starpu_opencl_deinit_context(int devid)
			
 
				+{
			
 
				+        int err;
			
 
				+
			
 
				+        _STARPU_OPENCL_DEBUG("De-initialising context for dev %d\n", devid);
			
 
				+
			
 
				+        err = clReleaseContext(contexts[devid]);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+        err = clReleaseCommandQueue(queues[devid]);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+        _starpu_opencl_release_programs(devid);
			
 
				+
			
 
				+        return EXIT_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
			
 
				+{
			
 
				+	cl_int err;
			
 
				+        cl_mem address;
			
 
				+        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
			
 
				+
			
 
				+	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+        *addr = address;
			
 
				+        return EXIT_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
			
 
				+{
			
 
				+      int err;
			
 
				+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
			
 
				+
			
 
				+      if (event == NULL) {
			
 
				+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				+      }
			
 
				+      else {
			
 
				+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
			
 
				+      }
			
 
				+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+      return EXIT_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
			
 
				+{
			
 
				+      int err;
			
 
				+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
			
 
				+
			
 
				+      if (event == NULL) {
			
 
				+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				+      }
			
 
				+      else {
			
 
				+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
			
 
				+      }
			
 
				+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+      return EXIT_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+void _starpu_opencl_init()
			
 
				+{
			
 
				+   if (!init_done) {
			
 
				+           cl_platform_id platform_id[STARPU_OPENCL_PLATFORM_MAX];
			
 
				+           cl_uint nb_platforms;
			
 
				+           cl_device_type device_type = CL_DEVICE_TYPE_GPU;
			
 
				+           cl_int err;
			
 
				+
			
 
				+           _STARPU_OPENCL_DEBUG("Initialising OpenCL\n");
			
 
				+
			
 
				+           // Get Platforms
			
 
				+           err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
			
 
				+           if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+           _STARPU_OPENCL_DEBUG("Platforms detected: %d\n", nb_platforms);
			
 
				+
			
 
				+           // Get devices
			
 
				+           nb_devices = 0;
			
 
				+           {
			
 
				+                   unsigned int i;
			
 
				+                   for (i=0; i<nb_platforms; i++) {
			
 
				+                           cl_uint num;
			
 
				+
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+                           {
			
 
				+                                   char name[1024], vendor[1024];
			
 
				+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
			
 
				+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
			
 
				+                                   _STARPU_OPENCL_DEBUG("Platform: %s - %s\n", name, vendor);
			
 
				+                           }
			
 
				+#endif
			
 
				+                           err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
			
 
				+                           if (err == CL_DEVICE_NOT_FOUND) {
			
 
				+                                   _STARPU_OPENCL_DEBUG("  No devices detected on this platform\n");
			
 
				+                           }
			
 
				+                           else {
			
 
				+                                   if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+                                   _STARPU_OPENCL_DEBUG("  %d devices detected\n", num);
			
 
				+                                   nb_devices += num;
			
 
				+                           }
			
 
				+         }
			
 
				+      }
			
 
				+
			
 
				+      // Get location of OpenCL codelet source files
			
 
				+      _starpu_opencl_codelet_dir = getenv("STARPU_OPENCL_CODELET_DIR");
			
 
				+
			
 
				+      init_done=1;
			
 
				+   }
			
 
				+}
			
 
				+
			
 
				+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
			
 
				+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args);
			
 
				+
			
 
				+void *_starpu_opencl_worker(void *arg)
			
 
				+{
			
 
				+	struct starpu_worker_s* args = arg;
			
 
				+
			
 
				+	int devid = args->devid;
			
 
				+	unsigned memory_node = args->memory_node;
			
 
				+
			
 
				+#ifdef USE_FXT
			
 
				+	fxt_register_thread(args->bindid);
			
 
				+#endif
			
 
				+
			
 
				+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
			
 
				+
			
 
				+	_starpu_set_local_memory_node_key(&(args->memory_node));
			
 
				+
			
 
				+	_starpu_set_local_queue(args->jobq);
			
 
				+
			
 
				+	_starpu_set_local_worker_key(args);
			
 
				+
			
 
				+	/* this is only useful (and meaningful) if there is a single
			
 
				+	   memory node "related" to that queue */
			
 
				+	args->jobq->memory_node = memory_node;
			
 
				+
			
 
				+	args->jobq->total_computation_time = 0.0;
			
 
				+	args->jobq->total_communication_time = 0.0;
			
 
				+	args->jobq->total_computation_time_error = 0.0;
			
 
				+	args->jobq->total_job_performed = 0;
			
 
				+
			
 
				+	_starpu_opencl_init_context(devid);
			
 
				+
			
 
				+	/* one more time to avoid hacks from third party lib :) */
			
 
				+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
			
 
				+
			
 
				+	args->status = STATUS_UNKNOWN;
			
 
				+
			
 
				+	/* get the device's name */
			
 
				+	char devname[128];
			
 
				+	_starpu_opencl_get_device_name(devid, devname, 128);
			
 
				+	snprintf(args->name, 32, "OpenCL %d (%s)", args->devid, devname);
			
 
				+
			
 
				+	_STARPU_OPENCL_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
			
 
				+
			
 
				+	STARPU_TRACE_WORKER_INIT_END
			
 
				+
			
 
				+	/* tell the main thread that this one is ready */
			
 
				+	PTHREAD_MUTEX_LOCK(&args->mutex);
			
 
				+	args->worker_is_initialized = 1;
			
 
				+	PTHREAD_COND_SIGNAL(&args->ready_cond);
			
 
				+	PTHREAD_MUTEX_UNLOCK(&args->mutex);
			
 
				+
			
 
				+	struct starpu_job_s * j;
			
 
				+	int res;
			
 
				+
			
 
				+	struct starpu_sched_policy_s *policy = _starpu_get_sched_policy();
			
 
				+	struct starpu_jobq_s *queue = policy->starpu_get_local_queue(policy);
			
 
				+	unsigned memnode = args->memory_node;
			
 
				+
			
 
				+	while (_starpu_machine_is_running())
			
 
				+	{
			
 
				+		STARPU_TRACE_START_PROGRESS(memnode);
			
 
				+		_starpu_datawizard_progress(memnode, 1);
			
 
				+		STARPU_TRACE_END_PROGRESS(memnode);
			
 
				+
			
 
				+		_starpu_execute_registered_progression_hooks();
			
 
				+
			
 
				+		_starpu_jobq_lock(queue);
			
 
				+
			
 
				+		/* perhaps there is some local task to be executed first */
			
 
				+		j = _starpu_pop_local_task(args);
			
 
				+
			
 
				+		/* otherwise ask the scheduler for a task */
			
 
				+		if (!j)
			
 
				+			j = _starpu_pop_task();
			
 
				+
			
 
				+		if (j == NULL) {
			
 
				+			if (_starpu_worker_can_block(memnode))
			
 
				+				PTHREAD_COND_WAIT(&queue->activity_cond, &queue->activity_mutex);
			
 
				+			_starpu_jobq_unlock(queue);
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		_starpu_jobq_unlock(queue);
			
 
				+
			
 
				+		/* can OpenCL do that task ? */
			
 
				+		if (!STARPU_OPENCL_MAY_PERFORM(j))
			
 
				+		{
			
 
				+			/* this is not an OpenCL task */
			
 
				+			_starpu_push_task(j);
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		_starpu_set_current_task(j->task);
			
 
				+
			
 
				+		res = _starpu_opencl_execute_job(j, args);
			
 
				+
			
 
				+		_starpu_set_current_task(NULL);
			
 
				+
			
 
				+                if (res) {
			
 
				+			switch (res) {
			
 
				+				case -EAGAIN:
			
 
				+					fprintf(stderr, "ouch, put the codelet %p back ... \n", j);
			
 
				+					_starpu_push_task(j);
			
 
				+					STARPU_ABORT();
			
 
				+					continue;
			
 
				+				default:
			
 
				+					assert(0);
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		_starpu_handle_job_termination(j);
			
 
				+	}
			
 
				+
			
 
				+	STARPU_TRACE_WORKER_DEINIT_START
			
 
				+
			
 
				+          _starpu_opencl_deinit_context(devid);
			
 
				+
			
 
				+#ifdef DATA_STATS
			
 
				+	fprintf(stderr, "OpenCL #%d computation %le comm %le (%lf \%%)\n", args->id, args->jobq->total_computation_time, args->jobq->total_communication_time, args->jobq->total_communication_time*100.0/args->jobq->total_computation_time);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+	double ratio = 0;
			
 
				+	if (args->jobq->total_job_performed != 0)
			
 
				+	{
			
 
				+		ratio = args->jobq->total_computation_time_error/args->jobq->total_computation_time;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	_starpu_print_to_logfile("MODEL ERROR: OpenCL %d ERROR %lf EXEC %lf RATIO %lf NTASKS %d\n", args->devid, args->jobq->total_computation_time_error, args->jobq->total_computation_time, ratio, args->jobq->total_job_performed);
			
 
				+#endif
			
 
				+
			
 
				+	pthread_exit(NULL);
			
 
				+
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
			
 
				+{
			
 
				+	int err;
			
 
				+
			
 
				+        if (!init_done) {
			
 
				+                _starpu_opencl_init();
			
 
				+        }
			
 
				+
			
 
				+	// Get device name
			
 
				+	err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	_STARPU_OPENCL_DEBUG("Device %d : [%s]\n", dev, name);
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+unsigned _starpu_opencl_get_device_count(void)
			
 
				+{
			
 
				+        if (!init_done) {
			
 
				+                _starpu_opencl_init();
			
 
				+        }
			
 
				+	return nb_devices;
			
 
				+}
			
 
				+
			
 
				+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args)
			
 
				+{
			
 
				+	int ret;
			
 
				+//	uint32_t mask = (1<<0);
			
 
				+	uint32_t mask = 0;
			
 
				+
			
 
				+	STARPU_ASSERT(j);
			
 
				+	struct starpu_task *task = j->task;
			
 
				+
			
 
				+	starpu_tick_t codelet_start, codelet_end;
			
 
				+	starpu_tick_t codelet_start_comm, codelet_end_comm;
			
 
				+
			
 
				+	unsigned calibrate_model = 0;
			
 
				+
			
 
				+	STARPU_ASSERT(task);
			
 
				+	struct starpu_codelet_t *cl = task->cl;
			
 
				+	STARPU_ASSERT(cl);
			
 
				+
			
 
				+	if (cl->model && cl->model->benchmarking)
			
 
				+		calibrate_model = 1;
			
 
				+
			
 
				+	/* we do not take communication into account when modeling the performance */
			
 
				+	if (STARPU_BENCHMARK_COMM)
			
 
				+	{
			
 
				+                //barrier(CLK_GLOBAL_MEM_FENCE);
			
 
				+		STARPU_GET_TICK(codelet_start_comm);
			
 
				+	}
			
 
				+
			
 
				+	ret = _starpu_fetch_task_input(task, mask);
			
 
				+	if (ret != 0) {
			
 
				+		/* there was not enough memory, so the input of
			
 
				+		 * the codelet cannot be fetched ... put the
			
 
				+		 * codelet back, and try it later */
			
 
				+		return -EAGAIN;
			
 
				+	}
			
 
				+
			
 
				+	if (calibrate_model || STARPU_BENCHMARK_COMM)
			
 
				+	{
			
 
				+                //barrier(CLK_GLOBAL_MEM_FENCE);
			
 
				+		STARPU_GET_TICK(codelet_end_comm);
			
 
				+	}
			
 
				+
			
 
				+	STARPU_TRACE_START_CODELET_BODY(j);
			
 
				+
			
 
				+	args->status = STATUS_EXECUTING;
			
 
				+	cl_func func = cl->opencl_func;
			
 
				+	STARPU_ASSERT(func);
			
 
				+	STARPU_GET_TICK(codelet_start);
			
 
				+	func(task->interface, task->cl_arg);
			
 
				+
			
 
				+	cl->per_worker_stats[args->workerid]++;
			
 
				+
			
 
				+	STARPU_GET_TICK(codelet_end);
			
 
				+
			
 
				+	args->status = STATUS_UNKNOWN;
			
 
				+
			
 
				+	STARPU_TRACE_END_CODELET_BODY(j);
			
 
				+
			
 
				+	if (calibrate_model || STARPU_BENCHMARK_COMM)
			
 
				+	{
			
 
				+		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
			
 
				+		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
			
 
				+
			
 
				+		args->jobq->total_computation_time += measured;
			
 
				+		args->jobq->total_communication_time += measured_comm;
			
 
				+
			
 
				+		double error;
			
 
				+		error = fabs(STARPU_MAX(measured, 0.0) - STARPU_MAX(j->predicted, 0.0));
			
 
				+		args->jobq->total_computation_time_error += error;
			
 
				+
			
 
				+		if (calibrate_model)
			
 
				+			_starpu_update_perfmodel_history(j, args->perf_arch, (unsigned)args->devid, measured);
			
 
				+	}
			
 
				+
			
 
				+	args->jobq->total_job_performed++;
			
 
				+
			
 
				+	_starpu_push_task_output(task, mask);
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/src/drivers/opencl/driver_opencl.h
+++ b/src/drivers/opencl/driver_opencl.h
@@ -0,0 +1,57 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DRIVER_OPENCL_H__
			
 
				+#define __DRIVER_OPENCL_H__
			
 
				+
			
 
				+#ifndef _GNU_SOURCE
			
 
				+#define _GNU_SOURCE
			
 
				+#endif
			
 
				+
			
 
				+#include <CL/cl.h>
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_init_context(int devid);
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_deinit_context(int devid);
			
 
				+
			
 
				+extern
			
 
				+unsigned _starpu_opencl_get_device_count(void);
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
			
 
				+
			
 
				+extern
			
 
				+void _starpu_opencl_init(void);
			
 
				+
			
 
				+extern
			
 
				+void *_starpu_opencl_worker(void *);
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue,
			
 
				+                               char *program_name, char *kernel_name, int dev);
			
 
				+
			
 
				+extern
			
 
				+int _starpu_opencl_compile_source_to_opencl(char *source_file_name);
			
 
				+
			
 
				+#endif //  __DRIVER_OPENCL_H__
			
--- a/src/drivers/opencl/driver_opencl_utils.c
+++ b/src/drivers/opencl/driver_opencl_utils.c
@@ -0,0 +1,430 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <sys/stat.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <unistd.h>
			
 
				+#include <sys/types.h>
			
 
				+#include <sys/wait.h>
			
 
				+
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <common/list.h>
			
 
				+#include <common/htable32.h>
			
 
				+#include <core/workers.h>
			
 
				+#include "driver_opencl_utils.h"
			
 
				+#include "driver_opencl.h"
			
 
				+
			
 
				+#define CRC32C_POLY_BE 0x1EDC6F41
			
 
				+
			
 
				+static
			
 
				+inline uint32_t __attribute__ ((pure)) crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	uint32_t crc;
			
 
				+
			
 
				+	crc = inputcrc ^ (inputbyte << 24);
			
 
				+	for (i = 0; i < 8; i++)
			
 
				+		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE : 0);
			
 
				+
			
 
				+	return crc;
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+uint32_t crc32_string(char *str)
			
 
				+{
			
 
				+	uint32_t hash = 0;
			
 
				+
			
 
				+	size_t len = strlen(str);
			
 
				+
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < len; i++)
			
 
				+	{
			
 
				+		hash = crc32_be_8((uint8_t)str[i], hash);
			
 
				+	}
			
 
				+
			
 
				+	return hash;
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+cl_uint _starpu_opencl_device_uniqueid(cl_device_id id)
			
 
				+{
			
 
				+	char name[1024];
			
 
				+	cl_int  err;
			
 
				+
			
 
				+	err = clGetDeviceInfo(id, CL_DEVICE_NAME, 1024, name, NULL);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	//  fprintf(stderr, "name %s\n", name);
			
 
				+
			
 
				+	return crc32_string(name);
			
 
				+}
			
 
				+
			
 
				+char *_starpu_opencl_codelet_dir;
			
 
				+
			
 
				+static
			
 
				+int _starpu_opencl_locate_file(char *source_file_name, char *located_file_name) {
			
 
				+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", source_file_name);
			
 
				+        if (access(source_file_name, R_OK) == 0) {
			
 
				+                strcpy(located_file_name, source_file_name);
			
 
				+                return EXIT_SUCCESS;
			
 
				+        }
			
 
				+        if (_starpu_opencl_codelet_dir) {
			
 
				+                sprintf(located_file_name, "%s/%s", _starpu_opencl_codelet_dir, source_file_name);
			
 
				+                _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
			
 
				+                if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
			
 
				+        }
			
 
				+        sprintf(located_file_name, "%s/%s", STARPU_OPENCL_DATADIR, source_file_name);
			
 
				+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
			
 
				+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
			
 
				+        sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
			
 
				+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
			
 
				+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
			
 
				+
			
 
				+        strcpy(located_file_name, "");
			
 
				+        OPENCL_ERROR("Cannot locate file <%s>\n", source_file_name);
			
 
				+        return EXIT_FAILURE;
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+unsigned char *_starpu_opencl_load_program_binary(char *filename, size_t *len)
			
 
				+{
			
 
				+	struct stat statbuf;
			
 
				+	FILE        *fh;
			
 
				+	unsigned char        *binary;
			
 
				+
			
 
				+	fh = fopen(filename, "r");
			
 
				+	if (fh == 0)
			
 
				+		return EXIT_SUCCESS;
			
 
				+
			
 
				+	stat(filename, &statbuf);
			
 
				+
			
 
				+	binary = (unsigned char *) malloc(statbuf.st_size);
			
 
				+	if(!binary)
			
 
				+		return binary;
			
 
				+
			
 
				+	fread(binary, statbuf.st_size, 1, fh);
			
 
				+
			
 
				+	*len = statbuf.st_size;
			
 
				+	return binary;
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+cl_int _starpu_opencl_load_program(cl_context context, char *program_name, cl_device_id device, cl_program *program)
			
 
				+{
			
 
				+        //	cl_program     program;
			
 
				+        const unsigned char *binary;
			
 
				+	size_t         len;
			
 
				+	cl_int         err;
			
 
				+	cl_int         status;
			
 
				+        cl_device_type type;
			
 
				+
			
 
				+	cl_uint uniqueid;
			
 
				+	char     located_program_name[1024];
			
 
				+	char     binary_file_name[1024];
			
 
				+	char    *p;
			
 
				+
			
 
				+        // locate file
			
 
				+        _starpu_opencl_locate_file(program_name, located_program_name);
			
 
				+
			
 
				+        // Get type of device
			
 
				+        err = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
			
 
				+	if (err != CL_SUCCESS) {
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+                return err;
			
 
				+        }
			
 
				+
			
 
				+        // Get the name of the binary file
			
 
				+	uniqueid = _starpu_opencl_device_uniqueid(device);
			
 
				+	strcpy(binary_file_name, located_program_name);
			
 
				+	p = strstr(binary_file_name, ".cl");
			
 
				+	if(p == NULL) OPENCL_ERROR("Program file name doesn't have the '.cl' extension!\n");
			
 
				+        strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
			
 
				+	sprintf(p + strlen(p), "%u", uniqueid);
			
 
				+
			
 
				+        // Load the binary file
			
 
				+	binary = _starpu_opencl_load_program_binary(binary_file_name, &len);
			
 
				+	if(binary == NULL)
			
 
				+		OPENCL_ERROR("Cannot load binary file %s\n", binary_file_name);
			
 
				+
			
 
				+	//_STARPU_OPENCL_DEBUG("[%s] binary file loaded.\n", binary_file_name);
			
 
				+	*program = clCreateProgramWithBinary(context, 1, &device, &len, &binary, &status, &err);
			
 
				+	if (err != CL_SUCCESS) {
			
 
				+                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+                return err;
			
 
				+        }
			
 
				+
			
 
				+	// Build the program executable
			
 
				+	err = clBuildProgram(*program, 0, NULL, NULL, NULL, NULL);
			
 
				+	if (err != CL_SUCCESS) {
			
 
				+		size_t len;
			
 
				+		char buffer[2048];
			
 
				+
			
 
				+		fprintf(stderr, "Error: Failed to build program executable!\n");
			
 
				+		clGetProgramBuildInfo(*program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			
 
				+		fprintf(stderr, "%s\n", buffer);
			
 
				+                return err;
			
 
				+	}
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_htbl32_node_s *history_program_hash[STARPU_MAXOPENCLDEVS] = {NULL};
			
 
				+LIST_TYPE(program,
			
 
				+          char *program_name;
			
 
				+          cl_program program;
			
 
				+          );
			
 
				+program_list_t history_program_list[STARPU_MAXOPENCLDEVS];
			
 
				+
			
 
				+int _starpu_opencl_init_programs(int dev)
			
 
				+{
			
 
				+        history_program_list[dev] = program_list_new();
			
 
				+        return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+/*
				+ * Release every OpenCL program recorded for device 'dev', then destroy the
				+ * per-device program list. Always returns CL_SUCCESS.
				+ *
				+ * NOTE(review): the entries of history_program_hash[dev] are not removed
				+ * here, so the hash table still references the released programs; confirm
				+ * that no kernel is loaded for this device after this call.
				+ */
				+int _starpu_opencl_release_programs(int dev)
				+{
				+	while (!program_list_empty(history_program_list[dev])) {
				+		program_t pp = program_list_pop_front(history_program_list[dev]);
				+		_STARPU_OPENCL_DEBUG("Releasing program=<%s> on dev=<%d>\n", pp->program_name, dev);
				+		clReleaseProgram(pp->program);
				+		/* The cell was allocated with program_new() when the program was
				+		 * loaded: free it, otherwise every loaded program leaks a cell. */
				+		program_delete(pp);
				+	}
				+	program_list_delete(history_program_list[dev]);
				+	return CL_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Create the kernel 'kernel_name' of the program 'program_name' for the
				+ * OpenCL device 'devid'.
				+ *
				+ * The program is looked up in the per-device cache (keyed by the crc32 of
				+ * its name) and is only loaded from its binary file on a cache miss.
				+ *
				+ *  kernel       : output, the created kernel
				+ *  queue        : output, the command queue of device 'devid'
				+ *  program_name : name of the program; the pointer itself is kept in the
				+ *                 program history, so it must stay valid until the
				+ *                 programs of the device are released
				+ *  kernel_name  : name of the kernel function inside the program
				+ *  devid        : index of the OpenCL device
				+ *
				+ * Returns CL_SUCCESS, or the OpenCL error code after reporting it.
				+ */
				+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid)
				+{
				+	cl_int err;
				+	cl_device_id device;
				+	cl_context context;
				+	uint32_t key;
				+	cl_program program;
				+
				+	starpu_opencl_get_device(devid, &device);
				+	starpu_opencl_get_context(devid, &context);
				+	starpu_opencl_get_queue(devid, queue);
				+
				+	key = crc32_string(program_name);
				+	program = _starpu_htbl_search_32(history_program_hash[devid], key);
				+	if (!program) {
				+		err = _starpu_opencl_load_program(context, program_name, device, &program);
				+		if (err != CL_SUCCESS) {
				+			STARPU_OPENCL_REPORT_ERROR(err);
				+			/* Never cache a program that failed to load */
				+			return err;
				+		}
				+		_starpu_htbl_insert_32(&(history_program_hash[devid]), key, program);
				+
				+		/* Remember the program so that it can be released with the device */
				+		program_t pp = program_new();
				+		pp->program_name = program_name;
				+		pp->program = program;
				+		program_list_push_front(history_program_list[devid], pp);
				+	}
				+
				+	// Create the compute kernel in the program we wish to run
				+	*kernel = clCreateKernel(program, kernel_name, &err);
				+	if (err != CL_SUCCESS) {
				+		STARPU_OPENCL_REPORT_ERROR(err);
				+		return err;
				+	}
				+
				+	return CL_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Release a kernel obtained from starpu_opencl_load_kernel(). A failure of
				+ * clReleaseKernel() is reported through STARPU_OPENCL_REPORT_ERROR; the
				+ * function itself always returns CL_SUCCESS.
				+ */
				+int starpu_opencl_release(cl_kernel kernel)
				+{
				+	cl_int status = clReleaseKernel(kernel);
				+
				+	if (status != CL_SUCCESS) {
				+		STARPU_OPENCL_REPORT_ERROR(status);
				+	}
				+	return CL_SUCCESS;
				+}
			
 
				+
			
 
				+/*
				+ * Read the whole content of 'filename' into a freshly allocated,
				+ * NUL-terminated buffer.
				+ *
				+ * Returns the buffer, which the caller must free(), or NULL when the file
				+ * cannot be opened, sized or read, or when memory is exhausted.
				+ */
				+static
				+char *_starpu_opencl_load_program_source(const char *filename)
				+{
				+	FILE   *fh;
				+	long    size;
				+	size_t  nread;
				+	char   *source = NULL;
				+
				+	fh = fopen(filename, "r");
				+	if (fh == NULL)
				+		return NULL;
				+
				+	/* Size the stream we actually opened rather than stat()ing the name again */
				+	if (fseek(fh, 0L, SEEK_END) != 0)
				+		goto out;
				+	size = ftell(fh);
				+	if (size < 0)
				+		goto out;
				+	rewind(fh);
				+
				+	source = malloc((size_t) size + 1);
				+	if (source == NULL)
				+		goto out;
				+
				+	nread = fread(source, 1, (size_t) size, fh);
				+	if (ferror(fh)) {
				+		free(source);
				+		source = NULL;
				+		goto out;
				+	}
				+	/* Terminate after what was really read, so no uninitialised byte is exposed */
				+	source[nread] = '\0';
				+
				+out:
				+	fclose(fh);
				+	return source;
				+}
			
 
				+
			
 
				+/*
				+ * Write the 'len' bytes of 'binary' into the file 'filename', which is
				+ * created or truncated.
				+ *
				+ * Returns EXIT_SUCCESS when everything was written and flushed, and
				+ * EXIT_FAILURE otherwise (the failing call is reported with perror()).
				+ */
				+static
				+int _starpu_opencl_store_program_binary(const char *filename, const char *binary, size_t len)
				+{
				+	FILE *fh;
				+	int   ret = EXIT_SUCCESS;
				+
				+	/* Program binaries are raw bytes: open in binary mode */
				+	fh = fopen(filename, "wb");
				+	if (fh == NULL) {
				+		perror("fopen");
				+		return EXIT_FAILURE;
				+	}
				+
				+	if (len > 0 && fwrite(binary, 1, len, fh) != len) {
				+		perror("fwrite");
				+		ret = EXIT_FAILURE;
				+	}
				+
				+	/* Buffered data is only flushed here, so a write error can show up at close time */
				+	if (fclose(fh) != 0) {
				+		perror("fclose");
				+		ret = EXIT_FAILURE;
				+	}
				+
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * Build the OpenCL source file 'source_file_name' for the devices of every
				+ * available platform, and store each resulting binary in a file.
				+ *
				+ * For each device whose unique id was not already met on the same platform:
				+ *  - the located source file is run through the C preprocessor ("cpp"); the
				+ *    output file name is the located name where everything from ".cl" on
				+ *    is replaced by ".pre";
				+ *  - the preprocessed source is built with "-Werror -cl-mad-enable";
				+ *  - the binary is stored in a file whose name is the located name where
				+ *    everything from ".cl" on is replaced by ".gpu.<uniqueid>" for a GPU
				+ *    device and ".cpu.<uniqueid>" otherwise.
				+ *
				+ * Returns EXIT_SUCCESS, or EXIT_FAILURE when a program fails to build (the
				+ * build log is then printed on stderr). The other failures are reported
				+ * through OPENCL_ERROR or STARPU_OPENCL_REPORT_ERROR.
				+ */
				+int _starpu_opencl_compile_source_to_opencl(char *source_file_name)
				+{
				+	cl_int           err;
				+	/* cl_device_type is a bitfield type: do not squeeze CL_DEVICE_TYPE_ALL into an int */
				+	cl_device_type   device_type = CL_DEVICE_TYPE_ALL;
				+	cl_device_id     devices[STARPU_MAXOPENCLDEVS];
				+	unsigned         max = STARPU_MAXOPENCLDEVS;
				+	unsigned         nb_devices = 0;
				+	cl_uint          history[STARPU_MAXOPENCLDEVS]; // To track similar devices
				+	/* ".pre" is one character longer than ".cl": keep some headroom */
				+	char             preproc_file_name[1024 + 16];
				+	char             located_file_name[1024];
				+	cl_platform_id   platform_ids[STARPU_OPENCL_PLATFORM_MAX];
				+	cl_uint          platform, nb_platforms = 0;
				+
				+	// Locate source file
				+	_starpu_opencl_locate_file(source_file_name, located_file_name);
				+	_STARPU_OPENCL_DEBUG("Source file name : <%s>\n", located_file_name);
				+
				+	// Prepare preprocessor temporary filename
				+	{
				+		char *p;
				+		strcpy(preproc_file_name, located_file_name);
				+		p = strstr(preproc_file_name, ".cl");
				+		if(p == NULL)
				+			OPENCL_ERROR("Kernel file name doesn't have the '.cl' extension!\n");
				+		strcpy(p, ".pre");
				+	}
				+
				+	// Get Platforms
				+	err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_ids, &nb_platforms);
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+	// nb_platforms is the number of platforms available, which may exceed the array size
				+	if(nb_platforms > STARPU_OPENCL_PLATFORM_MAX)
				+		nb_platforms = STARPU_OPENCL_PLATFORM_MAX;
				+
				+	// Iterate over each platform
				+	for(platform=0; platform<nb_platforms; platform++) {
				+		// Get devices
				+		err = clGetDeviceIDs(platform_ids[platform], device_type, max, devices, &nb_devices);
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+		if(nb_devices > max)
				+			nb_devices = max;
				+
				+		// Iterate over each device
				+		unsigned int dev;
				+		for(dev = 0; dev < nb_devices; dev ++) {
				+			cl_context       context;
				+			cl_program       program;
				+			cl_device_type   type;
				+			cl_uint          uniqueid;
				+			char            *source;
				+			unsigned int     d;
				+
				+			uniqueid =_starpu_opencl_device_uniqueid(devices[dev]);
				+
				+			// Look up and update history (to avoid useless compilations in the case of identical devices).
				+			// The id is recorded before the lookup so that the entries of skipped
				+			// devices are initialised too when the following devices scan them.
				+			history[dev] = uniqueid;
				+			for(d = 0; d < dev; d++)
				+				if(history[d] == uniqueid)
				+					break;
				+			if(d != dev)
				+				continue; // Just skip compiling for this device
				+
				+			err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
				+			if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+
				+			// Create a compute context
				+			context = clCreateContext(0, 1, devices + dev, NULL, NULL, &err);
				+			if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+
				+			// Run C preprocessor
				+			{
				+				pid_t pid;
				+				pid = fork();
				+				if(pid < 0) {
				+					perror("fork");
				+					OPENCL_ERROR("Cannot preprocess file [%s]\n", located_file_name);
				+				}
				+				else if(pid == 0) {
				+					execlp("cpp", "cpp", located_file_name, "-o", preproc_file_name, (char *) NULL);
				+					perror("execlp");
				+					// _exit: do not flush the stdio buffers inherited from the parent
				+					_exit(EXIT_FAILURE);
				+				}
				+				else {
				+					int status;
				+					// The exit status is only meaningful when the child exited normally
				+					if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS)
				+						OPENCL_ERROR("Cannot preprocess file [%s]\n", located_file_name);
				+				}
				+			}
				+
				+			// Load the compute program from disk into a cstring buffer
				+			source = _starpu_opencl_load_program_source(preproc_file_name);
				+			if(!source)
				+				OPENCL_ERROR("Failed to load compute program from file <%s>!\n", preproc_file_name);
				+
				+			// Create the compute program from the source buffer
				+			program = clCreateProgramWithSource(context, 1, (const char **) & source, NULL, &err);
				+			if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+
				+			// Build the program executable
				+			err = clBuildProgram(program, 1, devices + dev, "-Werror -cl-mad-enable", NULL, NULL);
				+
				+			// The source was loaded into the program object: our copy is not needed any more
				+			free(source);
				+
				+			if (err != CL_SUCCESS) {
				+				static char buffer[4096];
				+
				+				fprintf(stderr, "Error: Failed to build program executable!\n");
				+				buffer[0] = '\0';
				+				clGetProgramBuildInfo(program, devices[dev], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
				+
				+				fprintf(stderr, "%s\n", buffer);
				+
				+				// Do not leak the OpenCL objects of this device on the error path
				+				clReleaseProgram(program);
				+				clReleaseContext(context);
				+				return EXIT_FAILURE;
				+			}
				+
				+			// Store program binary
				+			{
				+				// ".gpu.<uniqueid>" is longer than ".cl": keep some headroom
				+				char     binary_file_name[1024 + 16];
				+				char    *binary;
				+				size_t   binary_len;
				+				char    *p;
				+
				+				strcpy(binary_file_name, located_file_name);
				+				p = strstr(binary_file_name, ".cl");
				+				if(p == NULL)
				+					OPENCL_ERROR("Input file name doesn't have the '.cl' extension!\n");
				+
				+				strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
				+				sprintf(p + strlen(p), "%u", uniqueid);
				+
				+				err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
				+				if(err != CL_SUCCESS)
				+					OPENCL_ERROR("Cannot get program binary size (err = %d)!\n", err);
				+
				+				binary = malloc(binary_len);
				+				if(binary == NULL)
				+					OPENCL_ERROR("Cannot allocate %zu bytes for the program binary!\n", binary_len);
				+
				+				// CL_PROGRAM_BINARIES expects an array of buffer pointers, here one pointer for our single device
				+				err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
				+				if(err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
				+
				+				_starpu_opencl_store_program_binary(binary_file_name, binary, binary_len);
				+
				+				free(binary);
				+
				+				_STARPU_OPENCL_DEBUG("Binary file [%s] successfully built (%zu bytes).\n", binary_file_name, binary_len);
				+			}
				+
				+			clReleaseProgram(program);
				+			clReleaseContext(context);
				+		}
				+	}
				+
				+	return EXIT_SUCCESS;
				+}
			
--- a/src/drivers/opencl/driver_opencl_utils.h
+++ b/src/drivers/opencl/driver_opencl_utils.h
@@ -0,0 +1,41 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_OPENCL_UTILS_H__
			
 
				+#define __STARPU_OPENCL_UTILS_H__
			
 
				+
			
 
				+#include <config.h>
			
 
				+
			
 
				+/*
				+ * _STARPU_OPENCL_DEBUG(fmt, ...): printf-like trace written on stderr and
				+ * prefixed with "[starpu][<calling function>] ". It expands to nothing
				+ * unless STARPU_VERBOSE is defined, so its arguments are then not evaluated.
				+ */
				+#ifdef STARPU_VERBOSE
				+#  define _STARPU_OPENCL_DEBUG(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
				+#else
				+#  define _STARPU_OPENCL_DEBUG(fmt, args ...)
				+#endif
				+
				+/* Same output as the verbose _STARPU_OPENCL_DEBUG, but always enabled. */
				+#define _STARPU_OPENCL_DISP(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
				+
				+/*
				+ * OPENCL_ERROR(fmt, ...): print an error message on stderr, prefixed with
				+ * "[starpu][<calling function>] Error: ", then trigger assert(0).
				+ * NOTE(review): assert() is compiled out when NDEBUG is defined, in which
				+ * case execution continues after the message; callers should not rely on
				+ * this macro never returning. This header only includes <config.h>, so
				+ * <assert.h> and <stdio.h> presumably have to be included by the user of
				+ * the macro - TODO confirm.
				+ */
				+#define OPENCL_ERROR(fmt, args ...)                                                   \
				+	do {                                                                          \
				+                fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args); \
				+		assert(0);                                                            \
				+	} while (0)
				+
				+/* Size of the array handed to clGetPlatformIDs() when enumerating the platforms. */
				+#define STARPU_OPENCL_PLATFORM_MAX 4
				+
				+/* Create the list recording the programs loaded for OpenCL device 'dev'. */
				+int _starpu_opencl_init_programs(int dev);
				+/* Release the programs recorded for OpenCL device 'dev' and destroy their list. */
				+int _starpu_opencl_release_programs(int dev);
			
 
				+
			
 
				+#endif /* __STARPU_OPENCL_UTILS_H__ */
			
--- a/src/util/execute_on_all.c
+++ b/src/util/execute_on_all.c
@@ -39,6 +39,7 @@ void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t whe
 
				 		.where = where,
			
 
				 		.cuda_func = wrapper_func,
			
 
				 		.cpu_func = wrapper_func,
			
 
				+		.opencl_func = wrapper_func,
			
 
				 		/* XXX we do not handle Cell .. */
			
 
				 		.nbuffers = 0,
			
 
				 		.model = NULL
			
--- a/src/util/malloc.c
+++ b/src/util/malloc.c
@@ -24,13 +24,28 @@
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				 struct malloc_pinned_codelet_struct {
			
 
				 	void **ptr;
			
 
				 	size_t dim;
			
 
				 };
			
 
				+#endif
			
 
				+
			
 
				+//#ifdef STARPU_USE_OPENCL
			
 
				+//static void malloc_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+//{
			
 
				+//	struct malloc_pinned_codelet_struct *s = arg;
			
 
				+//        //        *(s->ptr) = malloc(s->dim);
			
 
				+//        _starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
			
 
				+//}
			
 
				+//#endif
			
 
				 
			
 
				-static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+static void malloc_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				 	struct malloc_pinned_codelet_struct *s = arg;
			
 
				 
			
@@ -39,10 +54,16 @@ static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 }
			
 
				+#endif
			
 
				 
			
 
				+#if defined(STARPU_USE_CUDA)// || defined(STARPU_USE_OPENCL)
			
 
				 static starpu_codelet malloc_pinned_cl = {
			
 
				-	.where = STARPU_CUDA,
			
 
				-	.cuda_func = malloc_pinned_codelet,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = malloc_pinned_cuda_codelet,
			
 
				+#endif
			
 
				+//#ifdef STARPU_USE_OPENCL
			
 
				+//	.opencl_func = malloc_pinned_opencl_codelet,
			
 
				+//#endif
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
@@ -59,24 +80,47 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 
				 	{
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		int push_res;
			
 
				-	
			
 
				+
			
 
				 		struct malloc_pinned_codelet_struct s = {
			
 
				 			.ptr = A,
			
 
				 			.dim = dim
			
 
				-		};	
			
 
				-	
			
 
				+		};
			
 
				+
			
 
				+                malloc_pinned_cl.where = STARPU_CUDA;
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				-			task->callback_func = NULL; 
			
 
				+			task->callback_func = NULL;
			
 
				 			task->cl = &malloc_pinned_cl;
			
 
				 			task->cl_arg = &s;
			
 
				 
			
 
				 		task->synchronous = 1;
			
 
				-	
			
 
				+
			
 
				 		push_res = starpu_submit_task(task);
			
 
				 		STARPU_ASSERT(push_res != -ENODEV);
			
 
				 #endif
			
 
				 	}
			
 
				-	else {
			
 
				+//	else if (_starpu_may_submit_opencl_task())
			
 
				+//	{
			
 
				+//#ifdef STARPU_USE_OPENCL
			
 
				+//		int push_res;
			
 
				+//
			
 
				+//		struct malloc_pinned_codelet_struct s = {
			
 
				+//			.ptr = A,
			
 
				+//			.dim = dim
			
 
				+//		};
			
 
				+//
			
 
				+//                malloc_pinned_cl.where = STARPU_OPENCL;
			
 
				+//		struct starpu_task *task = starpu_task_create();
			
 
				+//			task->callback_func = NULL;
			
 
				+//			task->cl = &malloc_pinned_cl;
			
 
				+//			task->cl_arg = &s;
			
 
				+//
			
 
				+//		task->synchronous = 1;
			
 
				+//
			
 
				+//		push_res = starpu_submit_task(task);
			
 
				+//		STARPU_ASSERT(push_res != -ENODEV);
			
 
				+//#endif
			
 
				+//        }
			
 
				+        else {
			
 
				 		*A = malloc(dim);
			
 
				 	}
			
 
				 
			
@@ -86,17 +130,32 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void free_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				 	cudaError_t cures;
			
 
				 	cures = cudaFreeHost(arg);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 }
			
 
				+#endif
			
 
				+
			
 
				+//#ifdef STARPU_USE_OPENCL
			
 
				+//static void free_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+//{
			
 
				+//        //        free(arg);
			
 
				+//        int err = clReleaseMemObject(arg);
			
 
				+//        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+//}
			
 
				+//#endif
			
 
				 
			
 
				+#if defined(STARPU_USE_CUDA) // || defined(STARPU_USE_OPENCL)
			
 
				 static starpu_codelet free_pinned_cl = {
			
 
				-	.where = STARPU_CUDA,
			
 
				-	.cuda_func = free_pinned_codelet,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = free_pinned_cuda_codelet,
			
 
				+#endif
			
 
				+//#ifdef STARPU_USE_OPENCL
			
 
				+//	.opencl_func = free_pinned_opencl_codelet,
			
 
				+//#endif
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
@@ -111,18 +170,36 @@ int starpu_free_pinned_if_possible(void *A)
 
				 	{
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		int push_res;
			
 
				-	
			
 
				+
			
 
				+                free_pinned_cl.where = STARPU_CUDA;
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				-			task->callback_func = NULL; 
			
 
				+			task->callback_func = NULL;
			
 
				 			task->cl = &free_pinned_cl;
			
 
				 			task->cl_arg = A;
			
 
				 
			
 
				 		task->synchronous = 1;
			
 
				-	
			
 
				+
			
 
				 		push_res = starpu_submit_task(task);
			
 
				 		STARPU_ASSERT(push_res != -ENODEV);
			
 
				 #endif
			
 
				 	}
			
 
				+//	else if (_starpu_may_submit_opencl_task())
			
 
				+//	{
			
 
				+//#ifdef STARPU_USE_OPENCL
			
 
				+//		int push_res;
			
 
				+//
			
 
				+//                free_pinned_cl.where = STARPU_OPENCL;
			
 
				+//		struct starpu_task *task = starpu_task_create();
			
 
				+//			task->callback_func = NULL;
			
 
				+//			task->cl = &free_pinned_cl;
			
 
				+//			task->cl_arg = A;
			
 
				+//
			
 
				+//		task->synchronous = 1;
			
 
				+//
			
 
				+//		push_res = starpu_submit_task(task);
			
 
				+//		STARPU_ASSERT(push_res != -ENODEV);
			
 
				+//#endif
			
 
				+//	}
			
 
				 	else {
			
 
				 		free(A);
			
 
				 	}
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -33,7 +33,7 @@ if STARPU_USE_CUDA
 
				 # TODO define NVCCFLAGS
			
 
				 NVCC ?= nvcc
			
 
				 
			
 
				-NVCCFLAGS += -I$(top_srcdir)/include/
			
 
				+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
			
 
				 
			
 
				 .cu.cubin:
			
 
				 	$(MKDIR_P) `dirname $@`
			
@@ -246,6 +246,11 @@ datawizard_sync_and_notify_data_SOURCES +=	\
 
				 	datawizard/sync_and_notify_data_kernels.cu
			
 
				 endif
			
 
				 
			
 
				+if STARPU_USE_OPENCL
			
 
				+datawizard_sync_and_notify_data_SOURCES +=	\
			
 
				+	datawizard/sync_and_notify_data_opencl.c
			
 
				+endif
			
 
				+
			
 
				 if STARPU_USE_GORDON
			
 
				 datawizard_sync_and_notify_data_SOURCES +=	\
			
 
				 	datawizard/sync_and_notify_data_gordon_kernels.c
			
--- a/tests/core/empty_task_sync_point.c
+++ b/tests/core/empty_task_sync_point.c
@@ -34,9 +34,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/execute_on_a_specific_worker.c
+++ b/tests/core/execute_on_a_specific_worker.c
@@ -71,9 +71,10 @@ static starpu_access_mode select_random_mode(void)
 
				 
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = codelet_null,
			
 
				 	.cuda_func = codelet_null,
			
 
				+        .opencl_func = codelet_null,
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/core/multithreaded.c
+++ b/tests/core/multithreaded.c
@@ -33,9 +33,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/regenerate.c
+++ b/tests/core/regenerate.c
@@ -51,9 +51,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/starpu_wait_all_tasks.c
+++ b/tests/core/starpu_wait_all_tasks.c
@@ -28,9 +28,10 @@ static void dummy_func(void *descr[], void *arg)
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 	.gordon_func = 0, /* this will be defined later */
			
 
				 #endif
			
@@ -70,9 +71,11 @@ static struct starpu_conf conf = {
 
				 	.sched_policy_name = NULL,
			
 
				 	.ncpus = -1,
			
 
				 	.ncuda = -1,
			
 
				+        .nopencl = -1,
			
 
				 	.nspus = -1,
			
 
				 	.use_explicit_workers_bindid = 0,
			
 
				-	.use_explicit_workers_gpuid = 0,
			
 
				+	.use_explicit_workers_cuda_gpuid = 0,
			
 
				+	.use_explicit_workers_opencl_gpuid = 0,
			
 
				 	.calibrate = 0
			
 
				 };
			
 
				 
			
--- a/tests/core/starpu_wait_task.c
+++ b/tests/core/starpu_wait_task.c
@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
@@ -86,7 +87,8 @@ int main(int argc, char **argv)
 
				 		int ret = starpu_submit_task(task);
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 
			
 
				-		starpu_wait_task(task);
			
 
				+		ret = starpu_wait_task(task);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				 
			
 
				 		starpu_task_destroy(task);
			
 
				 	}
			
--- a/tests/core/static_restartable.c
+++ b/tests/core/static_restartable.c
@@ -28,10 +28,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				-	.model = NULL,
			
 
				+	.opencl_func = dummy_func,
			
 
				+        .model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
 
				 
			
--- a/tests/core/static_restartable_tag.c
+++ b/tests/core/static_restartable_tag.c
@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/static_restartable_using_initializer.c
+++ b/tests/core/static_restartable_using_initializer.c
@@ -31,9 +31,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/subgraph_repeat.c
+++ b/tests/core/subgraph_repeat.c
@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/subgraph_repeat_regenerate.c
+++ b/tests/core/subgraph_repeat_regenerate.c
@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/tag_wait_api.c
+++ b/tests/core/tag_wait_api.c
@@ -26,9 +26,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/core/task_wait_api.c
+++ b/tests/core/task_wait_api.c
@@ -26,10 +26,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				-	.model = NULL,
			
 
				+	.opencl_func = dummy_func,
			
 
				+        .model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/dining_philosophers.c
+++ b/tests/datawizard/dining_philosophers.c
@@ -27,9 +27,10 @@ static void eat_kernel(void *descr[], void *arg)
 
				 }
			
 
				 
			
 
				 static starpu_codelet eating_cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cuda_func = eat_kernel,
			
 
				 	.cpu_func = eat_kernel,
			
 
				+        .opencl_func = eat_kernel,
			
 
				 	.nbuffers = 2
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/dsm_stress.c
+++ b/tests/datawizard/dsm_stress.c
@@ -54,6 +54,10 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg
 
				 {
			
 
				 }
			
 
				 
			
 
				+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				 static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				 }
			
@@ -74,9 +78,10 @@ static starpu_access_mode select_random_mode(void)
 
				 
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = cpu_codelet_null,
			
 
				 	.cuda_func = cuda_codelet_null,
			
 
				+        .opencl_func = opencl_codelet_null,
			
 
				 	.nbuffers = 2
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/readers_and_writers.c
+++ b/tests/datawizard/readers_and_writers.c
@@ -24,9 +24,10 @@ static void dummy_kernel(void *descr[], void *arg)
 
				 }
			
 
				 
			
 
				 static starpu_codelet rw_cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cuda_func = dummy_kernel,
			
 
				 	.cpu_func = dummy_kernel,
			
 
				+	.opencl_func = dummy_kernel,
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/sync_and_notify_data.c
+++ b/tests/datawizard/sync_and_notify_data.c
@@ -46,6 +46,11 @@ void cuda_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
 
				 void cuda_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
			
 
				+#endif
			
 
				+
			
 
				 #define VECTORSIZE	16
			
 
				 
			
 
				 starpu_data_handle v_handle;
			
@@ -79,8 +84,12 @@ int main(int argc, char **argv)
 
				 
			
 
				 	fprintf(stderr, "kernel incA %d incC %d elf %d\n", kernel_incA_id, kernel_incC_id, elf_id);
			
 
				 #endif
			
 
				-	
			
 
				-	starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        _starpu_opencl_compile_source_to_opencl("tests/datawizard/sync_and_notify_data_opencl_codelet.cl");
			
 
				+#endif
			
 
				+
			
 
				+        starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
			
 
				 
			
 
				 	unsigned iter;
			
 
				 	for (iter = 0; iter < K; iter++)
			
@@ -91,11 +100,14 @@ int main(int argc, char **argv)
 
				 		{
			
 
				 			/* increment a = v[0] */
			
 
				 			starpu_codelet cl_inc_a = {
			
 
				-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
			
 
				 				.cpu_func = cpu_codelet_incA,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 				.cuda_func = cuda_codelet_incA,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+				.opencl_func = opencl_codelet_incA,
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 				.gordon_func = kernel_incA_id,
			
 
				 #endif
			
@@ -127,11 +139,14 @@ int main(int argc, char **argv)
 
				 		{
			
 
				 			/* increment c = v[2] */
			
 
				 			starpu_codelet cl_inc_c = {
			
 
				-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
			
 
				 				.cpu_func = cpu_codelet_incC,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 				.cuda_func = cuda_codelet_incC,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+				.opencl_func = opencl_codelet_incC,
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 				.gordon_func = kernel_incC_id,
			
 
				 #endif
			
--- a/tests/datawizard/sync_and_notify_data_opencl.c
+++ b/tests/datawizard/sync_and_notify_data_opencl.c
@@ -0,0 +1,77 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <CL/cl.h>
			
 
				+
			
 
				+/* OpenCL implementation of incA: runs the "incA" kernel on descr[0]'s vector and waits for it. */
			
 
				+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	/* The vector "pointer" is handed to clSetKernelArg as a cl_mem handle. */
			
 
				+	cl_mem val = (cl_mem)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	int id, devid, err;
			
 
				+
			
 
				+	id = starpu_get_worker_id();
			
 
				+	devid = starpu_get_worker_devid(id);
			
 
				+
			
 
				+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incA", devid);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	{
			
 
				+		/* NULL local size: let the runtime pick a work-group size the device supports. */
			
 
				+		size_t global=100;
			
 
				+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
			
 
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+	err = clFinish(queue);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	starpu_opencl_release(kernel);
			
 
				+}
			
 
				+
			
 
				+/* OpenCL implementation of incC: runs the "incC" kernel on descr[0]'s vector and waits for it. */
			
 
				+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	/* The vector "pointer" is handed to clSetKernelArg as a cl_mem handle. */
			
 
				+	cl_mem val = (cl_mem)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	cl_kernel kernel;
			
 
				+	cl_command_queue queue;
			
 
				+	int id, devid, err;
			
 
				+
			
 
				+	id = starpu_get_worker_id();
			
 
				+	devid = starpu_get_worker_devid(id);
			
 
				+
			
 
				+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incC", devid);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+	{
			
 
				+		/* NULL local size: let the runtime pick a work-group size the device supports. */
			
 
				+		size_t global=100;
			
 
				+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
			
 
				+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	}
			
 
				+	err = clFinish(queue);
			
 
				+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	starpu_opencl_release(kernel);
			
 
				+}
			
--- a/tests/datawizard/sync_and_notify_data_opencl_codelet.cl
+++ b/tests/datawizard/sync_and_notify_data_opencl_codelet.cl
@@ -0,0 +1,30 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* Increments element 0 of the vector; every other work-item does nothing. */
			
 
				+__kernel void incA(__global unsigned* input)
			
 
				+{
			
 
				+	if (get_global_id(0) == 0)
			
 
				+		input[0]++;
			
 
				+}
			
 
				+
			
 
				+/* Increments element 2 of the vector; every other work-item does nothing. */
			
 
				+__kernel void incC(__global unsigned* input)
			
 
				+{
			
 
				+	if (get_global_id(0) == 2)
			
 
				+		input[2]++;
			
 
				+}
			
 
				+
			
--- a/tests/datawizard/sync_with_data_with_mem.c
+++ b/tests/datawizard/sync_with_data_with_mem.c
@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
				 }
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_codelet,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = dummy_codelet,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        .opencl_func = dummy_codelet,
			
 
				+#endif
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/sync_with_data_with_mem_non_blocking.c
+++ b/tests/datawizard/sync_with_data_with_mem_non_blocking.c
@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
				 }
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_codelet,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = dummy_codelet,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_func = dummy_codelet,
			
 
				+#endif
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/unpartition.c
+++ b/tests/datawizard/unpartition.c
@@ -32,11 +32,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
				 }
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_codelet,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = dummy_codelet,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_func = dummy_codelet,
			
 
				+#endif
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/datawizard/write_only_tmp_buffer.c
+++ b/tests/datawizard/write_only_tmp_buffer.c
@@ -24,6 +24,21 @@
 
				 
			
 
				 starpu_data_handle v_handle;
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#include <CL/cl.h>
			
 
				+/* OpenCL implementation: blocking write of one byte (42) at offset 0 of descr[0]'s buffer. */
			
 
				+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	const char value = 42;
			
 
				+	cl_command_queue queue;
			
 
				+	cl_mem buf = (cl_mem)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	int devid = starpu_get_worker_devid(starpu_get_worker_id());
			
 
				+
			
 
				+	starpu_opencl_get_queue(devid, &queue);
			
 
				+	clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, sizeof(value), &value, 0, NULL, NULL);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
@@ -51,11 +66,14 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 
				 }
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = cpu_codelet_null,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_func = cuda_codelet_null,
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_func = opencl_codelet_null,
			
 
				+#endif
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/errorcheck/invalid_blocking_calls.c
+++ b/tests/errorcheck/invalid_blocking_calls.c
@@ -38,9 +38,10 @@ static void wrong_func(void *descr[], void *arg)
 
				 
			
 
				 static starpu_codelet wrong_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = wrong_func,
			
 
				 	.cuda_func = wrong_func,
			
 
				+        .opencl_func = wrong_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/errorcheck/starpu_init_noworker.c
+++ b/tests/errorcheck/starpu_init_noworker.c
@@ -29,9 +29,11 @@ int main(int argc, char **argv)
 
				 		.sched_policy_name = NULL, /* default */
			
 
				 		.ncpus = 0,
			
 
				 		.ncuda = 0,
			
 
				+                .nopencl = 0,
			
 
				 		.nspus = 0,
			
 
				 		.use_explicit_workers_bindid = 0,
			
 
				-		.use_explicit_workers_gpuid = 0,
			
 
				+		.use_explicit_workers_cuda_gpuid = 0,
			
 
				+		.use_explicit_workers_opencl_gpuid = 0,
			
 
				 		.calibrate = 0
			
 
				 	};
			
 
				 
			
--- a/tests/helper/execute_on_all.c
+++ b/tests/helper/execute_on_all.c
@@ -32,12 +32,14 @@ int main(int argc, char **argv)
 
				 
			
 
				 	int arg = 0x42;
			
 
				 
			
 
				-	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA);
			
 
				+	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA|STARPU_OPENCL);
			
 
				 
			
 
				 	starpu_execute_on_each_worker(func, &arg, STARPU_CPU);
			
 
				 	
			
 
				 	starpu_execute_on_each_worker(func, &arg, STARPU_CUDA);
			
 
				 
			
 
				+        starpu_execute_on_each_worker(func, &arg, STARPU_OPENCL);
			
 
				+
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/tests/helper/starpu_create_sync_task.c
+++ b/tests/helper/starpu_create_sync_task.c
@@ -25,9 +25,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+        .opencl_func = dummy_func,
			
 
				 	.nbuffers = 0
			
 
				 };
			
 
				 
			
--- a/tests/microbenchs/async_tasks_overhead.c
+++ b/tests/microbenchs/async_tasks_overhead.c
@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+        .opencl_func = dummy_func,
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 	.gordon_func = 0, /* this will be defined later */
			
 
				 #endif
			
@@ -91,9 +92,11 @@ static struct starpu_conf conf = {
 
				 	.sched_policy_name = NULL,
			
 
				 	.ncpus = -1,
			
 
				 	.ncuda = -1,
			
 
				+        .nopencl = -1,
			
 
				 	.nspus = -1,
			
 
				 	.use_explicit_workers_bindid = 0,
			
 
				-	.use_explicit_workers_gpuid = 0,
			
 
				+	.use_explicit_workers_cuda_gpuid = 0,
			
 
				+	.use_explicit_workers_opencl_gpuid = 0,
			
 
				 	.calibrate = 0
			
 
				 };
			
 
				 
			
--- a/tests/microbenchs/prefetch_data_on_node.c
+++ b/tests/microbenchs/prefetch_data_on_node.c
@@ -75,9 +75,10 @@ static starpu_access_mode select_random_mode(void)
 
				 
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = codelet_null,
			
 
				 	.cuda_func = codelet_null,
			
 
				+	.opencl_func = codelet_null,
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
--- a/tests/microbenchs/redundant_buffer.c
+++ b/tests/microbenchs/redundant_buffer.c
@@ -27,6 +27,10 @@
 
				 starpu_data_handle v_handle;
			
 
				 static unsigned *v;
			
 
				 
			
 
				+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{ /* Empty body: this OpenCL implementation (cl.opencl_func) does no work. */
			
 
				+}
			
 
				+
			
 
				 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				 }
			
@@ -36,10 +40,11 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 
				 }
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = cpu_codelet_null,
			
 
				 	.cuda_func = cuda_codelet_null,
			
 
				-	.nbuffers = 2
			
 
				+	.opencl_func = opencl_codelet_null,
			
 
				+        .nbuffers = 2
			
 
				 };
			
 
				 
			
 
				 
			
--- a/tests/microbenchs/sync_tasks_overhead.c
+++ b/tests/microbenchs/sync_tasks_overhead.c
@@ -28,9 +28,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+        .opencl_func = dummy_func,
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 	.gordon_func = 0, /* this will be defined later */
			
 
				 #endif
			
--- a/tests/microbenchs/tasks_overhead.c
+++ b/tests/microbenchs/tasks_overhead.c
@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static starpu_codelet dummy_codelet = 
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = dummy_func,
			
 
				 	.cuda_func = dummy_func,
			
 
				+	.opencl_func = dummy_func,
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/overlap/overlap.c
+++ b/tests/overlap/overlap.c
@@ -59,9 +59,10 @@ static struct starpu_perfmodel_t model = {
 
				 };
			
 
				 
			
 
				 static starpu_codelet cl = {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_func = codelet_sleep,
			
 
				 	.cuda_func = codelet_sleep,
			
 
				+        .opencl_func = codelet_sleep,
			
 
				 	.nbuffers = 1,
			
 
				 	.model =  &model
			
 
				 };