15 years ago · 59c29c44ec
--- a/Makefile.am
+++ b/Makefile.am
@@ -34,4 +34,5 @@ include_HEADERS = 				\
 
																 	include/starpu_data.h			\
															
 
																 	include/starpu_perfmodel.h		\
															
 
																 	include/starpu_util.h			\
															
 
																+	include/starpu_opencl.h			\
															
 
																 	include/starpu_expert.h
															
--- a/configure.ac
+++ b/configure.ac
@@ -52,8 +52,10 @@ esac
 
																 # This will be useful for program which use CUDA (and .cubin files) which need
															
 
																 # some path to the CUDA code at runtime.
															
 
																-AC_DEFINE_UNQUOTED(STARPU_DIR, "$PWD", [location of StarPU sources])
															
 
																-AC_SUBST(STARPU_DIR, $PWD)
															
 
																+AC_DEFINE_UNQUOTED(STARPU_BUILD_DIR, "$PWD", [location of StarPU build directory])
															
 
																+AC_SUBST(STARPU_BUILD_DIR, $PWD)
															
 
																+AC_DEFINE_UNQUOTED(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})", [location of StarPU sources])
															
 
																+AC_SUBST(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})")
															
 
																 AC_CHECK_LIB([pthread], [pthread_create])
															
 
																 case "$target" in
															
@@ -300,6 +302,71 @@ fi
 
																 ###############################################################################
															
 
																 #                                                                             #
															
 
																+#                                 OpenCL settings                             #
															
 
																+#                                                                             #
															
 
																+###############################################################################
															
 
																+
															
 
																+AC_MSG_CHECKING(maximum number of OpenCL devices)
															
 
																+AC_ARG_ENABLE(maxopencldev, [AS_HELP_STRING([--enable-maxopencldev=<number>],
															
 
																+			[maximum number of OPENCL devices])],
															
 
																+			nmaxopencldev=$enableval, nmaxopencldev=8)
															
 
																+AC_MSG_RESULT($nmaxopencldev)
															
 
																+AC_DEFINE_UNQUOTED(STARPU_MAXOPENCLDEVS, [$nmaxopencldev], 
															
 
																+		[maximum number of OPENCL devices])
															
 
																+AC_ARG_ENABLE(opencl, [AS_HELP_STRING([--disable-opencl],
															
 
																+		[do not use OpenCL device(s)])],, [enable_opencl=maybe])
															
 
																+
															
 
																+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
															
 
																+	#AC_MSG_CHECKING(whether OpenCL is available)
															
 
																+	AC_ARG_WITH(opencl-dir, 
															
 
																+		[AS_HELP_STRING([--with-opencl-dir=<path>],
															
 
																+		[specify OpenCL installation directory (default is /usr/)])],
															
 
																+		[
															
 
																+			opencl_dir=$withval
															
 
																+			# in case this was not explicit yet
															
 
																+			enable_opencl=yes
															
 
																+		], opencl_dir=/usr/)
															
 
																+	
															
 
																+	if test -d "$opencl_dir/include/"; then
															
 
																+		CPPFLAGS="${CPPFLAGS} -I$opencl_dir/include/ "
															
 
																+	fi
															
 
																+
															
 
																+	# do we have a valid OpenCL setup ?
															
 
																+	have_valid_opencl=yes
															
 
																+	AC_CHECK_HEADER([CL/cl.h],,[have_valid_opencl=no])
															
 
																+
															
 
																+	# we are looking for the proper option in LDFLAGS, so we save the
															
 
																+	# current value of LDFLAGS so that we can add new things in it and
															
 
																+	# restore it in case it's not working.
															
 
																+	SAVED_LDFLAGS="${LDFLAGS}"
															
 
																+
															
 
																+	found_opencllib=no
															
 
																+	if test -d "$opencl_dir/lib/"; then
															
 
																+		LDFLAGS="${SAVED_LDFLAGS} -L$opencl_dir/lib/ "
															
 
																+		AC_SEARCH_LIBS([clCreateKernel],[OpenCL],[found_opencllib=yes],[found_opencllib=no])
															
 
																+	fi
															
 
																+
															
 
																+	# in case OpenCL was explicitely required, but is not available, this is an error
															
 
																+	if test x$enable_opencl = xyes -a x$have_valid_opencl = no; then
															
 
																+		AC_MSG_ERROR([cannot find OpenCL])
															
 
																+	fi
															
 
																+
															
 
																+	# now we enable OpenCL if and only if a proper setup is available
															
 
																+	enable_opencl=$have_valid_opencl
															
 
																+fi
															
 
																+
															
 
																+AC_MSG_CHECKING(whether OpenCL should be used)
															
 
																+AC_MSG_RESULT($enable_opencl)
															
 
																+AC_SUBST(STARPU_USE_OPENCL, $enable_opencl)
															
 
																+AM_CONDITIONAL(STARPU_USE_OPENCL, test x$enable_opencl = xyes)
															
 
																+if test x$enable_opencl = xyes; then
															
 
																+	AC_DEFINE(STARPU_USE_OPENCL, [1], [OpenCL support is activated])
															
 
																+        AC_DEFINE_UNQUOTED(STARPU_OPENCL_DATADIR, "$(eval echo ${datarootdir}/starpu/opencl)", [Path to OpenCL codelets])
															
 
																+        AC_SUBST(STARPU_OPENCL_DATAdir, "$(eval echo ${datarootdir}/starpu/opencl/examples)")
															
 
																+fi
															
 
																+
															
 
																+###############################################################################
															
 
																+#                                                                             #
															
 
																 #                                 Cell settings                               #
															
 
																 #                                                                             #
															
 
																 ###############################################################################
															
@@ -506,13 +573,19 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 
																 # We have one memory node shared by all CPU workers, one node per GPU, and
															
 
																 # currently the Cell driver is using the same memory node as the CPU.
															
 
																-if test x$enable_cuda = xyes; then
															
 
																+maxnodes=1
															
 
																+if test x$enable_cuda = xyes ; then
															
 
																 	# we could have used nmaxcudadev + 1, but this would certainly give an
															
 
																 	# odd number.
															
 
																-	maxnodes=`expr 2 \* $nmaxcudadev`
															
 
																-else
															
 
																-	maxnodes=1	
															
 
																+	maxnodes=`expr $maxnodes + $nmaxcudadev`
															
 
																 fi
															
 
																+if test x$enable_opencl = xyes ; then
															
 
																+	# we could have used nmaxcudadev + 1, but this would certainly give an
															
 
																+	# odd number.
															
 
																+	maxnodes=`expr $maxnodes + $nmaxopencldev`
															
 
																+fi
															
 
																+# todo: set maxnodes to the next power of 2 greater than maxnodes
															
 
																+
															
 
																 AC_MSG_CHECKING(maximum number of memory nodes)
															
 
																 AC_MSG_RESULT($maxnodes)
															
 
																 AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],
															
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -476,6 +476,10 @@ specified with the @code{STARPU_NCPUS} environment variable.
 
																 This is the maximum number of CUDA devices that StarPU can use. This can also be
															
 
																 specified with the @code{STARPU_NCUDA} environment variable.
															
 
																+@item @code{nopencl} (default = -1):
															
 
																+This is the maximum number of OpenCL devices that StarPU can use. This can also be
															
 
																+specified with the @code{STARPU_NOPENCL} environment variable.
															
 
																+
															
 
																 @item @code{nspus} (default = -1):
															
 
																 This is the maximum number of Cell SPUs that StarPU can use. This can also be
															
 
																 specified with the @code{STARPU_NGORDON} environment variable.
															
@@ -600,7 +604,8 @@ an integer between 0 and @code{starpu_get_worker_count() - 1}.
 
																 This function returns the type of worker associated to an identifier (as
															
 
																 returned by the @code{starpu_get_worker_id} function). The returned value
															
 
																 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
															
 
																-core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
															
 
																+core, @code{STARPU_CUDA_WORKER} for a CUDA device,
															
 
																+@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
															
 
																 @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
															
 
																 identifier is unspecified.
															
@@ -716,6 +721,13 @@ be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
 
																 field is ignored if @code{STARPU_CUDA} does not appear in the @code{.where}
															
 
																 field, it must be non-null otherwise.
															
 
																+@item @code{opencl_func} (optionnal):
															
 
																+Is a function pointer to the OpenCL implementation of the codelet. Its
															
 
																+prototype must be:
															
 
																+@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
															
 
																+This pointer is ignored if @code{OPENCL} does not appear in the
															
 
																+@code{.where} field, it must be non-null otherwise.
															
 
																+
															
 
																 @item @code{gordon_func} (optionnal):
															
 
																 This is the index of the Cell SPU implementation within the Gordon library.
															
 
																 TODO
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -431,11 +431,97 @@ check_PROGRAMS +=				\
 
																 examplebin_PROGRAMS +=				\
															
 
																 	incrementer/incrementer
															
 
																-if STARPU_USE_CUDA
															
 
																-incrementer_incrementer_SOURCES =	\
															
 
																-	incrementer/incrementer.c	\
															
 
																-	incrementer/incrementer_kernels.cu
															
 
																-else
															
 
																 incrementer_incrementer_SOURCES =	\
															
 
																 	incrementer/incrementer.c
															
 
																+if STARPU_USE_CUDA
															
 
																+incrementer_incrementer_SOURCES +=	\
															
 
																+	incrementer/incrementer_kernels.cu
															
 
																+endif
															
 
																+if STARPU_USE_OPENCL
															
 
																+incrementer_incrementer_SOURCES +=	\
															
 
																+	incrementer/incrementer_kernels_opencl.c
															
 
																+endif
															
 
																+
															
 
																+nobase_STARPU_OPENCL_DATA_DATA = \
															
 
																+	incrementer/incrementer_kernels_opencl_codelet.cl
															
 
																+
															
 
																+###################
															
 
																+# Stencil example #
															
 
																+###################
															
 
																+
															
 
																+if STARPU_USE_OPENCL
															
 
																+check_PROGRAMS +=		\
															
 
																+	stencil/stencil
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	stencil/stencil
															
 
																+stencil_stencil_SOURCES =	\
															
 
																+	stencil/stencil.c       \
															
 
																+	stencil/stencil_opencl.c
															
 
																+nobase_STARPU_OPENCL_DATA_DATA += \
															
 
																+	stencil/stencil_opencl_codelet.cl \
															
 
																+	stencil/stencil.h
															
 
																+endif
															
 
																+
															
 
																+####################
															
 
																+# Variable example #
															
 
																+####################
															
 
																+
															
 
																+check_PROGRAMS +=				\
															
 
																+	variable/variable
															
 
																+
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	variable/variable
															
 
																+
															
 
																+variable_variable_SOURCES =	\
															
 
																+	variable/variable.c
															
 
																+if STARPU_USE_CUDA
															
 
																+variable_variable_SOURCES +=	\
															
 
																+	variable/variable_kernels.cu
															
 
																+endif
															
 
																+if STARPU_USE_OPENCL
															
 
																+variable_variable_SOURCES +=	\
															
 
																+	variable/variable_kernels_opencl.c
															
 
																+nobase_STARPU_OPENCL_DATA_DATA += \
															
 
																+	variable/variable_kernels_opencl_codelet.cl
															
 
																+endif
															
 
																+
															
 
																+######################
															
 
																+# matVecMult example #
															
 
																+######################
															
 
																+
															
 
																+check_PROGRAMS +=				\
															
 
																+	matvecmult/matvecmult
															
 
																+
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	matvecmult/matvecmult
															
 
																+
															
 
																+matvecmult_matvecmult_SOURCES =	\
															
 
																+	matvecmult/matvecmult.c
															
 
																+
															
 
																+if STARPU_USE_OPENCL
															
 
																+nobase_STARPU_OPENCL_DATA_DATA += \
															
 
																+	matvecmult/matvecmult_kernel.cl
															
 
																+endif
															
 
																+
															
 
																+#################
															
 
																+# block example #
															
 
																+#################
															
 
																+
															
 
																+check_PROGRAMS +=				\
															
 
																+	block/block
															
 
																+
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	block/block
															
 
																+
															
 
																+block_block_SOURCES =	\
															
 
																+	block/block.c
															
 
																+
															
 
																+if STARPU_USE_CUDA
															
 
																+block_block_SOURCES +=				\
															
 
																+	block/block_cuda.cu
															
 
																+endif
															
 
																+
															
 
																+if STARPU_USE_OPENCL
															
 
																+nobase_STARPU_OPENCL_DATA_DATA += \
															
 
																+	block/block_kernel.cl
															
 
																 endif
															
--- a/examples/block/block.c
+++ b/examples/block/block.c
@@ -0,0 +1,167 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <pthread.h>
															
 
																+#include <math.h>
															
 
																+
															
 
																+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
															
 
																+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
															
 
																+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
															
 
																+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
															
 
																+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
															
 
																+        int i;
															
 
																+
															
 
																+        for(i=0 ; i<nx*ny*nz ; i++) block[i] *= *multiplier;
															
 
																+}
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err, n;
															
 
																+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
															
 
																+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
															
 
																+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
															
 
																+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
															
 
																+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
															
 
																+
															
 
																+        id = starpu_get_worker_id();
															
 
																+        devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+        err = starpu_opencl_load_kernel(&kernel, &queue,
															
 
																+                                        "examples/block/block_kernel.cl", "block", devid);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+        n=0;
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
															
 
																+	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
															
 
																+	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
															
 
																+	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
															
 
																+	err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &multiplier);
															
 
																+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+                size_t global=1024;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+        starpu_opencl_release(kernel);
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+#endif
															
 
																+
															
 
																+typedef void (*device_func)(void **, void *);
															
 
																+
															
 
																+int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny, int pnz, float multiplier)
															
 
																+{
															
 
																+	starpu_codelet cl;
															
 
																+	starpu_data_handle block_handle;
															
 
																+        starpu_data_handle multiplier_handle;
															
 
																+        int i, j, k;
															
 
																+
															
 
																+	starpu_register_block_data(&block_handle, 0, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
															
 
																+	starpu_register_variable_data(&multiplier_handle, 0, (uintptr_t)&multiplier, sizeof(float));
															
 
																+
															
 
																+	cl.where = where;
															
 
																+        cl.cuda_func = func;
															
 
																+        cl.cpu_func = func;
															
 
																+        cl.opencl_func = func;
															
 
																+        cl.nbuffers = 2;
															
 
																+        cl.model = NULL;
															
 
																+
															
 
																+        struct starpu_task *task = starpu_task_create();
															
 
																+        task->cl = &cl;
															
 
																+        task->callback_func = NULL;
															
 
																+        task->buffers[0].handle = block_handle;
															
 
																+        task->buffers[0].mode = STARPU_RW;
															
 
																+        task->buffers[1].handle = multiplier_handle;
															
 
																+        task->buffers[1].mode = STARPU_RW;
															
 
																+
															
 
																+        int ret = starpu_submit_task(task);
															
 
																+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																+                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                return 1;
															
 
																+	}
															
 
																+
															
 
																+	starpu_wait_all_tasks();
															
 
																+
															
 
																+	/* update the array in RAM */
															
 
																+        starpu_sync_data_with_mem(block_handle, STARPU_R);
															
 
																+
															
 
																+        for(i=0 ; i<pnx*pny*pnz; i++) {
															
 
																+          fprintf(stderr, "%f ", block[i]);
															
 
																+        }
															
 
																+        fprintf(stderr, "\n");
															
 
																+
															
 
																+        starpu_release_data_from_mem(block_handle);
															
 
																+
															
 
																+        return 0;
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	starpu_codelet cl;
															
 
																+        float *block;
															
 
																+        int i, ret;
															
 
																+        int nx=3;
															
 
																+        int ny=2;
															
 
																+        int nz=4;
															
 
																+        float multiplier=1.0;
															
 
																+
															
 
																+        starpu_init(NULL);
															
 
																+
															
 
																+        block = (float*)malloc(nx*ny*nz*sizeof(float));
															
 
																+        assert(block);
															
 
																+        for(i=0 ; i<nx*ny*nz ; i++) block[i] = i+1;
															
 
																+
															
 
																+        ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
															
 
																+        if (!ret) multiplier *= 1.0;
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        _starpu_opencl_compile_source_to_opencl("examples/block/block_kernel.cl");
															
 
																+        ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
															
 
																+        if (!ret) multiplier *= 2.0;
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+        ret = execute_on(STARPU_CUDA, cuda_codelet, block, nx, ny, nz, 3.0);
															
 
																+        if (!ret) multiplier *= 3.0;
															
 
																+#endif
															
 
																+
															
 
																+        // Check result is correct
															
 
																+        ret=1;
															
 
																+        for(i=0 ; i<nx*ny*nz ; i++) {
															
 
																+          if (block[i] != (i+1) * multiplier) {
															
 
																+            ret=0;
															
 
																+            break;
															
 
																+          }
															
 
																+        }
															
 
																+
															
 
																+        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
															
 
																+        starpu_shutdown();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/block/block_cuda.cu
+++ b/examples/block/block_cuda.cu
@@ -0,0 +1,34 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+static __global__ void cuda_block(float *block, int nx, int ny, int nz, float *multiplier)
															
 
																+{
															
 
																+        int i;
															
 
																+        for(i=0 ; i<nx*ny*nz ; i++) block[i] *= *multiplier;
															
 
																+}
															
 
																+
															
 
																+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+        float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
															
 
																+	int nx = STARPU_GET_BLOCK_NX(descr[0]);
															
 
																+	int ny = STARPU_GET_BLOCK_NY(descr[0]);
															
 
																+	int nz = STARPU_GET_BLOCK_NZ(descr[0]);
															
 
																+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
															
 
																+
															
 
																+        cuda_block<<<1,1>>>(block, nx, ny, nz, multiplier);
															
 
																+}
															
--- a/examples/block/block_kernel.cl
+++ b/examples/block/block_kernel.cl
@@ -0,0 +1,23 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+__kernel void block(__global float *b, int nx, int ny, int nz, __global float *multiplier)
															
 
																+{
															
 
																+        const int i = get_global_id(0);
															
 
																+        if (i < nx*ny*nz) {
															
 
																+                b[i] *= *multiplier;
															
 
																+        }
															
 
																+}
															
--- a/examples/heat/Makefile.in
+++ b/examples/heat/Makefile.in
@@ -14,7 +14,7 @@
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																 #
															
 
																-export PKG_CONFIG_PATH=@STARPU_DIR@
															
 
																+export PKG_CONFIG_PATH=@STARPU_BUILD_DIR@
															
 
																 LIBS+=$$(pkg-config --libs libstarpu)
															
 
																 CFLAGS+=$$(pkg-config --cflags libstarpu)
															
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -23,6 +23,10 @@ static unsigned niter = 50000;
 
																 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+#endif
															
 
																+
															
 
																 extern void cuda_codelet_host(float *tab);
															
 
																 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
@@ -39,20 +43,27 @@ int main(int argc, char **argv)
 
																 	if (argc == 2)
															
 
																 		niter = atoi(argv[1]);
															
 
																-	float float_array[3] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
															
 
																+	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
															
 
																 	starpu_data_handle float_array_handle;
															
 
																 	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
															
 
																-			(uintptr_t)&float_array, 3, sizeof(float));
															
 
																+			(uintptr_t)&float_array, 4, sizeof(float));
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        _starpu_opencl_compile_source_to_opencl("examples/incrementer/incrementer_kernels_opencl_codelet.cl");
															
 
																+#endif
															
 
																 	starpu_codelet cl =
															
 
																 	{
															
 
																 		/* CUBLAS stands for CUDA kernels controlled from the host */
															
 
																-		.where = STARPU_CPU|STARPU_CUDA,
															
 
																+		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 		.cpu_func = cpu_codelet,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		.cuda_func = cuda_codelet,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		.opencl_func = opencl_codelet,
															
 
																+#endif
															
 
																 		.nbuffers = 1
															
 
																 	};
															
@@ -67,7 +78,7 @@ int main(int argc, char **argv)
 
																 		struct starpu_task *task = starpu_task_create();
															
 
																 		task->cl = &cl;
															
 
																-		
															
 
																+
															
 
																 		task->callback_func = NULL;
															
 
																 		task->buffers[0].handle = float_array_handle;
															
@@ -85,15 +96,17 @@ int main(int argc, char **argv)
 
																 	/* update the array in RAM */
															
 
																 	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
															
 
																-	
															
 
																+
															
 
																 	gettimeofday(&end, NULL);
															
 
																-	fprintf(stderr, "array -> %f, %f, %f\n", float_array[0], 
															
 
																-			float_array[1], float_array[2]);
															
 
																-	
															
 
																-	if (float_array[0] != float_array[1] + float_array[2])
															
 
																+	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
															
 
																+                float_array[1], float_array[2], float_array[3]);
															
 
																+
															
 
																+	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
															
 
																+		fprintf(stderr, "Incorrect result\n");
															
 
																 		return 1;
															
 
																-	
															
 
																+	}
															
 
																+
															
 
																 	starpu_release_data_from_mem(float_array_handle);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
															
--- a/examples/incrementer/incrementer_kernels_opencl.c
+++ b/examples/incrementer/incrementer_kernels_opencl.c
@@ -0,0 +1,49 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <CL/cl.h>
															
 
																+
															
 
																+void opencl_codelet(void *descr[], void *_args)
															
 
																+{
															
 
																+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err;
															
 
																+
															
 
																+        id = starpu_get_worker_id();
															
 
																+        devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+	err = starpu_opencl_load_kernel(&kernel, &queue,
															
 
																+                                        "examples/incrementer/incrementer_kernels_opencl_codelet.cl", "incrementer", devid);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global=4;
															
 
																+		size_t local=4;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+	starpu_opencl_release(kernel);
															
 
																+}
															
--- a/examples/incrementer/incrementer_kernels_opencl_codelet.cl
+++ b/examples/incrementer/incrementer_kernels_opencl_codelet.cl
@@ -0,0 +1,23 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+__kernel void incrementer(__global float* input) 
															
 
																+{
															
 
																+	const int i = get_global_id(0);
															
 
																+	if (i == 0 || i == 3)
															
 
																+		input[i] = input[i] + 1.0;
															
 
																+}
															
 
																+
															
--- a/examples/matvecmult/matvecmult.c
+++ b/examples/matvecmult/matvecmult.c
@@ -0,0 +1,193 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <pthread.h>
															
 
																+#include <math.h>
															
 
																+
															
 
																+//static int width=1100;
															
 
																+//static int height=244021;
															
 
																+static int width=20;
															
 
																+static int height=4;
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err, n;
															
 
																+	float *matrix = (float *)STARPU_GET_MATRIX_PTR(descr[0]);
															
 
																+	float *vector = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
															
 
																+	float *mult = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
															
 
																+
															
 
																+        id = starpu_get_worker_id();
															
 
																+        devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+        err = starpu_opencl_load_kernel(&kernel, &queue,
															
 
																+                                        "examples/matvecmult/matvecmult_kernel.cl", "matVecMult", devid);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+        n=0;
															
 
																+        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&width);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&height);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
															
 
																+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+                size_t global=512;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+	starpu_opencl_release(kernel);
															
 
																+}
															
 
																+
															
 
																+void fillArray(float* pfData, int iSize) {
															
 
																+    int i;
															
 
																+    const float fScale = 1.0f / (float)RAND_MAX;
															
 
																+    for (i = 0; i < iSize; ++i) {
															
 
																+            pfData[i] = fScale * rand();
															
 
																+    }
															
 
																+}
															
 
																+
															
 
																+void printArray(float* pfData, int iSize) {
															
 
																+    int i;
															
 
																+    for (i = 0; i < iSize; ++i) {
															
 
																+            fprintf(stderr, "%f ", pfData[i]);
															
 
																+    }
															
 
																+    fprintf(stderr, "\n");
															
 
																+}
															
 
																+
															
 
																+void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
															
 
																+    int i, j;
															
 
																+    for (i = 0; i < height; ++i) {
															
 
																+        double sum = 0;
															
 
																+        for (j = 0; j < width; ++j) {
															
 
																+            double a = matrix[i * width + j];
															
 
																+            double b = vector[j];
															
 
																+            sum += a * b;
															
 
																+        }
															
 
																+        mult[i] = (float)sum;
															
 
																+    }
															
 
																+}
															
 
																+
															
 
																+int compareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon) {
															
 
																+    float error = 0;
															
 
																+    float ref = 0;
															
 
																+    unsigned int i;
															
 
																+
															
 
																+    for(i = 0; i < len; ++i) {
															
 
																+        float diff = reference[i] - data[i];
															
 
																+        error += diff * diff;
															
 
																+        ref += reference[i] * reference[i];
															
 
																+    }
															
 
																+
															
 
																+    float normRef = sqrtf(ref);
															
 
																+    if (fabs(ref) < 1e-7) return 1;
															
 
																+
															
 
																+    float normError = sqrtf(error);
															
 
																+    error = normError / normRef;
															
 
																+
															
 
																+    return error < epsilon ? 0 : 1;
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	starpu_codelet cl;
															
 
																+        starpu_init(NULL);
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        float *matrix, *vector, *mult;
															
 
																+        float *correctResult;
															
 
																+        unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
															
 
																+
															
 
																+	starpu_data_handle matrix_handle, vector_handle, mult_handle;
															
 
																+
															
 
																+        mem_size_matrix = width * height * sizeof(float);
															
 
																+        matrix = (float*)malloc(mem_size_matrix);
															
 
																+        mem_size_vector = width * sizeof(float);
															
 
																+        vector = (float*)malloc(mem_size_vector);
															
 
																+        mem_size_mult = height * sizeof(float);
															
 
																+        mult = (float*)malloc(mem_size_mult);
															
 
																+        correctResult = (float*)malloc(mem_size_mult);
															
 
																+
															
 
																+        assert(matrix);
															
 
																+        assert(vector);
															
 
																+        assert(mult);
															
 
																+        assert(correctResult);
															
 
																+
															
 
																+        fillArray(matrix, width*height);
															
 
																+        fillArray(vector, width);
															
 
																+        fillArray(mult, height);
															
 
																+        matVecMult(matrix, vector, width, height, correctResult);
															
 
																+
															
 
																+	starpu_register_matrix_data(&matrix_handle, 0, (uintptr_t)matrix, width, width, height, sizeof(float));
															
 
																+	starpu_register_vector_data(&vector_handle, 0, (uintptr_t)vector, width, sizeof(float));
															
 
																+	starpu_register_vector_data(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
															
 
																+
															
 
																+        _starpu_opencl_compile_source_to_opencl("examples/matvecmult/matvecmult_kernel.cl");
															
 
																+
															
 
																+	cl.where = STARPU_OPENCL;
															
 
																+        cl.opencl_func = opencl_codelet;
															
 
																+        cl.nbuffers = 3;
															
 
																+        cl.model = NULL;
															
 
																+
															
 
																+        struct starpu_task *task = starpu_task_create();
															
 
																+        task->cl = &cl;
															
 
																+        task->callback_func = NULL;
															
 
																+        task->buffers[0].handle = matrix_handle;
															
 
																+        task->buffers[0].mode = STARPU_R;
															
 
																+        task->buffers[1].handle = vector_handle;
															
 
																+        task->buffers[1].mode = STARPU_R;
															
 
																+        task->buffers[2].handle = mult_handle;
															
 
																+        task->buffers[2].mode = STARPU_RW;
															
 
																+
															
 
																+        int ret = starpu_submit_task(task);
															
 
																+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																+                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                exit(0);
															
 
																+	}
															
 
																+
															
 
																+	starpu_wait_all_tasks();
															
 
																+
															
 
																+	/* update the array in RAM */
															
 
																+        starpu_sync_data_with_mem(matrix_handle, STARPU_R);
															
 
																+        starpu_sync_data_with_mem(vector_handle, STARPU_R);
															
 
																+        starpu_sync_data_with_mem(mult_handle, STARPU_R);
															
 
																+
															
 
																+        int res = compareL2fe(correctResult, mult, height, 1e-6f);
															
 
																+        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
															
 
																+#if 0
															
 
																+        printArray(matrix, width*height);
															
 
																+        printArray(vector, width);
															
 
																+        printArray(mult, height);
															
 
																+#endif
															
 
																+        starpu_release_data_from_mem(matrix_handle);
															
 
																+        starpu_release_data_from_mem(vector_handle);
															
 
																+        starpu_release_data_from_mem(mult_handle);
															
 
																+
															
 
																+        starpu_shutdown();
															
 
																+#endif
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/matvecmult/matvecmult_kernel.cl
+++ b/examples/matvecmult/matvecmult_kernel.cl
@@ -0,0 +1,48 @@
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and
															
 
																+ * proprietary rights in and to this software and related documentation.
															
 
																+ * Any use, reproduction, disclosure, or distribution of this software
															
 
																+ * and related documentation without an express license agreement from
															
 
																+ * NVIDIA Corporation is strictly prohibited.
															
 
																+ *
															
 
																+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
															
 
																+ * associated with this source code for terms and conditions that govern
															
 
																+ * your use of this NVIDIA software.
															
 
																+ *
															
 
																+ */
															
 
																+
															
 
																+/* Matrix-vector multiplication: W = M * V.
															
 
																+ * Device code.
															
 
																+ *
															
 
																+ * This sample implements matrix-vector multiplication.
															
 
																+ * It has been written for clarity of exposition to illustrate various OpenCL
															
 
																+ * programming principles and optimizatoins, not with the goal of providing
															
 
																+ * the most performant generic kernel for matrix-vector multiplication.
															
 
																+ *
															
 
																+ * CUBLAS provides high-performance matrix-vector multiplication on GPU.
															
 
																+ */
															
 
																+
															
 
																+__kernel void matVecMult(
															
 
																+                         __global float* M,
															
 
																+                         __global float* V,
															
 
																+                         int width, int height,
															
 
																+                         __global float* W
															
 
																+                         )
															
 
																+{
															
 
																+        // Row index
															
 
																+        uint y = get_global_id(0);
															
 
																+        if (y < height) {
															
 
																+                // Row pointer
															
 
																+                const __global float* row = M + y * width;
															
 
																+
															
 
																+                // Compute dot product
															
 
																+                float dotProduct = 0;
															
 
																+                for (int x = 0; x < width; ++x)
															
 
																+                        dotProduct += row[x] * V[x];
															
 
																+
															
 
																+                // Write result to global memory
															
 
																+                W[y] = dotProduct;
															
 
																+        }
															
 
																+}
															
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -40,8 +40,8 @@ void parse_args(int argc, char **argv)
 
																 		strcpy(filename_out, argv[2]);
															
 
																 	}
															
 
																 	else {
															
 
																-		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_in_default);
															
 
																-		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_out_default);
															
 
																+		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_in_default);
															
 
																+		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_out_default);
															
 
																 	}
															
 
																 }
															
--- a/examples/spmv/dw_spmv.c
+++ b/examples/spmv/dw_spmv.c
@@ -27,6 +27,60 @@ extern void spmv_kernel_cuda(void *descr[], void *args);
 
																 struct timeval start;
															
 
																 struct timeval end;
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include "starpu_opencl.h"
															
 
																+void spmv_kernel_opencl(void *descr[], void *args)
															
 
																+{
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err, n;
															
 
																+
															
 
																+	uint32_t nnz = STARPU_GET_CSR_NNZ(descr[0]);
															
 
																+	uint32_t nrow = STARPU_GET_CSR_NROW(descr[0]);
															
 
																+	float *nzval = (float *)STARPU_GET_CSR_NZVAL(descr[0]);
															
 
																+	uint32_t *colind = STARPU_GET_CSR_COLIND(descr[0]);
															
 
																+	uint32_t *rowptr = STARPU_GET_CSR_ROWPTR(descr[0]);
															
 
																+	uint32_t firstentry = STARPU_GET_CSR_FIRSTENTRY(descr[0]);
															
 
																+
															
 
																+	float *vecin = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
															
 
																+	uint32_t nx_in = STARPU_GET_VECTOR_NX(descr[1]);
															
 
																+
															
 
																+	float *vecout = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
															
 
																+	uint32_t nx_out = STARPU_GET_VECTOR_NX(descr[2]);
															
 
																+
															
 
																+        id = starpu_get_worker_id();
															
 
																+        devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+        err = starpu_opencl_load_kernel(&kernel, &queue,
															
 
																+                                        "examples/spmv/spmv_opencl.cl", "spvm", devid);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+        n=0;
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nnz);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nrow);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &nzval);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &colind);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &rowptr);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &firstentry);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecin);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_in);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecout);
															
 
																+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_out);
															
 
																+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+                size_t global=1024;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+        starpu_opencl_release(kernel);
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																 unsigned nblocks = 2;
															
 
																 uint32_t size = 4194304;
															
@@ -189,14 +243,28 @@ void call_spmv_codelet_filters(void)
 
																 	starpu_partition_data(sparse_matrix, &csr_f);
															
 
																 	starpu_partition_data(vector_out, &vector_f);
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        {
															
 
																+                int ret = _starpu_opencl_compile_source_to_opencl("examples/spmv/spmv_opencl.cl");
															
 
																+                if (ret)
															
 
																+		{
															
 
																+			fprintf(stderr, "Failed to compile OpenCL codelet\n");
															
 
																+			exit(ret);
															
 
																+		}
															
 
																+        }
															
 
																+#endif
															
 
																+
															
 
																 	starpu_codelet cl;
															
 
																 	memset(&cl, 0, sizeof(starpu_codelet));
															
 
																-	cl.where = STARPU_CPU|STARPU_CUDA;
															
 
																+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
															
 
																 	cl.cpu_func =  cpu_spmv;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	cl.cuda_func = spmv_kernel_cuda;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        cl.opencl_func = spmv_kernel_opencl;
															
 
																+#endif
															
 
																 	cl.nbuffers = 3;
															
 
																 	cl.model = NULL;
															
@@ -206,6 +274,7 @@ void call_spmv_codelet_filters(void)
 
																 	for (part = 0; part < nblocks; part++)
															
 
																 	{
															
 
																 		struct starpu_task *task = starpu_task_create();
															
 
																+                int ret;
															
 
																 		task->callback_func = NULL;
															
@@ -219,7 +288,12 @@ void call_spmv_codelet_filters(void)
 
																 		task->buffers[2].handle = starpu_get_sub_data(vector_out, 1, part);
															
 
																 		task->buffers[2].mode = STARPU_W;
															
 
																-		starpu_submit_task(task);
															
 
																+		ret = starpu_submit_task(task);
															
 
																+		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																+		{
															
 
																+			fprintf(stderr, "No worker may execute this task\n");
															
 
																+			exit(0);
															
 
																+		}
															
 
																 	}
															
 
																 	starpu_wait_all_tasks();
															
--- a/examples/spmv/dw_spmv.h
+++ b/examples/spmv/dw_spmv.h
@@ -24,7 +24,6 @@
 
																 #include <sys/types.h>
															
 
																 #include <pthread.h>
															
 
																 #include <signal.h>
															
 
																-#include <cblas.h>
															
 
																 #include <starpu.h>
															
--- a/examples/spmv/spmv_opencl.cl
+++ b/examples/spmv/spmv_opencl.cl
@@ -0,0 +1,42 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+__kernel void spvm(unsigned nnz, unsigned nrow,
															
 
																+                   __global float* nzval, __global unsigned* colind,
															
 
																+                   __global unsigned* rowptr, unsigned firstentry,
															
 
																+                   __global float *vecin, unsigned nx_in,
															
 
																+                   __global float *vecout, unsigned nx_out)
															
 
																+{
															
 
																+	unsigned row;
															
 
																+	for (row = 0; row < nrow; row++)
															
 
																+	{
															
 
																+		float tmp = 0.0f;
															
 
																+		unsigned index;
															
 
																+
															
 
																+		unsigned firstindex = rowptr[row] - firstentry;
															
 
																+		unsigned lastindex = rowptr[row+1] - firstentry;
															
 
																+
															
 
																+		for (index = firstindex; index < lastindex; index++)
															
 
																+		{
															
 
																+			unsigned col;
															
 
																+
															
 
																+			col = colind[index];
															
 
																+			tmp += nzval[index]*vecin[col];
															
 
																+		}
															
 
																+
															
 
																+		vecout[row] = tmp;
															
 
																+	}
															
 
																+}
															
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -0,0 +1,147 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include "stencil.h"
															
 
																+
															
 
																+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+
															
 
																+static int verbose = 0;
															
 
																+
															
 
																+static
															
 
																+void display_non_zero_values(TYPE *ptr, char *msg)
															
 
																+{
															
 
																+        if(verbose) {
															
 
																+                int x, y, z;
															
 
																+
															
 
																+                for(z = 0; z < DIM; z++)
															
 
																+                        for(y = 0; y < DIM; y++)
															
 
																+                                for(x = 0; x < DIM; x++) {
															
 
																+                                        TYPE r = ptr[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD];
															
 
																+                                        if(r != 0.0)
															
 
																+                                                printf("%s[%d, %d, %d] == %f\n", msg, z, y, x, r);
															
 
																+                                }
															
 
																+        }
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+        TYPE *data;                         // original data set given to device
															
 
																+        TYPE *results;                      // results returned from device
															
 
																+        TYPE C0 = 0.25;
															
 
																+        TYPE C1 = 0.75;
															
 
																+        starpu_data_handle data_handle;
															
 
																+        starpu_data_handle results_handle;
															
 
																+        starpu_data_handle C0_handle;
															
 
																+        starpu_data_handle C1_handle;
															
 
																+
															
 
																+	starpu_init(NULL);
															
 
																+
															
 
																+        // Filter args
															
 
																+        argv++;
															
 
																+        while (argc > 1) {
															
 
																+                if(!strcmp(*argv, "--verbose")) {
															
 
																+                        verbose = 1;
															
 
																+                } else
															
 
																+                        break;
															
 
																+                argc--; argv++;
															
 
																+        }
															
 
																+
															
 
																+        // Fill our data set with random float values
															
 
																+        {
															
 
																+                long i, x, y, z;
															
 
																+
															
 
																+                data = (TYPE *)malloc(SIZE * sizeof(TYPE));
															
 
																+                results = (TYPE *)malloc(SIZE * sizeof(TYPE));
															
 
																+                if (!data || !results) {
															
 
																+                        fprintf(stderr, "Malloc failed!\n");
															
 
																+                        return;
															
 
																+                }
															
 
																+
															
 
																+                for(i = 0; i < SIZE; i++) {
															
 
																+                        data[i] = 0.0;
															
 
																+                        results[i] = 0.0;
															
 
																+                }
															
 
																+
															
 
																+                z = ZBLOCK-1;
															
 
																+                y = YBLOCK-1;
															
 
																+                x = XBLOCK-1;
															
 
																+
															
 
																+                data[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD] = 2.0;
															
 
																+        }
															
 
																+
															
 
																+        display_non_zero_values(data, "data");
															
 
																+
															
 
																+        starpu_register_vector_data(&data_handle, 0 /* home node */,
															
 
																+                                    (uintptr_t)data, SIZE, sizeof(TYPE));
															
 
																+        starpu_register_vector_data(&results_handle, 0 /* home node */,
															
 
																+                                    (uintptr_t)results, SIZE, sizeof(TYPE));
															
 
																+        starpu_register_vector_data(&C0_handle, 0 /* home node */,
															
 
																+                                    (uintptr_t)&C0, 1, sizeof(TYPE));
															
 
																+        starpu_register_vector_data(&C1_handle, 0 /* home node */,
															
 
																+                                    (uintptr_t)&C1, 1, sizeof(TYPE));
															
 
																+
															
 
																+        _starpu_opencl_compile_source_to_opencl("examples/stencil/stencil_opencl_codelet.cl");
															
 
																+
															
 
																+	starpu_codelet cl =
															
 
																+	{
															
 
																+		.where = STARPU_OPENCL,
															
 
																+		.opencl_func = opencl_codelet,
															
 
																+		.nbuffers = 4
															
 
																+	};
															
 
																+
															
 
																+	{
															
 
																+		struct starpu_task *task = starpu_task_create();
															
 
																+
															
 
																+                task->cl = &cl;
															
 
																+                task->callback_func = NULL;
															
 
																+
															
 
																+		task->buffers[0].handle = data_handle;
															
 
																+		task->buffers[0].mode = STARPU_R;
															
 
																+		task->buffers[1].handle = results_handle;
															
 
																+		task->buffers[1].mode = STARPU_W;
															
 
																+		task->buffers[2].handle = C0_handle;
															
 
																+		task->buffers[2].mode = STARPU_R;
															
 
																+		task->buffers[3].handle = C1_handle;
															
 
																+		task->buffers[3].mode = STARPU_R;
															
 
																+
															
 
																+		int ret = starpu_submit_task(task);
															
 
																+		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																+		{
															
 
																+			fprintf(stderr, "No worker may execute this task\n");
															
 
																+			exit(0);
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	starpu_wait_all_tasks();
															
 
																+
															
 
																+	/* update the array in RAM */
															
 
																+	starpu_sync_data_with_mem(data_handle, STARPU_R);
															
 
																+	starpu_sync_data_with_mem(results_handle, STARPU_R);
															
 
																+	starpu_sync_data_with_mem(C0_handle, STARPU_R);
															
 
																+	starpu_sync_data_with_mem(C1_handle, STARPU_R);
															
 
																+
															
 
																+	display_non_zero_values(results, "results");
															
 
																+
															
 
																+	starpu_release_data_from_mem(data_handle);
															
 
																+	starpu_release_data_from_mem(results_handle);
															
 
																+	starpu_release_data_from_mem(C0_handle);
															
 
																+	starpu_release_data_from_mem(C1_handle);
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/stencil/stencil.h
+++ b/examples/stencil/stencil.h
@@ -0,0 +1,43 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __STENCIL_H__
															
 
																+#define __STENCIL_H__
															
 
																+
															
 
																+#define TYPE    float
															
 
																+
															
 
																+#define DIM     (128)
															
 
																+#define BORDER  (1)
															
 
																+
															
 
																+#define PADDING (64 / sizeof(TYPE) - 2*BORDER)
															
 
																+#define FIRST_PAD (PADDING+1)
															
 
																+
															
 
																+#define REALDIM (DIM + 2 * BORDER + PADDING)
															
 
																+
															
 
																+#define SURFACE   ((DIM + 2 * BORDER) * REALDIM)
															
 
																+
															
 
																+#define SIZE    ((DIM + 2 * BORDER) * SURFACE + 1)
															
 
																+
															
 
																+#define XBLOCK  (16)
															
 
																+#define YBLOCK  (16)
															
 
																+#define ZBLOCK  (64)
															
 
																+
															
 
																+#define X_PER_THREAD (1)
															
 
																+#define Y_PER_THREAD (4)
															
 
																+#define Z_PER_THREAD (ZBLOCK)
															
 
																+
															
 
																+#endif
															
 
																+
															
--- a/examples/stencil/stencil_opencl.c
+++ b/examples/stencil/stencil_opencl.c
@@ -0,0 +1,66 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <starpu_util.h>
															
 
																+#include "stencil.h"
															
 
																+
															
 
																+void opencl_codelet(void *descr[], void *_args)
															
 
																+{
															
 
																+	float *data = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+	float *results = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
															
 
																+	float *C0 = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
															
 
																+	float *C1 = (float *)STARPU_GET_VECTOR_PTR(descr[3]);
															
 
																+
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err;
															
 
																+
															
 
																+        id = starpu_get_worker_id();
															
 
																+        devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+        err = starpu_opencl_load_kernel(&kernel, &queue, "examples/stencil/stencil_opencl_codelet.cl", "stencil", devid);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
															
 
																+	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &results);
															
 
																+	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &C0);
															
 
																+	err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &C1);
															
 
																+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global[3];
															
 
																+		size_t local[3];
															
 
																+
															
 
																+                // Execute the kernel over the entire range of our 3d input data set
															
 
																+                local[0] = XBLOCK / X_PER_THREAD;   // threads along the X axis
															
 
																+                local[1] = YBLOCK / Y_PER_THREAD;   // threads along the Y axis
															
 
																+                local[2] = ZBLOCK / Z_PER_THREAD;   // threads along the Z axis
															
 
																+
															
 
																+                global[0] = DIM / X_PER_THREAD;  // virtual size of global X axis
															
 
																+                global[1] = DIM / Y_PER_THREAD;  // virtual size of global Y axis
															
 
																+                global[2] = DIM / Z_PER_THREAD;  // virtual size of global Z axis
															
 
																+
															
 
																+                err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, local, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+	starpu_opencl_release(kernel);
															
 
																+}
															
--- a/examples/variable/variable.c
+++ b/examples/variable/variable.c
@@ -0,0 +1,100 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <pthread.h>
															
 
																+
															
 
																+static unsigned niter = 50000;
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+#endif
															
 
																+
															
 
																+extern void cuda_codelet_host(float *tab);
															
 
																+
															
 
																+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+
															
 
																+	*val += 1.0f;
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+        float foo;
															
 
																+	starpu_data_handle float_array_handle;
															
 
																+	starpu_codelet cl;
															
 
																+
															
 
																+	starpu_init(NULL);
															
 
																+        if (argc == 2) niter = atoi(argv[1]);
															
 
																+        foo = 0.0f;
															
 
																+
															
 
																+	starpu_register_variable_data(&float_array_handle, 0 /* home node */,
															
 
																+                                      (uintptr_t)&foo, sizeof(float));
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        _starpu_opencl_compile_source_to_opencl("examples/variable/variable_kernels_opencl_codelet.cl");
															
 
																+#endif
															
 
																+
															
 
																+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
															
 
																+        cl.cpu_func = cpu_codelet;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+        cl.cuda_func = cuda_codelet;
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        cl.opencl_func = opencl_codelet;
															
 
																+#endif
															
 
																+        cl.nbuffers = 1;
															
 
																+        cl.model = NULL;
															
 
																+
															
 
																+	for (i = 0; i < niter; i++)
															
 
																+	{
															
 
																+		struct starpu_task *task = starpu_task_create();
															
 
																+                int ret;
															
 
																+
															
 
																+		task->cl = &cl;
															
 
																+
															
 
																+		task->callback_func = NULL;
															
 
																+
															
 
																+		task->buffers[0].handle = float_array_handle;
															
 
																+		task->buffers[0].mode = STARPU_RW;
															
 
																+
															
 
																+		ret = starpu_submit_task(task);
															
 
																+		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																+		{
															
 
																+			fprintf(stderr, "No worker may execute this task\n");
															
 
																+			exit(0);
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	starpu_wait_all_tasks();
															
 
																+
															
 
																+	/* update the array in RAM */
															
 
																+	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
															
 
																+
															
 
																+	fprintf(stderr, "variable -> %f\n", foo);
															
 
																+
															
 
																+	starpu_release_data_from_mem(float_array_handle);
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/variable/variable_kernels.cu
+++ b/examples/variable/variable_kernels.cu
@@ -0,0 +1,30 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+static __global__ void cuda_variable(float * tab)
															
 
																+{
															
 
																+	*tab += 2;
															
 
																+	return;
															
 
																+}
															
 
																+
															
 
																+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+
															
 
																+	cuda_variable<<<1,1>>>(val);
															
 
																+}
															
--- a/examples/variable/variable_kernels_opencl.c
+++ b/examples/variable/variable_kernels_opencl.c
@@ -0,0 +1,48 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+
															
 
																+void opencl_codelet(void *descr[], void *_args)
															
 
																+{
															
 
																+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err;
															
 
																+
															
 
																+        id = starpu_get_worker_id();
															
 
																+        devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+	err = starpu_opencl_load_kernel(&kernel, &queue,
															
 
																+                                        "examples/variable/variable_kernels_opencl_codelet.cl", "variable", devid);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global=1;
															
 
																+		size_t local=1;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+	starpu_opencl_release(kernel);
															
 
																+}
															
--- a/examples/variable/variable_kernels_opencl_codelet.cl
+++ b/examples/variable/variable_kernels_opencl_codelet.cl
@@ -0,0 +1,23 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+__kernel void variable(__global float* input) 
															
 
																+{
															
 
																+	const int i = get_global_id(0);
															
 
																+	if (i == 0)
															
 
																+		input[i] = input[i] + 4.0;
															
 
																+}
															
 
																+
															
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -46,14 +46,19 @@ struct starpu_conf {
 
																 	int ncpus;
															
 
																 	/* maximum number of CUDA GPUs (-1 for default) */
															
 
																 	int ncuda;
															
 
																+	/* maximum number of OpenCL GPUs (-1 for default) */
															
 
																+	int nopencl;
															
 
																 	/* maximum number of Cell's SPUs (-1 for default) */
															
 
																 	int nspus;
															
 
																 	unsigned use_explicit_workers_bindid;
															
 
																 	unsigned workers_bindid[STARPU_NMAXWORKERS];
															
 
																-	unsigned use_explicit_workers_gpuid;
															
 
																-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
															
 
																+	unsigned use_explicit_workers_cuda_gpuid;
															
 
																+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
															
 
																+
															
 
																+	unsigned use_explicit_workers_opencl_gpuid;
															
 
																+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
															
 
																 	/* calibrate performance models, if any */
															
 
																 	unsigned calibrate;
															
@@ -75,6 +80,7 @@ unsigned starpu_get_worker_count(void);
 
																 unsigned starpu_get_cpu_worker_count(void);
															
 
																 unsigned starpu_get_cuda_worker_count(void);
															
 
																 unsigned starpu_get_spu_worker_count(void);
															
 
																+unsigned starpu_get_opencl_worker_count(void);
															
 
																 /* Return the identifier of the thread in case this is associated to a worker.
															
 
																  * This will return -1 if this function is called directly from the application
															
@@ -84,6 +90,7 @@ int starpu_get_worker_id(void);
 
																 enum starpu_archtype {
															
 
																 	STARPU_CPU_WORKER, /* CPU core */
															
 
																 	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
															
 
																+	STARPU_OPENCL_WORKER, /* OpenCL CUDA device */
															
 
																 	STARPU_GORDON_WORKER /* Cell SPU */
															
 
																 };
															
@@ -103,6 +110,11 @@ enum starpu_archtype starpu_get_worker_type(int id);
 
																  * behaviour. */
															
 
																 void starpu_get_worker_name(int id, char *dst, size_t maxlen);
															
 
																+/* This functions returns the device id of the worker associated to an
															
 
																+ *  identifier (as returned by the starpu_get_worker_id() function)
															
 
																+ */
															
 
																+int starpu_get_worker_devid(int id);
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -3,16 +3,17 @@
 
																 #undef STARPU_USE_CPU
															
 
																 #undef STARPU_USE_CUDA
															
 
																+#undef STARPU_USE_OPENCL
															
 
																 #undef STARPU_USE_GORDON
															
 
																 #undef STARPU_ATLAS
															
 
																 #undef STARPU_GOTO
															
 
																 #undef STARPU_SYSTEM_BLAS
															
 
																+#undef STARPU_BUILD_DIR
															
 
																+#undef STARPU_OPENCL_DATADIR
															
 
																 #undef STARPU_HAVE_MAGMA
															
 
																-#undef STARPU_DIR
															
 
																-
															
 
																 #undef STARPU_OPENGL_RENDER
															
 
																 #undef STARPU_USE_GTK
															
@@ -37,6 +38,7 @@
 
																 #undef STARPU_NMAXBUFS
															
 
																 #undef STARPU_MAXCUDADEVS
															
 
																+#undef STARPU_MAXOPENCLDEVS
															
 
																 #undef STARPU_HAVE_LIBNUMA
															
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -36,6 +36,8 @@ void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memo
 
																 /* Matrix interface for dense matrices */
															
 
																 typedef struct starpu_matrix_interface_s {
															
 
																 	uintptr_t ptr;
															
 
																+        uintptr_t dev_handle;
															
 
																+        size_t offset;
															
 
																 	uint32_t nx;
															
 
																 	uint32_t ny;
															
 
																 	uint32_t ld;
															
@@ -62,6 +64,8 @@ size_t starpu_get_matrix_elemsize(starpu_data_handle handle);
 
																 /* BLOCK interface for 3D dense blocks */
															
 
																 typedef struct starpu_block_interface_s {
															
 
																 	uintptr_t ptr;
															
 
																+        uintptr_t dev_handle;
															
 
																+        size_t offset;
															
 
																 	uint32_t nx;
															
 
																 	uint32_t ny;
															
 
																 	uint32_t nz;
															
@@ -93,6 +97,8 @@ size_t starpu_get_block_elemsize(starpu_data_handle handle);
 
																 /* vector interface for contiguous (non-strided) buffers */
															
 
																 typedef struct starpu_vector_interface_s {
															
 
																 	uintptr_t ptr;
															
 
																+        uintptr_t dev_handle;
															
 
																+        size_t offset;
															
 
																 	uint32_t nx;
															
 
																 	size_t elemsize;
															
 
																 } starpu_vector_interface_t;
															
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -0,0 +1,192 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __STARPU_OPENCL_H__
															
 
																+#define __STARPU_OPENCL_H__
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <CL/cl.h>
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+extern "C" {
															
 
																+#endif
															
 
																+
															
 
																+#define STARPU_OPENCL_REPORT_ERROR(status)                                     \
															
 
																+	do {                                                            \
															
 
																+		char *errormsg;                                         \
															
 
																+		switch (status) {                                       \
															
 
																+		case CL_SUCCESS:                                        \
															
 
																+			errormsg = "success";                           \
															
 
																+			break;                                          \
															
 
																+		case CL_DEVICE_NOT_FOUND:				\
															
 
																+			errormsg = "Device not found";                  \
															
 
																+			break;                                          \
															
 
																+		case CL_DEVICE_NOT_AVAILABLE:				\
															
 
																+			errormsg = "Device not available";              \
															
 
																+			break;                                          \
															
 
																+		case CL_COMPILER_NOT_AVAILABLE:				\
															
 
																+			errormsg = "Compiler not available";            \
															
 
																+			break;                                          \
															
 
																+		case CL_MEM_OBJECT_ALLOCATION_FAILURE:			\
															
 
																+			errormsg = "Memory object allocation failure";  \
															
 
																+			break;                                          \
															
 
																+		case CL_OUT_OF_RESOURCES:				\
															
 
																+			errormsg = "Out of resources";                  \
															
 
																+			break;                                          \
															
 
																+		case CL_OUT_OF_HOST_MEMORY:				\
															
 
																+			errormsg = "Out of host memory";                \
															
 
																+			break;                                          \
															
 
																+		case CL_PROFILING_INFO_NOT_AVAILABLE:			\
															
 
																+			errormsg = "Profiling info not available";      \
															
 
																+			break;                                          \
															
 
																+		case CL_MEM_COPY_OVERLAP:				\
															
 
																+			errormsg = "Memory copy overlap";               \
															
 
																+			break;                                          \
															
 
																+		case CL_IMAGE_FORMAT_MISMATCH:				\
															
 
																+			errormsg = "Image format mismatch";             \
															
 
																+			break;                                          \
															
 
																+		case CL_IMAGE_FORMAT_NOT_SUPPORTED:			\
															
 
																+			errormsg = "Image format not supported";        \
															
 
																+			break;                                          \
															
 
																+		case CL_BUILD_PROGRAM_FAILURE:				\
															
 
																+			errormsg = "Build program failure";             \
															
 
																+			break;                                          \
															
 
																+		case CL_MAP_FAILURE:				        \
															
 
																+			errormsg = "Map failure";                       \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_VALUE:				        \
															
 
																+			errormsg = "Invalid value";                     \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_DEVICE_TYPE:				\
															
 
																+			errormsg = "Invalid device type";               \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_PLATFORM:				\
															
 
																+			errormsg = "Invalid platform";                  \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_DEVICE:				        \
															
 
																+			errormsg = "Invalid device";                    \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_CONTEXT:				\
															
 
																+			errormsg = "Invalid context";                   \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_QUEUE_PROPERTIES:			\
															
 
																+			errormsg = "Invalid queue properties";          \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_COMMAND_QUEUE:				\
															
 
																+			errormsg = "Invalid command queue";             \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_HOST_PTR:				\
															
 
																+			errormsg = "Invalid host pointer";              \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_MEM_OBJECT:				\
															
 
																+			errormsg = "Invalid memory object";             \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:		\
															
 
																+			errormsg = "Invalid image format descriptor";   \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_IMAGE_SIZE:				\
															
 
																+			errormsg = "Invalid image size";                \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_SAMPLER:				\
															
 
																+			errormsg = "Invalid sampler";                   \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_BINARY:				        \
															
 
																+			errormsg = "Invalid binary";                    \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_BUILD_OPTIONS:				\
															
 
																+			errormsg = "Invalid build options";             \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_PROGRAM:				\
															
 
																+			errormsg = "Invalid program";                   \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_PROGRAM_EXECUTABLE:			\
															
 
																+			errormsg = "Invalid program executable";        \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_KERNEL_NAME:				\
															
 
																+			errormsg = "Invalid kernel name";               \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_KERNEL_DEFINITION:			\
															
 
																+			errormsg = "Invalid kernel definition";         \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_KERNEL:				        \
															
 
																+			errormsg = "Invalid kernel";                    \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_ARG_INDEX:				\
															
 
																+			errormsg = "Invalid argument index";            \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_ARG_VALUE:				\
															
 
																+			errormsg = "Invalid argument value";            \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_ARG_SIZE:				\
															
 
																+			errormsg = "Invalid argument size";             \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_KERNEL_ARGS:				\
															
 
																+			errormsg = "Invalid kernel arguments";          \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_WORK_DIMENSION:				\
															
 
																+			errormsg = "Invalid work dimension";            \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_WORK_GROUP_SIZE:			\
															
 
																+			errormsg = "Invalid work group size";           \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_WORK_ITEM_SIZE:				\
															
 
																+			errormsg = "Invalid work item size";            \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_GLOBAL_OFFSET:				\
															
 
																+			errormsg = "Invalid global offset";             \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_EVENT_WAIT_LIST:			\
															
 
																+			errormsg = "Invalid event wait list";           \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_EVENT:				        \
															
 
																+			errormsg = "Invalid event";                     \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_OPERATION:				\
															
 
																+			errormsg = "Invalid operation";                 \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_GL_OBJECT:				\
															
 
																+			errormsg = "Invalid GL object";                 \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_BUFFER_SIZE:				\
															
 
																+			errormsg = "Invalid buffer size";               \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_MIP_LEVEL:				\
															
 
																+			errormsg = "Invalid MIP level";                 \
															
 
																+			break;                                          \
															
 
																+		case CL_INVALID_GLOBAL_WORK_SIZE:			\
															
 
																+			errormsg = "Invalid global work size";          \
															
 
																+			break;                                          \
															
 
																+		default:						\
															
 
																+			errormsg = "unknown error";			\
															
 
																+			break;			                        \
															
 
																+		}                                                       \
															
 
																+		printf("oops in %s ... <%s> \n", __func__, errormsg);	\
															
 
																+		assert(0);	                                        \
															
 
																+	} while (0)
															
 
																+
															
 
																+void starpu_opencl_get_context(int devid, cl_context *context);
															
 
																+void starpu_opencl_get_device(int devid, cl_device_id *device);
															
 
																+void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
															
 
																+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid);
															
 
																+int starpu_opencl_release(cl_kernel kernel);
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#endif // STARPU_USE_OPENCL
															
 
																+#endif // __STARPU_OPENCL_H__
															
 
																+
															
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -21,6 +21,7 @@
 
																 #include <pthread.h>
															
 
																 #include <starpu.h>
															
 
																 #include <starpu_config.h>
															
 
																+#include <starpu_task.h>
															
 
																 #ifdef __cplusplus
															
 
																 extern "C" {
															
@@ -39,8 +40,9 @@ struct starpu_buffer_descr_t;
 
																 enum starpu_perf_archtype {
															
 
																 	STARPU_CPU_DEFAULT = 0,
															
 
																 	STARPU_CUDA_DEFAULT = 1,
															
 
																-	/* STARPU_CUDA_DEFAULT + devid */
															
 
																-	STARPU_GORDON_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS
															
 
																+	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
															
 
																+	/* STARPU_OPENCL_DEFAULT + devid */
															
 
																+	STARPU_GORDON_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS
															
 
																 };
															
 
																 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -31,6 +31,7 @@
 
																 #define STARPU_CUDA	((1ULL)<<3)
															
 
																 #define STARPU_SPU	((1ULL)<<4)
															
 
																 #define STARPU_GORDON	((1ULL)<<5)
															
 
																+#define STARPU_OPENCL	((1ULL)<<6)
															
 
																 #define STARPU_MIN_PRIO        (-4)
															
 
																 #define STARPU_MAX_PRIO        5
															
@@ -53,6 +54,7 @@ typedef struct starpu_codelet_t {
 
																 	/* the different implementations of the codelet */
															
 
																 	void (*cuda_func)(void **, void *);
															
 
																 	void (*cpu_func)(void **, void *);
															
 
																+	void (*opencl_func)(void **, void *);
															
 
																 	uint8_t gordon_func;
															
 
																 	/* how many buffers do the codelet takes as argument ? */
															
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -157,3 +157,8 @@ endif
 
																 if STARPU_USE_GORDON
															
 
																 libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
															
 
																 endif
															
 
																+
															
 
																+if STARPU_USE_OPENCL
															
 
																+libstarpu_la_SOURCES += drivers/opencl/driver_opencl.c
															
 
																+libstarpu_la_SOURCES += drivers/opencl/driver_opencl_utils.c
															
 
																+endif
															
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -49,6 +49,7 @@ typedef void (*callback)(void *);
 
																 #define STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
															
 
																 #define STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
															
 
																 #define STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
															
 
																+#define STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
															
 
																 /* a job is the internal representation of a task */
															
 
																 LIST_TYPE(starpu_job,
															
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -82,6 +82,9 @@ static double common_task_expected_length(struct starpu_perfmodel_t *model, uint
 
																 			case STARPU_CUDA:
															
 
																 				alpha = STARPU_CUDA_ALPHA;
															
 
																 				break;
															
 
																+  		        case STARPU_OPENCL:
															
 
																+	                        alpha = STARPU_OPENCL_ALPHA;
															
 
																+                                break;
															
 
																 			default:
															
 
																 				/* perhaps there are various worker types on that queue */
															
 
																 				alpha = 1.0; // this value is not significant ...
															
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -106,8 +106,11 @@ double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_
 
																 void _starpu_set_calibrate_flag(unsigned val);
															
 
																 unsigned _starpu_get_calibrate_flag(void);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-int *_starpu_get_gpu_affinity_vector(unsigned gpuid);
															
 
																+#if defined(STARPU_USE_CUDA)
															
 
																+int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
															
 
																 #endif
															
 
																- 
															
 
																+#if defined(STARPU_USE_OPENCL)
															
 
																+int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
															
 
																+#endif
															
 
																+
															
 
																 #endif // __PERFMODEL_H__
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -23,8 +23,10 @@
 
																 #include <unistd.h>
															
 
																 #include <sys/time.h>
															
 
																 #include <stdlib.h>
															
 
																+#include <math.h>
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																 #include <common/config.h>
															
 
																 #include <core/workers.h>
															
 
																 #include <core/perfmodel/perfmodel.h>
															
@@ -34,7 +36,7 @@
 
																 #define MAXCPUS	32
															
 
																-struct cudadev_timing {
															
 
																+struct dev_timing {
															
 
																 	int cpu_id;
															
 
																 	double timing_htod;
															
 
																 	double timing_dtoh;
															
@@ -43,19 +45,29 @@ struct cudadev_timing {
 
																 static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
															
 
																 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
															
 
																 static unsigned was_benchmarked = 0;
															
 
																+static unsigned ncpus = 0;
															
 
																 static int ncuda = 0;
															
 
																-
															
 
																-static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
															
 
																+static int nopencl = 0;
															
 
																 /* Benchmarking the performance of the bus */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
															
 
																 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
															
 
																 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
															
 
																+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
															
 
																+static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
															
 
																+static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
															
 
																+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
															
 
																+#endif
															
 
																-static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																-static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
															
 
																 {
															
 
																 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																 	_starpu_bind_thread_on_cpu(config, cpu);
															
@@ -84,7 +96,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
																 	/* Allocate a buffer on the host */
															
 
																 	unsigned char *h_buffer;
															
 
																-	cudaHostAlloc((void **)&h_buffer, SIZE, 0); 
															
 
																+	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
															
 
																 	assert(h_buffer);
															
 
																 	/* hack to avoid third party libs to rebind threads */
															
@@ -104,7 +116,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
																 	struct timeval start;
															
 
																 	struct timeval end;
															
 
																-	cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
															
 
																+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
															
 
																 	/* Measure upload bandwidth */
															
 
																 	gettimeofday(&start, NULL);
															
@@ -116,7 +128,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
																 	gettimeofday(&end, NULL);
															
 
																 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
															
 
																+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
															
 
																 	/* Measure download bandwidth */
															
 
																 	gettimeofday(&start, NULL);
															
@@ -128,7 +140,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
																 	gettimeofday(&end, NULL);
															
 
																 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh = timing/NITER;
															
 
																+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
															
 
																 	/* Free buffers */
															
 
																 	cudaFreeHost(h_buffer);
															
@@ -137,18 +149,101 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
																 	cudaThreadExit();
															
 
																 }
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
															
 
																+{
															
 
																+        cl_context context;
															
 
																+        cl_command_queue queue;
															
 
																+
															
 
																+        struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+	_starpu_bind_thread_on_cpu(config, cpu);
															
 
																+
															
 
																+	/* Initialize OpenCL context on the device */
															
 
																+        _starpu_opencl_init_context(dev);
															
 
																+        starpu_opencl_get_context(dev, &context);
															
 
																+        starpu_opencl_get_queue(dev, &queue);
															
 
																+
															
 
																+	/* hack to avoid third party libs to rebind threads */
															
 
																+	_starpu_bind_thread_on_cpu(config, cpu);
															
 
																+
															
 
																+	/* Allocate a buffer on the device */
															
 
																+        int err;
															
 
																+	cl_mem d_buffer;
															
 
																+	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, SIZE, NULL, &err);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	/* hack to avoid third party libs to rebind threads */
															
 
																+	_starpu_bind_thread_on_cpu(config, cpu);
															
 
																+
															
 
																+        /* Allocate a buffer on the host */
															
 
																+	unsigned char *h_buffer;
															
 
																+        h_buffer = malloc(SIZE);
															
 
																+	assert(h_buffer);
															
 
																+
															
 
																+	/* hack to avoid third party libs to rebind threads */
															
 
																+	_starpu_bind_thread_on_cpu(config, cpu);
															
 
																+
															
 
																+        /* Fill them */
															
 
																+	memset(h_buffer, 0, SIZE);
															
 
																+        err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	/* hack to avoid third party libs to rebind threads */
															
 
																+	_starpu_bind_thread_on_cpu(config, cpu);
															
 
																+
															
 
																+        unsigned iter;
															
 
																+	double timing;
															
 
																+	struct timeval start;
															
 
																+	struct timeval end;
															
 
																+
															
 
																+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
															
 
																+
															
 
																+	/* Measure upload bandwidth */
															
 
																+	gettimeofday(&start, NULL);
															
 
																+	for (iter = 0; iter < NITER; iter++)
															
 
																+	{
															
 
																+                err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
															
 
																+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+	gettimeofday(&end, NULL);
															
 
																+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																+
															
 
																+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
															
 
																+
															
 
																+	/* Measure download bandwidth */
															
 
																+	gettimeofday(&start, NULL);
															
 
																+	for (iter = 0; iter < NITER; iter++)
															
 
																+	{
															
 
																+                err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
															
 
																+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+	gettimeofday(&end, NULL);
															
 
																+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																+
															
 
																+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
															
 
																+
															
 
																+	/* Free buffers */
															
 
																+	clReleaseMemObject(d_buffer);
															
 
																+	free(h_buffer);
															
 
																+
															
 
																+	/* Uninitiliaze OpenCL context on the device */
															
 
																+        _starpu_opencl_deinit_context(dev);
															
 
																+}
															
 
																+#endif
															
 
																 /* NB: we want to sort the bandwidth by DECREASING order */
															
 
																-static int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
															
 
																+static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
															
 
																 {
															
 
																-	const struct cudadev_timing *left = left_cudadev_timing;
															
 
																-	const struct cudadev_timing *right = right_cudadev_timing;
															
 
																-	
															
 
																+	const struct dev_timing *left = left_dev_timing;
															
 
																+	const struct dev_timing *right = right_dev_timing;
															
 
																+
															
 
																 	double left_dtoh = left->timing_dtoh;
															
 
																 	double left_htod = left->timing_htod;
															
 
																 	double right_dtoh = right->timing_dtoh;
															
 
																 	double right_htod = right->timing_htod;
															
 
																-	
															
 
																+
															
 
																 	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
															
 
																 	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
															
@@ -156,47 +251,55 @@ static int compar_cudadev_timing(const void *left_cudadev_timing, const void *ri
 
																 	return (bandwidth_sum2_left < bandwidth_sum2_right);
															
 
																 }
															
 
																-static void measure_bandwidth_between_host_and_dev(int dev, unsigned ncpus)
															
 
																+static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
															
 
																+                                                   struct dev_timing *dev_timing_per_cpu, char type)
															
 
																 {
															
 
																 	unsigned cpu;
															
 
																 	for (cpu = 0; cpu < ncpus; cpu++)
															
 
																 	{
															
 
																-		measure_bandwidth_between_host_and_dev_on_cpu(dev, cpu);
															
 
																-	}
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+                if (type == 'C')
															
 
																+                        measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                if (type == 'O')
															
 
																+                        measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
															
 
																+#endif
															
 
																+        }
															
 
																 	/* sort the results */
															
 
																-	qsort(cudadev_timing_per_cpu[dev+1], ncpus,
															
 
																-			sizeof(struct cudadev_timing),
															
 
																-			compar_cudadev_timing);
															
 
																-	
															
 
																+	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
															
 
																+              sizeof(struct dev_timing),
															
 
																+			compar_dev_timing);
															
 
																+
															
 
																 #ifdef STARPU_VERBOSE
															
 
																 	for (cpu = 0; cpu < ncpus; cpu++)
															
 
																 	{
															
 
																-		unsigned current_cpu = cudadev_timing_per_cpu[dev+1][cpu].cpu_id;
															
 
																-		double bandwidth_dtoh = cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh;
															
 
																-		double bandwidth_htod = cudadev_timing_per_cpu[dev+1][cpu].timing_htod;
															
 
																+		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
															
 
																+		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
															
 
																+		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
															
 
																 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
															
 
																 		fprintf(stderr, "BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
															
 
																 	}
															
 
																-	unsigned best_cpu = cudadev_timing_per_cpu[dev+1][0].cpu_id;
															
 
																+	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
															
 
																 	fprintf(stderr, "BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
															
 
																 #endif
															
 
																 	/* The results are sorted in a decreasing order, so that the best
															
 
																 	 * measurement is currently the first entry. */
															
 
																-	cudadev_timing_dtoh[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_dtoh;
															
 
																-	cudadev_timing_htod[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_htod;
															
 
																+	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
															
 
																+	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
															
 
																 }
															
 
																-#endif
															
 
																+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
															
 
																-static void benchmark_all_cuda_devices(void)
															
 
																+static void benchmark_all_gpu_devices(void)
															
 
																 {
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	int ret;
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																+	int i, ret;
															
 
																 #ifdef STARPU_VERBOSE
															
 
																 	fprintf(stderr, "Benchmarking the speed of the bus\n");
															
@@ -213,15 +316,24 @@ static void benchmark_all_cuda_devices(void)
 
																 	}
															
 
																 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+	ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																         cudaGetDeviceCount(&ncuda);
															
 
																-	int i;
															
 
																 	for (i = 0; i < ncuda; i++)
															
 
																 	{
															
 
																 		/* measure bandwidth between Host and Device i */
															
 
																-		measure_bandwidth_between_host_and_dev(i, ncpus);
															
 
																+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
															
 
																+	}
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        nopencl = _starpu_opencl_get_device_count();
															
 
																+	for (i = 0; i < nopencl; i++)
															
 
																+	{
															
 
																+		/* measure bandwith between Host and Device i */
															
 
																+		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
															
 
																 	}
															
 
																+#endif
															
 
																 	/* FIXME: use hwloc */
															
 
																 	/* Restore the former affinity */
															
@@ -235,7 +347,7 @@ static void benchmark_all_cuda_devices(void)
 
																 #ifdef STARPU_VERBOSE
															
 
																 	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
															
 
																 #endif
															
 
																-#endif
															
 
																+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
															
 
																 	was_benchmarked = 1;
															
 
																 }
															
@@ -244,7 +356,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 
																 {
															
 
																 	_starpu_get_perf_model_dir_bus(path, maxlen);
															
 
																 	strncat(path, type, maxlen);
															
 
																-	
															
 
																+
															
 
																 	char hostname[32];
															
 
																 	gethostname(hostname, 32);
															
 
																 	strncat(path, ".", maxlen);
															
@@ -270,13 +382,13 @@ static void load_bus_affinity_file_content(void)
 
																 	f = fopen(path, "r");
															
 
																 	STARPU_ASSERT(f);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+	ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+        int gpu;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																         cudaGetDeviceCount(&ncuda);
															
 
																-
															
 
																-	int gpu;
															
 
																 	for (gpu = 0; gpu < ncuda; gpu++)
															
 
																 	{
															
 
																 		int ret;
															
@@ -292,7 +404,7 @@ static void load_bus_affinity_file_content(void)
 
																 		unsigned cpu;
															
 
																 		for (cpu = 0; cpu < ncpus; cpu++)
															
 
																 		{
															
 
																-			ret = fscanf(f, "%d\t", &affinity_matrix[gpu][cpu]);
															
 
																+			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
															
 
																 			STARPU_ASSERT(ret == 1);
															
 
																 		}
															
@@ -300,6 +412,32 @@ static void load_bus_affinity_file_content(void)
 
																 		STARPU_ASSERT(ret == 0);
															
 
																 	}
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        nopencl = _starpu_opencl_get_device_count();
															
 
																+	for (gpu = 0; gpu < nopencl; gpu++)
															
 
																+	{
															
 
																+		int ret;
															
 
																+
															
 
																+		int dummy;
															
 
																+
															
 
																+		starpu_drop_comments(f);
															
 
																+		ret = fscanf(f, "%d\t", &dummy);
															
 
																+		STARPU_ASSERT(ret == 1);
															
 
																+
															
 
																+		STARPU_ASSERT(dummy == gpu);
															
 
																+
															
 
																+		unsigned cpu;
															
 
																+		for (cpu = 0; cpu < ncpus; cpu++)
															
 
																+		{
															
 
																+			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
															
 
																+			STARPU_ASSERT(ret == 1);
															
 
																+		}
															
 
																+
															
 
																+		ret = fscanf(f, "\n");
															
 
																+		STARPU_ASSERT(ret == 0);
															
 
																+	}
															
 
																+#endif
															
 
																+#endif
															
 
																 	fclose(f);
															
 
																 }
															
@@ -320,24 +458,36 @@ static void write_bus_affinity_file_content(void)
 
																 		STARPU_ABORT();
															
 
																 	}
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																 	unsigned cpu;
															
 
																+        int gpu;
															
 
																-	fprintf(f, "# GPU\t");
															
 
																+        fprintf(f, "# GPU\t");
															
 
																 	for (cpu = 0; cpu < ncpus; cpu++)
															
 
																 		fprintf(f, "CPU%d\t", cpu);
															
 
																 	fprintf(f, "\n");
															
 
																-	int gpu;
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																 	for (gpu = 0; gpu < ncuda; gpu++)
															
 
																 	{
															
 
																 		fprintf(f, "%d\t", gpu);
															
 
																 		for (cpu = 0; cpu < ncpus; cpu++)
															
 
																 		{
															
 
																-			fprintf(f, "%d\t", cudadev_timing_per_cpu[gpu+1][cpu].cpu_id);
															
 
																+			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
															
 
																+		}
															
 
																+
															
 
																+		fprintf(f, "\n");
															
 
																+	}
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	for (gpu = 0; gpu < nopencl; gpu++)
															
 
																+	{
															
 
																+		fprintf(f, "%d\t", gpu);
															
 
																+
															
 
																+		for (cpu = 0; cpu < ncpus; cpu++)
															
 
																+		{
															
 
																+                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
															
 
																 		}
															
 
																 		fprintf(f, "\n");
															
@@ -345,12 +495,13 @@ static void write_bus_affinity_file_content(void)
 
																 #endif
															
 
																 	fclose(f);
															
 
																+#endif
															
 
																 }
															
 
																 static void generate_bus_affinity_file(void)
															
 
																 {
															
 
																 	if (!was_benchmarked)
															
 
																-		benchmark_all_cuda_devices();
															
 
																+		benchmark_all_gpu_devices();
															
 
																 	write_bus_affinity_file_content();
															
 
																 }
															
@@ -372,10 +523,19 @@ static void load_bus_affinity_file(void)
 
																 	load_bus_affinity_file_content();
															
 
																 }
															
 
																-int *_starpu_get_gpu_affinity_vector(unsigned gpuid)
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
															
 
																+{
															
 
																+        return cuda_affinity_matrix[gpuid];
															
 
																+}
															
 
																+#endif /* STARPU_USE_CUDA */
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
															
 
																 {
															
 
																-	return affinity_matrix[gpuid];
															
 
																+        return opencl_affinity_matrix[gpuid];
															
 
																 }
															
 
																+#endif /* STARPU_USE_OPENCL */
															
 
																 /*
															
 
																  *	Latency
															
@@ -420,7 +580,7 @@ static void load_bus_latency_file_content(void)
 
																 static void write_bus_latency_file_content(void)
															
 
																 {
															
 
																-	int src, dst;
															
 
																+        int src, dst, maxnode;
															
 
																 	FILE *f;
															
 
																 	STARPU_ASSERT(was_benchmarked);
															
@@ -440,13 +600,17 @@ static void write_bus_latency_file_content(void)
 
																 		fprintf(f, "to %d\t\t", dst);
															
 
																 	fprintf(f, "\n");
															
 
																-	for (src = 0; src < STARPU_MAXNODES; src++)
															
 
																+        maxnode = ncuda;
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        maxnode += nopencl;
															
 
																+#endif
															
 
																+        for (src = 0; src < STARPU_MAXNODES; src++)
															
 
																 	{
															
 
																 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
															
 
																 		{
															
 
																 			double latency;
															
 
																-			if ((src > ncuda) || (dst > ncuda))
															
 
																+			if ((src > maxnode) || (dst > maxnode))
															
 
																 			{
															
 
																 				/* convention */
															
 
																 				latency = -1.0;
															
@@ -471,7 +635,7 @@ static void write_bus_latency_file_content(void)
 
																 static void generate_bus_latency_file(void)
															
 
																 {
															
 
																 	if (!was_benchmarked)
															
 
																-		benchmark_all_cuda_devices();
															
 
																+		benchmark_all_gpu_devices();
															
 
																 	write_bus_latency_file_content();
															
 
																 }
															
@@ -494,7 +658,7 @@ static void load_bus_latency_file(void)
 
																 }
															
 
																-/* 
															
 
																+/*
															
 
																  *	Bandwidth
															
 
																  */
															
 
																 static void get_bandwidth_path(char *path, size_t maxlen)
															
@@ -540,7 +704,7 @@ static void load_bus_bandwidth_file_content(void)
 
																 static void write_bus_bandwidth_file_content(void)
															
 
																 {
															
 
																-	int src, dst;
															
 
																+	int src, dst, maxnode;
															
 
																 	FILE *f;
															
 
																 	STARPU_ASSERT(was_benchmarked);
															
@@ -556,25 +720,38 @@ static void write_bus_bandwidth_file_content(void)
 
																 		fprintf(f, "to %d\t\t", dst);
															
 
																 	fprintf(f, "\n");
															
 
																+        maxnode = ncuda;
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        maxnode += nopencl;
															
 
																+#endif
															
 
																 	for (src = 0; src < STARPU_MAXNODES; src++)
															
 
																 	{
															
 
																 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
															
 
																 		{
															
 
																 			double bandwidth;
															
 
																-			
															
 
																-			if ((src > ncuda) || (dst > ncuda))
															
 
																+
															
 
																+			if ((src > maxnode) || (dst > maxnode))
															
 
																 			{
															
 
																 				bandwidth = -1.0;
															
 
																 			}
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																 			else if (src != dst)
															
 
																 			{
															
 
																-			/* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
															
 
																-				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
															
 
																-				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
															
 
																-				
															
 
																+                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
															
 
																+                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
															
 
																+                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                                if (src > ncuda)
															
 
																+                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
															
 
																+                                if (dst > ncuda)
															
 
																+                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
															
 
																+#endif
															
 
																+
															
 
																 				double timing =time_src_to_ram + time_ram_to_dst;
															
 
																-				
															
 
																+
															
 
																 				bandwidth = 1.0*SIZE/timing;
															
 
																 			}
															
 
																 #endif
															
@@ -582,7 +759,7 @@ static void write_bus_bandwidth_file_content(void)
 
																 			        /* convention */
															
 
																 			        bandwidth = 0.0;
															
 
																 			}
															
 
																-			
															
 
																+
															
 
																 			fprintf(f, "%lf\t", bandwidth);
															
 
																 		}
															
@@ -595,7 +772,7 @@ static void write_bus_bandwidth_file_content(void)
 
																 static void generate_bus_bandwidth_file(void)
															
 
																 {
															
 
																 	if (!was_benchmarked)
															
 
																-		benchmark_all_cuda_devices();
															
 
																+		benchmark_all_gpu_devices();
															
 
																 	write_bus_bandwidth_file_content();
															
 
																 }
															
@@ -618,6 +795,96 @@ static void load_bus_bandwidth_file(void)
 
																 }
															
 
																 /*
															
 
																+ *	Config
															
 
																+ */
															
 
																+static void get_config_path(char *path, size_t maxlen)
															
 
																+{
															
 
																+	get_bus_path("config", path, maxlen);
															
 
																+}
															
 
																+
															
 
																+static void check_bus_config_file()
															
 
																+{
															
 
																+        int res;
															
 
																+        char path[256];
															
 
																+
															
 
																+        get_config_path(path, 256);
															
 
																+        res = access(path, F_OK);
															
 
																+        if (res) {
															
 
																+                starpu_force_bus_sampling();
															
 
																+        }
															
 
																+        else {
															
 
																+                FILE *f;
															
 
																+                int ret, read_cuda, read_opencl;
															
 
																+                unsigned read_cpus;
															
 
																+                struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+
															
 
																+                // Loading configuration from file
															
 
																+                f = fopen(path, "r");
															
 
																+                STARPU_ASSERT(f);
															
 
																+                starpu_drop_comments(f);
															
 
																+                ret = fscanf(f, "%d\t", &read_cpus);
															
 
																+		STARPU_ASSERT(ret == 1);
															
 
																+                starpu_drop_comments(f);
															
 
																+		ret = fscanf(f, "%d\t", &read_cuda);
															
 
																+		STARPU_ASSERT(ret == 1);
															
 
																+                starpu_drop_comments(f);
															
 
																+		ret = fscanf(f, "%d\t", &read_opencl);
															
 
																+		STARPU_ASSERT(ret == 1);
															
 
																+                starpu_drop_comments(f);
															
 
																+                fclose(f);
															
 
																+
															
 
																+                // Loading current configuration
															
 
																+                ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+                cudaGetDeviceCount(&ncuda);
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                nopencl = _starpu_opencl_get_device_count();
															
 
																+#endif
															
 
																+
															
 
																+                // Checking if both configurations match
															
 
																+                if (read_cpus != ncpus) {
															
 
																+                        fprintf(stderr, "Current configuration does not match the performance model (CPUS: %d != %d)\n", read_cpus, ncpus);
															
 
																+                        starpu_force_bus_sampling();
															
 
																+                }
															
 
																+                else if (read_cuda != ncuda) {
															
 
																+                        fprintf(stderr, "Current configuration does not match the performance model (CUDA: %d != %d)\n", read_cuda, ncuda);
															
 
																+                        starpu_force_bus_sampling();
															
 
																+                }
															
 
																+                else if (read_opencl != nopencl) {
															
 
																+                        fprintf(stderr, "Current configuration does not match the performance model (OpenCL: %d != %d)\n", read_opencl, nopencl);
															
 
																+                        starpu_force_bus_sampling();
															
 
																+                }
															
 
																+        }
															
 
																+}
															
 
																+
															
 
																+static void write_bus_config_file_content(void)
															
 
																+{
															
 
																+	FILE *f;
															
 
																+	char path[256];
															
 
																+
															
 
																+	STARPU_ASSERT(was_benchmarked);
															
 
																+        get_config_path(path, 256);
															
 
																+        f = fopen(path, "w+");
															
 
																+	STARPU_ASSERT(f);
															
 
																+
															
 
																+        fprintf(f, "# Current configuration\n");
															
 
																+        fprintf(f, "%d # Number of CPUs\n", ncpus);
															
 
																+        fprintf(f, "%d # Number of CUDA devices\n", ncuda);
															
 
																+        fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
															
 
																+
															
 
																+        fclose(f);
															
 
																+}
															
 
																+
															
 
																+static void generate_bus_config_file()
															
 
																+{
															
 
																+	if (!was_benchmarked)
															
 
																+		benchmark_all_gpu_devices();
															
 
																+
															
 
																+	write_bus_config_file_content();
															
 
																+}
															
 
																+
															
 
																+/*
															
 
																  *	Generic
															
 
																  */
															
@@ -628,12 +895,14 @@ void starpu_force_bus_sampling(void)
 
																 	generate_bus_affinity_file();
															
 
																 	generate_bus_latency_file();
															
 
																 	generate_bus_bandwidth_file();
															
 
																+        generate_bus_config_file();
															
 
																 }
															
 
																 void _starpu_load_bus_performance_files(void)
															
 
																 {
															
 
																 	_starpu_create_sampling_directory_if_needed();
															
 
																+        check_bus_config_file();
															
 
																 	load_bus_affinity_file();
															
 
																 	load_bus_latency_file();
															
 
																 	load_bus_bandwidth_file();
															
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -21,6 +21,7 @@
 
																 #include <core/debug.h>
															
 
																 #include <core/topology.h>
															
 
																 #include <drivers/cuda/driver_cuda.h>
															
 
																+#include <common/hash.h>
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 #include <hwloc.h>
															
@@ -37,8 +38,16 @@ static unsigned topology_is_initialized = 0;
 
																 static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config);
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																+#  ifdef STARPU_USE_CUDA
															
 
																+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config);
															
 
																+static struct starpu_htbl32_node_s *devices_using_cuda = NULL;
															
 
																+#  endif
															
 
																+#  ifdef STARPU_USE_OPENCL
															
 
																+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config);
															
 
																+#  endif
															
 
																+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
															
 
																+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus);
															
 
																 static unsigned may_bind_automatically = 0;
															
 
																 #endif
															
@@ -47,12 +56,70 @@ static unsigned may_bind_automatically = 0;
 
																  */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config)
															
 
																+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config)
															
 
																+{
															
 
																+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_cuda_gpuid,
															
 
																+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_cuda_gpuid,
															
 
																+                                         &(config->current_cuda_gpuid), (int *)config->workers_cuda_gpuid, "STARPU_WORKERS_CUDAID",
															
 
																+                                         config->nhwcudagpus);
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config)
															
 
																+{
															
 
																+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_opencl_gpuid,
															
 
																+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_opencl_gpuid,
															
 
																+                                         &(config->current_opencl_gpuid), (int *)config->workers_opencl_gpuid, "STARPU_WORKERS_OPENCLID",
															
 
																+                                         config->nhwopenclgpus);
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+        // Detect devices which are already used with CUDA
															
 
																+        {
															
 
																+                unsigned tmp[STARPU_NMAXWORKERS];
															
 
																+                unsigned nb=0;
															
 
																+                int i;
															
 
																+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
															
 
																+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
															
 
																+                        if (_starpu_htbl_search_32(devices_using_cuda, key) == NULL) {
															
 
																+                                tmp[nb] = config->workers_opencl_gpuid[i];
															
 
																+                                nb++;
															
 
																+                        }
															
 
																+                }
															
 
																+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
															
 
																+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
															
 
																+        }
															
 
																+#endif /* STARPU_USE_CUDA */
															
 
																+        {
															
 
																+                // Detect identical devices
															
 
																+                struct starpu_htbl32_node_s *devices_already_used = NULL;
															
 
																+                unsigned tmp[STARPU_NMAXWORKERS];
															
 
																+                unsigned nb=0;
															
 
																+                int i;
															
 
																+
															
 
																+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
															
 
																+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
															
 
																+                        if (_starpu_htbl_search_32(devices_already_used, key) == NULL) {
															
 
																+                                _starpu_htbl_insert_32(&devices_already_used, key, config);
															
 
																+                                tmp[nb] = config->workers_opencl_gpuid[i];
															
 
																+                                nb ++;
															
 
																+                        }
															
 
																+                }
															
 
																+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
															
 
																+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
															
 
																+        }
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
															
 
																+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus)
															
 
																 {
															
 
																 	char *strval;
															
 
																 	unsigned i;
															
 
																-	config->current_gpuid = 0;
															
 
																+	*current = 0;
															
 
																 	/* conf->workers_bindid indicates the successive cpu identifier that
															
 
																 	 * should be used to bind the workers. It should be either filled
															
@@ -62,14 +129,14 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
																 	 * cpus. */
															
 
																 	/* what do we use, explicit value, env. variable, or round-robin ? */
															
 
																-	if (config->user_conf && config->user_conf->use_explicit_workers_gpuid)
															
 
																+	if (use_explicit_workers_gpuid)
															
 
																 	{
															
 
																 		/* we use the explicit value from the user */
															
 
																-		memcpy(config->workers_gpuid,
															
 
																-			config->user_conf->workers_gpuid,
															
 
																-			STARPU_NMAXWORKERS*sizeof(unsigned));
															
 
																+		memcpy(workers_gpuid,
															
 
																+                       explicit_workers_gpuid,
															
 
																+                       STARPU_NMAXWORKERS*sizeof(unsigned));
															
 
																 	}
															
 
																-	else if ((strval = getenv("STARPU_WORKERS_CUDAID")))
															
 
																+	else if ((strval = getenv(varname)))
															
 
																 	{
															
 
																 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
															
 
																 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
															
@@ -86,7 +153,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
																 				val = strtol(strval, &endptr, 10);
															
 
																 				if (endptr != strval)
															
 
																 				{
															
 
																-					config->workers_gpuid[i] = (unsigned)val;
															
 
																+					workers_gpuid[i] = (unsigned)val;
															
 
																 					strval = endptr;
															
 
																 				}
															
 
																 				else {
															
@@ -97,11 +164,11 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
																 					/* there is no more values in the string */
															
 
																 					wrap = 1;
															
 
																-					config->workers_gpuid[i] = config->workers_gpuid[0];
															
 
																+					workers_gpuid[i] = workers_gpuid[0];
															
 
																 				}
															
 
																 			}
															
 
																 			else {
															
 
																-				config->workers_gpuid[i] = config->workers_gpuid[i % number_of_entries];
															
 
																+				workers_gpuid[i] = workers_gpuid[i % number_of_entries];
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
@@ -109,7 +176,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
																 	{
															
 
																 		/* by default, we take a round robin policy */
															
 
																 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
															
 
																-			config->workers_gpuid[i] = (unsigned)i;
															
 
																+			workers_gpuid[i] = (unsigned)(i % nhwgpus);
															
 
																 		/* StarPU can use sampling techniques to bind threads correctly */
															
 
																 		may_bind_automatically = 1;
															
@@ -117,11 +184,18 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 
																 }
															
 
																 #endif
															
 
																-static inline int _starpu_get_next_gpuid(struct starpu_machine_config_s *config)
															
 
																+static inline int _starpu_get_next_cuda_gpuid(struct starpu_machine_config_s *config)
															
 
																 {
															
 
																-	unsigned i = ((config->current_gpuid++) % config->ncudagpus);
															
 
																+	unsigned i = ((config->current_cuda_gpuid++) % config->ncudagpus);
															
 
																-	return (int)config->workers_gpuid[i];
															
 
																+	return (int)config->workers_cuda_gpuid[i];
															
 
																+}
															
 
																+
															
 
																+static inline int _starpu_get_next_opencl_gpuid(struct starpu_machine_config_s *config)
															
 
																+{
															
 
																+	unsigned i = ((config->current_opencl_gpuid++) % config->nopenclgpus);
															
 
																+
															
 
																+	return (int)config->workers_opencl_gpuid[i];
															
 
																 }
															
 
																 static void _starpu_init_topology(struct starpu_machine_config_s *config)
															
@@ -152,7 +226,14 @@ static void _starpu_init_topology(struct starpu_machine_config_s *config)
 
																 #warning no way to know number of cores, assuming 1
															
 
																 		config->nhwcpus = 1;
															
 
																 #endif
															
 
																-	
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+                config->nhwcudagpus = _starpu_get_cuda_device_count();
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                config->nhwopenclgpus = _starpu_opencl_get_device_count();
															
 
																+#endif
															
 
																+
															
 
																 		topology_is_initialized = 1;
															
 
																 	}
															
 
																 }
															
@@ -208,22 +289,79 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 
																 	if (config->ncudagpus > 0)
															
 
																 		use_accelerator = 1;
															
 
																-	_starpu_initialize_workers_gpuid(config);
															
 
																+	_starpu_initialize_workers_cuda_gpuid(config);
															
 
																 	unsigned cudagpu;
															
 
																 	for (cudagpu = 0; cudagpu < config->ncudagpus; cudagpu++)
															
 
																 	{
															
 
																 		config->workers[config->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
															
 
																-		int devid = _starpu_get_next_gpuid(config);
															
 
																+		int devid = _starpu_get_next_cuda_gpuid(config);
															
 
																 		enum starpu_perf_archtype arch = STARPU_CUDA_DEFAULT + devid;
															
 
																 		config->workers[config->nworkers + cudagpu].devid = devid;
															
 
																 		config->workers[config->nworkers + cudagpu].perf_arch = arch; 
															
 
																 		config->workers[config->nworkers + cudagpu].worker_mask = STARPU_CUDA;
															
 
																 		config->worker_mask |= STARPU_CUDA;
															
 
																-	}
															
 
																+
															
 
																+                uint32_t key = _starpu_crc32_be(devid, 0);
															
 
																+                _starpu_htbl_insert_32(&devices_using_cuda, key, config);
															
 
																+        }
															
 
																 	config->nworkers += config->ncudagpus;
															
 
																 #endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	if (user_conf && (user_conf->nopencl == 0))
															
 
																+	{
															
 
																+		/* the user explicitely disabled OpenCL */
															
 
																+		config->nopenclgpus = 0;
															
 
																+	}
															
 
																+	else {
															
 
																+		/* we need to initialize OpenCL early to count the number of devices */
															
 
																+		_starpu_opencl_init();
															
 
																+
															
 
																+		if (user_conf && (user_conf->nopencl != -1))
															
 
																+		{
															
 
																+			explicitval = user_conf->nopencl;
															
 
																+		}
															
 
																+		else {
															
 
																+			explicitval = starpu_get_env_number("STARPU_NOPENCL");
															
 
																+		}
															
 
																+
															
 
																+		if (explicitval < 0) {
															
 
																+			config->nopenclgpus =
															
 
																+				STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
															
 
																+		} else {
															
 
																+			/* use the specified value */
															
 
																+			config->nopenclgpus = (unsigned)explicitval;
															
 
																+			STARPU_ASSERT(config->nopenclgpus <= STARPU_MAXOPENCLDEVS);
															
 
																+		}
															
 
																+		STARPU_ASSERT(config->nopenclgpus + config->nworkers <= STARPU_NMAXWORKERS);
															
 
																+	}
															
 
																+
															
 
																+	if (config->nopenclgpus > 0)
															
 
																+		use_accelerator = 1;
															
 
																+	// TODO: use_accelerator pour les OpenCL?
															
 
																+
															
 
																+	_starpu_initialize_workers_opencl_gpuid(config);
															
 
																+
															
 
																+	unsigned openclgpu;
															
 
																+	for (openclgpu = 0; openclgpu < config->nopenclgpus; openclgpu++)
															
 
																+	{
															
 
																+		int devid = _starpu_get_next_opencl_gpuid(config);
															
 
																+                if (devid == -1) { // There is no more devices left
															
 
																+                  config->nopenclgpus = openclgpu;
															
 
																+                  break;
															
 
																+                }
															
 
																+		config->workers[config->nworkers + openclgpu].arch = STARPU_OPENCL_WORKER;
															
 
																+		enum starpu_perf_archtype arch = STARPU_OPENCL_DEFAULT + devid;
															
 
																+		config->workers[config->nworkers + openclgpu].devid = devid;
															
 
																+		config->workers[config->nworkers + openclgpu].perf_arch = arch; 
															
 
																+		config->workers[config->nworkers + openclgpu].worker_mask = STARPU_OPENCL;
															
 
																+		config->worker_mask |= STARPU_OPENCL;
															
 
																+	}
															
 
																+
															
 
																+	config->nworkers += config->nopenclgpus;
															
 
																+#endif
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 	if (user_conf && (user_conf->ncuda != -1)) {
															
@@ -500,13 +638,27 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
																 				if (may_bind_automatically)
															
 
																 				{
															
 
																 					/* StarPU is allowed to bind threads automatically */
															
 
																-					preferred_binding = _starpu_get_gpu_affinity_vector(workerarg->devid);
															
 
																+					preferred_binding = _starpu_get_cuda_affinity_vector(workerarg->devid);
															
 
																 					npreferred = config->nhwcpus;
															
 
																 				}
															
 
																 				is_a_set_of_accelerators = 0;
															
 
																 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
															
 
																 				break;
															
 
																 #endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		        case STARPU_OPENCL_WORKER:
															
 
																+				if (may_bind_automatically)
															
 
																+				{
															
 
																+					/* StarPU is allowed to bind threads automatically */
															
 
																+					preferred_binding = _starpu_get_opencl_affinity_vector(workerarg->devid);
															
 
																+					npreferred = config->nhwcpus;
															
 
																+				}
															
 
																+				is_a_set_of_accelerators = 0;
															
 
																+				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
															
 
																+				break;
															
 
																+#endif
															
 
																+
															
 
																 			default:
															
 
																 				STARPU_ABORT();
															
 
																 		}
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -59,6 +59,11 @@ inline uint32_t _starpu_may_submit_cpu_task(void)
 
																 	return (STARPU_CPU & config.worker_mask);
															
 
																 }
															
 
																+inline uint32_t _starpu_may_submit_opencl_task(void)
															
 
																+{
															
 
																+	return (STARPU_OPENCL & config.worker_mask);
															
 
																+}
															
 
																+
															
 
																 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where)
															
 
																 {
															
 
																 	return (where & config.workers[workerid].worker_mask);
															
@@ -90,6 +95,9 @@ static void _starpu_init_worker_queue(struct starpu_worker_s *workerarg)
 
																 		case STARPU_CUDA_WORKER:
															
 
																 			jobq->alpha = STARPU_CUDA_ALPHA;
															
 
																 			break;
															
 
																+		case STARPU_OPENCL_WORKER:
															
 
																+			jobq->alpha = STARPU_OPENCL_ALPHA;
															
 
																+			break;
															
 
																 		case STARPU_GORDON_WORKER:
															
 
																 			jobq->alpha = STARPU_GORDON_ALPHA;
															
 
																 			break;
															
@@ -151,6 +159,15 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 
																 				break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+			case STARPU_OPENCL_WORKER:
															
 
																+				workerarg->set = NULL;
															
 
																+				workerarg->worker_is_initialized = 0;
															
 
																+				pthread_create(&workerarg->worker_thread, 
															
 
																+						NULL, _starpu_opencl_worker, workerarg);
															
 
																+
															
 
																+				break;
															
 
																+#endif
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 			case STARPU_GORDON_WORKER:
															
 
																 				/* we will only launch gordon once, but it will handle 
															
@@ -192,6 +209,7 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 
																 		switch (workerarg->arch) {
															
 
																 			case STARPU_CPU_WORKER:
															
 
																 			case STARPU_CUDA_WORKER:
															
 
																+			case STARPU_OPENCL_WORKER:			  
															
 
																 				PTHREAD_MUTEX_LOCK(&workerarg->mutex);
															
 
																 				while (!workerarg->worker_is_initialized)
															
 
																 					PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
															
@@ -523,6 +541,11 @@ unsigned starpu_get_cuda_worker_count(void)
 
																 	return config.ncudagpus;
															
 
																 }
															
 
																+unsigned starpu_get_opencl_worker_count(void)
															
 
																+{
															
 
																+	return config.nopenclgpus;
															
 
																+}
															
 
																+
															
 
																 unsigned starpu_get_spu_worker_count(void)
															
 
																 {
															
 
																 	return config.ngordon_spus;
															
@@ -549,6 +572,11 @@ int starpu_get_worker_id(void)
 
																 	}
															
 
																 }
															
 
																+int starpu_get_worker_devid(int id)
															
 
																+{
															
 
																+	return config.workers[id].devid;
															
 
																+}
															
 
																+
															
 
																 struct starpu_worker_s *_starpu_get_worker_struct(unsigned id)
															
 
																 {
															
 
																 	return &config.workers[id];
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -42,6 +42,10 @@
 
																 #include <drivers/cuda/driver_cuda.h>
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																+
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 #include <drivers/gordon/driver_gordon.h>
															
 
																 #endif
															
@@ -52,6 +56,7 @@
 
																 #define STARPU_CPU_ALPHA	1.0f
															
 
																 #define STARPU_CUDA_ALPHA	13.33f
															
 
																+#define STARPU_OPENCL_ALPHA	12.22f
															
 
																 #define STARPU_GORDON_ALPHA	6.0f /* XXX this is a random value ... */
															
 
																 #ifdef STARPU_DATA_STATS
															
@@ -105,18 +110,25 @@ struct starpu_machine_config_s {
 
																 #endif
															
 
																 	unsigned nhwcpus;
															
 
																+        unsigned nhwcudagpus;
															
 
																+        unsigned nhwopenclgpus;
															
 
																 	unsigned ncpus;
															
 
																 	unsigned ncudagpus;
															
 
																+	unsigned nopenclgpus;
															
 
																 	unsigned ngordon_spus;
															
 
																 	/* Where to bind workers ? */
															
 
																 	int current_bindid;
															
 
																 	unsigned workers_bindid[STARPU_NMAXWORKERS];
															
 
																-	/* Which GPU(s) do we use ? */
															
 
																-	int current_gpuid;
															
 
																-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
															
 
																+	/* Which GPU(s) do we use for CUDA ? */
															
 
																+	int current_cuda_gpuid;
															
 
																+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
															
 
																+
															
 
																+	/* Which GPU(s) do we use for OpenCL ? */
															
 
																+	int current_opencl_gpuid;
															
 
																+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
															
 
																 	struct starpu_worker_s workers[STARPU_NMAXWORKERS];
															
 
																 	uint32_t worker_mask;
															
@@ -138,6 +150,7 @@ unsigned _starpu_machine_is_running(void);
 
																 inline uint32_t _starpu_worker_exists(uint32_t task_mask);
															
 
																 inline uint32_t _starpu_may_submit_cuda_task(void);
															
 
																 inline uint32_t _starpu_may_submit_cpu_task(void);
															
 
																+inline uint32_t _starpu_may_submit_opencl_task(void);
															
 
																 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where);
															
 
																 unsigned _starpu_worker_can_block(unsigned memnode);
															
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -25,8 +25,8 @@ uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_n
 
																 	/* in case one of the node is a GPU, it needs to perform the transfer,
															
 
																 	 * if both of them are GPU, it's a bit more complicated (TODO !) */
															
 
																-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
															
 
																-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM);
															
 
																+	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
															
 
																+	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
															
 
																 	/* we do not handle GPU->GPU transfers yet ! */
															
 
																 	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
															
@@ -77,6 +77,8 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 
																 			 * 	other should be ok */
															
 
																 			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
															
 
																 				break;
															
 
																+			if (_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
															
 
																+				break;
															
 
																 			/* XXX do a better algorithm to distribute the memory copies */
															
 
																 			/* TODO : use the "requesting_node" as an argument to do so */
															
@@ -181,8 +183,8 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, uint32_t requesting_no
 
																 			STARPU_ASSERT(src_node != requesting_node);
															
 
																 		}
															
 
																-		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
															
 
																-		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM);
															
 
																+		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
															
 
																+		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(requesting_node) == STARPU_OPENCL_RAM);
															
 
																 		/* we have to perform 2 successive requests for GPU->GPU transfers */
															
 
																 		if (read && (src_is_a_gpu && dst_is_a_gpu)) {
															
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -135,6 +135,28 @@ cudaStream_t *stream;
 
																 				}
															
 
																 				break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+    		        case STARPU_OPENCL_RAM:
															
 
																+				/* OpenCL -> RAM */
															
 
																+				if (_starpu_get_local_memory_node() == src_node)
															
 
																+				{
															
 
																+					STARPU_ASSERT(copy_methods->opencl_to_ram);
															
 
																+					if (!req || !copy_methods->opencl_to_ram_async)
															
 
																+					{
															
 
																+						/* this is not associated to a request so it's synchronous */
															
 
																+                                                copy_methods->opencl_to_ram(handle, src_node, dst_node);
															
 
																+                                        }
															
 
																+                                        else {
															
 
																+                                                ret = copy_methods->opencl_to_ram_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
															
 
																+                                        }
															
 
																+				}
															
 
																+				else
															
 
																+				{
															
 
																+					/* we should not have a blocking call ! */
															
 
																+					STARPU_ABORT();
															
 
																+				}
															
 
																+				break;
															
 
																+#endif
															
 
																 			case STARPU_SPU_LS:
															
 
																 				STARPU_ABORT(); // TODO
															
 
																 				break;
															
@@ -180,6 +202,34 @@ cudaStream_t *stream;
 
																 		}
															
 
																 		break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	case STARPU_OPENCL_RAM:
															
 
																+		switch (src_kind) {
															
 
																+		        case STARPU_RAM:
															
 
																+				/* STARPU_RAM -> STARPU_OPENCL_RAM */
															
 
																+				STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
															
 
																+				STARPU_ASSERT(copy_methods->ram_to_opencl);
															
 
																+				if (!req || !copy_methods->ram_to_opencl_async)
															
 
																+				{
															
 
																+					/* this is not associated to a request so it's synchronous */
															
 
																+					copy_methods->ram_to_opencl(handle, src_node, dst_node);
															
 
																+				}
															
 
																+				else {
															
 
																+                                        ret = copy_methods->ram_to_opencl_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
															
 
																+				}
															
 
																+				break;
															
 
																+			case STARPU_CUDA_RAM:
															
 
																+			case STARPU_OPENCL_RAM:
															
 
																+			case STARPU_SPU_LS:
															
 
																+				STARPU_ABORT(); // TODO 
															
 
																+				break;
															
 
																+			case STARPU_UNUSED:
															
 
																+			default:
															
 
																+				STARPU_ABORT();
															
 
																+				break;
															
 
																+		}
															
 
																+		break;
															
 
																+#endif
															
 
																 	case STARPU_SPU_LS:
															
 
																 		STARPU_ABORT(); // TODO
															
 
																 		break;
															
@@ -275,6 +325,12 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                case STARPU_OPENCL_RAM:
															
 
																+                        fprintf(stderr, "not implemented yet\n");
															
 
																+			STARPU_ABORT();
															
 
																+                        break;
															
 
																+#endif
															
 
																 		case STARPU_RAM:
															
 
																 		default:
															
 
																 			STARPU_ABORT();
															
@@ -301,6 +357,17 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                case STARPU_OPENCL_RAM:
															
 
																+                        {
															
 
																+                                cl_int event_status;
															
 
																+                                cl_event opencl_event = (*async_channel).opencl_event;
															
 
																+                                if (opencl_event == NULL) STARPU_ABORT();
															
 
																+                                clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
															
 
																+                                success = (event_status == CL_COMPLETE);
															
 
																+                                break;
															
 
																+                        }
															
 
																+#endif
															
 
																 		case STARPU_RAM:
															
 
																 		default:
															
 
																 			STARPU_ABORT();
															
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -28,6 +28,10 @@
 
																 #include <cublas.h>
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <CL/cl.h>
															
 
																+#endif
															
 
																+
															
 
																 struct starpu_data_request_s;
															
 
																 /* this is a structure that can be queried to see whether an asynchronous
															
@@ -37,24 +41,36 @@ typedef union {
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	cudaEvent_t cuda_event;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        cl_event opencl_event;
															
 
																+#endif
															
 
																 } starpu_async_channel;
															
 
																 struct starpu_copy_data_methods_s {
															
 
																 	/* src type is ram */
															
 
																 	int (*ram_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	int (*ram_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	int (*ram_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	int (*ram_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	/* src type is cuda */
															
 
																 	int (*cuda_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	int (*cuda_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	int (*cuda_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	int (*cuda_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	/* src type is spu */
															
 
																 	int (*spu_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	int (*spu_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	int (*spu_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																 	int (*spu_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	/* src type is opencl */
															
 
																+	int (*opencl_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	int (*opencl_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	int (*opencl_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+	int (*opencl_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
															
 
																+
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	/* for asynchronous CUDA transfers */
															
 
																 	int (*ram_to_cuda_async)(starpu_data_handle handle, uint32_t src,
															
@@ -64,6 +80,13 @@ struct starpu_copy_data_methods_s {
 
																 	int (*cuda_to_cuda_async)(starpu_data_handle handle, uint32_t src,
															
 
																 					uint32_t dst, cudaStream_t *stream);
															
 
																 #endif
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	/* for asynchronous OpenCL transfers */
															
 
																+        int (*ram_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
															
 
																+	int (*opencl_to_ram_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
															
 
																+	int (*opencl_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
															
 
																+#endif
															
 
																 };
															
 
																 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
															
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -22,6 +22,11 @@
 
																 #include <datawizard/filters.h>
															
 
																 #include <common/hash.h>
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																+
															
 
																 /*
															
 
																  * BCSR : blocked CSR, we use blocks of size (r x c)
															
 
																  */
															
@@ -31,6 +36,10 @@ static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, u
 
																 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+#endif
															
 
																 static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
															
 
																 	.ram_to_ram = dummy_copy_ram_to_ram,
															
@@ -39,6 +48,10 @@ static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 
																 	.ram_to_cuda = copy_ram_to_cuda,
															
 
																 	.cuda_to_ram = copy_cuda_to_ram,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.ram_to_opencl = copy_ram_to_opencl,
															
 
																+	.opencl_to_ram = copy_opencl_to_ram,
															
 
																+#endif
															
 
																 	.cuda_to_cuda = NULL,
															
 
																 	.cuda_to_spu = NULL,
															
 
																 	.spu_to_ram = NULL,
															
@@ -274,6 +287,27 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+                        {
															
 
																+                                int ret;
															
 
																+                                void *ptr;
															
 
																+
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
															
 
																+                                addr_nzval = (uintptr_t)ptr;
															
 
																+                                if (ret) goto fail_nzval;
															
 
																+
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
															
 
																+                                addr_colind = ptr;
															
 
																+				if (ret) goto fail_colind;
															
 
																+
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
															
 
																+                                addr_rowptr = ptr;
															
 
																+				if (ret) goto fail_rowptr;
															
 
																+
															
 
																+                                break;
															
 
																+                        }
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -298,6 +332,11 @@ fail_rowptr:
 
																 			cudaFree((void*)addr_colind);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+			clReleaseMemObject((void*)addr_colind);
															
 
																+			break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -311,6 +350,11 @@ fail_colind:
 
																 			cudaFree((void*)addr_nzval);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+			clReleaseMemObject((void*)addr_nzval);
															
 
																+			break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -341,6 +385,13 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 
																 			cudaFree((void*)bcsr_interface->rowptr);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+			clReleaseMemObject((void*)bcsr_interface->nzval);
															
 
																+			clReleaseMemObject((void*)bcsr_interface->colind);
															
 
																+			clReleaseMemObject((void*)bcsr_interface->rowptr);
															
 
																+			break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -420,6 +471,76 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
																 }
															
 
																 #endif // STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																+{
															
 
																+	starpu_bcsr_interface_t *src_bcsr;
															
 
																+	starpu_bcsr_interface_t *dst_bcsr;
															
 
																+
															
 
																+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	uint32_t nnz = src_bcsr->nnz;
															
 
																+	uint32_t nrow = src_bcsr->nrow;
															
 
																+	size_t elemsize = src_bcsr->elemsize;
															
 
																+
															
 
																+	uint32_t r = src_bcsr->r;
															
 
																+	uint32_t c = src_bcsr->c;
															
 
																+
															
 
																+        int err;
															
 
																+
															
 
																+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																+{
															
 
																+	starpu_bcsr_interface_t *src_bcsr;
															
 
																+	starpu_bcsr_interface_t *dst_bcsr;
															
 
																+
															
 
																+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	uint32_t nnz = src_bcsr->nnz;
															
 
																+	uint32_t nrow = src_bcsr->nrow;
															
 
																+	size_t elemsize = src_bcsr->elemsize;
															
 
																+
															
 
																+	uint32_t r = src_bcsr->r;
															
 
																+	uint32_t c = src_bcsr->c;
															
 
																+
															
 
																+        int err;
															
 
																+
															
 
																+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+#endif // STARPU_USE_OPENCL
															
 
																+
															
 
																 /* as not all platform easily have a BLAS lib installed ... */
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																 {
															
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -22,6 +22,11 @@
 
																 #include <common/hash.h>
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																+
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
@@ -29,6 +34,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
																 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+#endif
															
 
																 static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
															
 
																 	.ram_to_ram = dummy_copy_ram_to_ram,
															
@@ -39,6 +50,12 @@ static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 
																 	.ram_to_cuda_async = copy_ram_to_cuda_async,
															
 
																 	.cuda_to_ram_async = copy_cuda_to_ram_async,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.ram_to_opencl = copy_ram_to_opencl,
															
 
																+	.opencl_to_ram = copy_opencl_to_ram,
															
 
																+        .ram_to_opencl_async = copy_ram_to_opencl_async,
															
 
																+	.opencl_to_ram_async = copy_opencl_to_ram_async,
															
 
																+#endif
															
 
																 	.cuda_to_cuda = NULL,
															
 
																 	.cuda_to_spu = NULL,
															
 
																 	.spu_to_ram = NULL,
															
@@ -94,11 +111,15 @@ static void register_block_handle(starpu_data_handle handle, uint32_t home_node,
 
																 		if (node == home_node) {
															
 
																 			local_interface->ptr = block_interface->ptr;
															
 
																+                        local_interface->dev_handle = block_interface->dev_handle;
															
 
																+                        local_interface->offset = block_interface->offset;
															
 
																 			local_interface->ldy  = block_interface->ldy;
															
 
																 			local_interface->ldz  = block_interface->ldz;
															
 
																 		}
															
 
																 		else {
															
 
																 			local_interface->ptr = 0;
															
 
																+                        local_interface->dev_handle = 0;
															
 
																+                        local_interface->offset = 0;
															
 
																 			local_interface->ldy  = 0;
															
 
																 			local_interface->ldz  = 0;
															
 
																 		}
															
@@ -117,6 +138,8 @@ void starpu_register_block_data(starpu_data_handle *handleptr, uint32_t home_nod
 
																 {
															
 
																 	starpu_block_interface_t interface = {
															
 
																 		.ptr = ptr,
															
 
																+                .dev_handle = ptr,
															
 
																+                .offset = 0,
															
 
																 		.ldy = ldy,
															
 
																 		.ldz = ldz,
															
 
																 		.nx = nx,
															
@@ -278,6 +301,19 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	        case STARPU_OPENCL_RAM:
															
 
																+			{
															
 
																+                                int ret;
															
 
																+                                void *ptr;
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
															
 
																+                                addr = (uintptr_t)ptr;
															
 
																+				if (ret) {
															
 
																+					fail = 1;
															
 
																+				}
															
 
																+				break;
															
 
																+			}
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -288,6 +324,8 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
																 		/* update the data properly in consequence */
															
 
																 		dst_block->ptr = addr;
															
 
																+                dst_block->dev_handle = addr;
															
 
																+                dst_block->offset = 0;
															
 
																 		dst_block->ldy = nx;
															
 
																 		dst_block->ldz = nx*ny;
															
 
																 	} else {
															
@@ -319,6 +357,11 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                case STARPU_OPENCL_RAM:
															
 
																+                        clReleaseMemObject((void *)block_interface->ptr);
															
 
																+                        break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -660,6 +703,85 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
																 }
															
 
																 #endif // STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_block_interface_t *src_block;
															
 
																+	starpu_block_interface_t *dst_block;
															
 
																+
															
 
																+	src_block = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
															
 
																+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
															
 
																+                                                dst_block->offset, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_block_interface_t *src_block;
															
 
																+	starpu_block_interface_t *dst_block;
															
 
																+
															
 
																+	src_block = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
															
 
																+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
															
 
																+                                                  src_block->offset, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_block_interface_t *src_block;
															
 
																+	starpu_block_interface_t *dst_block;
															
 
																+
															
 
																+	src_block = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
															
 
																+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
															
 
																+                                                dst_block->offset, NULL);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_block_interface_t *src_block;
															
 
																+	starpu_block_interface_t *dst_block;
															
 
																+
															
 
																+	src_block = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
															
 
																+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
															
 
																+                                                  src_block->offset, NULL);
															
 
																+
															
 
																+        if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+#endif
															
 
																+
															
 
																 /* as not all platform easily have a BLAS lib installed ... */
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																 {
															
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -22,12 +22,20 @@
 
																 #include <common/hash.h>
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+#endif
															
 
																 static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
															
 
																 	.ram_to_ram = dummy_copy_ram_to_ram,
															
@@ -36,6 +44,10 @@ static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 
																 	.ram_to_cuda = copy_ram_to_cuda,
															
 
																 	.cuda_to_ram = copy_cuda_to_ram,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.ram_to_opencl = copy_ram_to_opencl,
															
 
																+	.opencl_to_ram = copy_opencl_to_ram,
															
 
																+#endif
															
 
																 	.cuda_to_cuda = NULL,
															
 
																 	.cuda_to_spu = NULL,
															
 
																 	.spu_to_ram = NULL,
															
@@ -246,6 +258,27 @@ static size_t allocate_csr_buffer_on_node(starpu_data_handle handle, uint32_t ds
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	        case STARPU_OPENCL_RAM:
															
 
																+			{
															
 
																+                                int ret;
															
 
																+                                void *ptr;
															
 
																+
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
															
 
																+                                addr_nzval = (uintptr_t)ptr;
															
 
																+				if (ret) goto fail_nzval;
															
 
																+
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
															
 
																+                                addr_colind = ptr;
															
 
																+				if (ret) goto fail_colind;
															
 
																+
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
															
 
																+                                addr_rowptr = ptr;
															
 
																+				if (ret) goto fail_rowptr;
															
 
																+
															
 
																+				break;
															
 
																+			}
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -270,6 +303,11 @@ fail_rowptr:
 
																 			cudaFree((void*)addr_colind);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+			clReleaseMemObject((void*)addr_colind);
															
 
																+			break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -283,6 +321,11 @@ fail_colind:
 
																 			cudaFree((void*)addr_nzval);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+			clReleaseMemObject((void*)addr_nzval);
															
 
																+			break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -313,6 +356,13 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 
																 			cudaFree((void*)csr_interface->rowptr);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+		case STARPU_OPENCL_RAM:
															
 
																+			clReleaseMemObject((void*)csr_interface->nzval);
															
 
																+			clReleaseMemObject((void*)csr_interface->colind);
															
 
																+			clReleaseMemObject((void*)csr_interface->rowptr);
															
 
																+			break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -386,6 +436,70 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
																 }
															
 
																 #endif // STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																+{
															
 
																+	starpu_csr_interface_t *src_csr;
															
 
																+	starpu_csr_interface_t *dst_csr;
															
 
																+
															
 
																+	src_csr = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_csr = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	uint32_t nnz = src_csr->nnz;
															
 
																+	uint32_t nrow = src_csr->nrow;
															
 
																+	size_t elemsize = src_csr->elemsize;
															
 
																+
															
 
																+        int err;
															
 
																+
															
 
																+        err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
															
 
																+        if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																+{
															
 
																+	starpu_csr_interface_t *src_csr;
															
 
																+	starpu_csr_interface_t *dst_csr;
															
 
																+
															
 
																+	src_csr = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_csr = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	uint32_t nnz = src_csr->nnz;
															
 
																+	uint32_t nrow = src_csr->nrow;
															
 
																+	size_t elemsize = src_csr->elemsize;
															
 
																+
															
 
																+        int err;
															
 
																+
															
 
																+        err = _starpu_opencl_copy_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = _starpu_opencl_copy_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
															
 
																+        if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        err = _starpu_opencl_copy_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+#endif // STARPU_USE_OPENCL
															
 
																+
															
 
																 /* as not all platform easily have a BLAS lib installed ... */
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																 {
															
--- a/src/datawizard/interfaces/matrix_filters.c
+++ b/src/datawizard/interfaces/matrix_filters.c
@@ -69,6 +69,8 @@ void starpu_block_filter_func(starpu_filter *f, starpu_data_handle root_handle)
 
																 				local->ptr = local_root->ptr + offset;
															
 
																 				local->ld = local_root->ld;
															
 
																+                                local->dev_handle = local_root->dev_handle;
															
 
																+                                local->offset = local_root->offset + offset;
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
@@ -122,6 +124,8 @@ void starpu_vertical_block_filter_func(starpu_filter *f, starpu_data_handle root
 
																 					(size_t)chunk*chunk_size*local_root->ld*elemsize;
															
 
																 				local->ptr = local_root->ptr + offset;
															
 
																 				local->ld = local_root->ld;
															
 
																+                                local->dev_handle = local_root->dev_handle;
															
 
																+                                local->offset = local_root->offset + offset;
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -26,6 +26,10 @@
 
																 #include <cuda.h>
															
 
																 #include <cuda_runtime.h>
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
																 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+#endif
															
 
																 static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
															
 
																 	.ram_to_ram = dummy_copy_ram_to_ram,
															
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 
																 	.ram_to_cuda_async = copy_ram_to_cuda_async,
															
 
																 	.cuda_to_ram_async = copy_cuda_to_ram_async,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.ram_to_opencl = copy_ram_to_opencl,
															
 
																+	.opencl_to_ram = copy_opencl_to_ram,
															
 
																+        .ram_to_opencl_async = copy_ram_to_opencl_async,
															
 
																+	.opencl_to_ram_async = copy_opencl_to_ram_async,
															
 
																+#endif
															
 
																 	.cuda_to_cuda = NULL,
															
 
																 	.cuda_to_spu = NULL,
															
 
																 	.spu_to_ram = NULL,
															
@@ -106,10 +122,14 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 
																 		if (node == home_node) {
															
 
																 			local_interface->ptr = matrix_interface->ptr;
															
 
																+                        local_interface->dev_handle = matrix_interface->dev_handle;
															
 
																+                        local_interface->offset = matrix_interface->offset;
															
 
																 			local_interface->ld  = matrix_interface->ld;
															
 
																 		}
															
 
																 		else {
															
 
																 			local_interface->ptr = 0;
															
 
																+			local_interface->dev_handle = 0;
															
 
																+			local_interface->offset = 0;
															
 
																 			local_interface->ld  = 0;
															
 
																 		}
															
@@ -129,7 +149,9 @@ void starpu_register_matrix_data(starpu_data_handle *handleptr, uint32_t home_no
 
																 		.ld = ld,
															
 
																 		.nx = nx,
															
 
																 		.ny = ny,
															
 
																-		.elemsize = elemsize
															
 
																+		.elemsize = elemsize,
															
 
																+                .dev_handle = ptr,
															
 
																+                .offset = 0
															
 
																 	};
															
 
																 	_starpu_register_data_handle(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
															
@@ -256,6 +278,19 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	        case STARPU_OPENCL_RAM:
															
 
																+			{
															
 
																+                                int ret;
															
 
																+                                void *ptr;
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
															
 
																+                                addr = (uintptr_t)ptr;
															
 
																+				if (ret) {
															
 
																+					fail = 1;
															
 
																+				}
															
 
																+				break;
															
 
																+			}
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -266,6 +301,8 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
																 		/* update the data properly in consequence */
															
 
																 		interface->ptr = addr;
															
 
																+                interface->dev_handle = addr;
															
 
																+                interface->offset = 0;
															
 
																 		interface->ld = ld;
															
 
																 	} else {
															
 
																 		/* allocation failed */
															
@@ -296,6 +333,11 @@ static void liberate_matrix_buffer_on_node(void *interface, uint32_t node)
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                case STARPU_OPENCL_RAM:
															
 
																+                        clReleaseMemObject((void *)matrix_interface->ptr);
															
 
																+                        break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -423,6 +465,83 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
																 #endif // STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_matrix_interface_t *src_matrix;
															
 
																+	starpu_matrix_interface_t *dst_matrix;
															
 
																+
															
 
																+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle, src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
															
 
																+                                                dst_matrix->offset, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_matrix_interface_t *src_matrix;
															
 
																+	starpu_matrix_interface_t *dst_matrix;
															
 
																+
															
 
																+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
															
 
																+                                                  src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
															
 
																+                                                  src_matrix->offset, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_matrix_interface_t *src_matrix;
															
 
																+	starpu_matrix_interface_t *dst_matrix;
															
 
																+
															
 
																+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle, src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
															
 
																+                                                dst_matrix->offset, NULL);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_matrix_interface_t *src_matrix;
															
 
																+	starpu_matrix_interface_t *dst_matrix;
															
 
																+
															
 
																+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
															
 
																+                                                  src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
															
 
																+                                                  src_matrix->offset, NULL);
															
 
																+
															
 
																+        if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+#endif
															
 
																+
															
 
																 /* as not all platform easily have a BLAS lib installed ... */
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																 {
															
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -26,6 +26,10 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
																 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+#endif
															
 
																 static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
															
 
																 	.ram_to_ram = dummy_copy_ram_to_ram,
															
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 
																 	.ram_to_cuda_async = copy_ram_to_cuda_async,
															
 
																 	.cuda_to_ram_async = copy_cuda_to_ram_async,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.ram_to_opencl = copy_ram_to_opencl,
															
 
																+	.opencl_to_ram = copy_opencl_to_ram,
															
 
																+        .ram_to_opencl_async = copy_ram_to_opencl_async,
															
 
																+	.opencl_to_ram_async = copy_opencl_to_ram_async,
															
 
																+#endif
															
 
																 	.cuda_to_cuda = NULL,
															
 
																 	.cuda_to_spu = NULL,
															
 
																 	.spu_to_ram = NULL,
															
@@ -192,6 +208,19 @@ static size_t allocate_variable_buffer_on_node(starpu_data_handle handle, uint32
 
																 			}
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	        case STARPU_OPENCL_RAM:
															
 
																+			{
															
 
																+                                int ret;
															
 
																+                                void *ptr;
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
															
 
																+                                addr = (uintptr_t)ptr;
															
 
																+				if (ret) {
															
 
																+					fail = 1;
															
 
																+				}
															
 
																+				break;
															
 
																+			}
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -220,6 +249,11 @@ static void liberate_variable_buffer_on_node(void *interface, uint32_t node)
 
																 			cudaFree((void*)STARPU_GET_VARIABLE_PTR(interface));
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                case STARPU_OPENCL_RAM:
															
 
																+                        clReleaseMemObject((void*)STARPU_GET_VARIABLE_PTR(interface));
															
 
																+                        break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -324,6 +358,81 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
																 #endif // STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_variable_interface_t *src_variable;
															
 
																+	starpu_variable_interface_t *dst_variable;
															
 
																+
															
 
																+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
															
 
																+                                                0, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_variable_interface_t *src_variable;
															
 
																+	starpu_variable_interface_t *dst_variable;
															
 
																+
															
 
																+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
															
 
																+                                                  0, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_variable_interface_t *src_variable;
															
 
																+	starpu_variable_interface_t *dst_variable;
															
 
																+
															
 
																+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
															
 
																+                                                0, NULL);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_variable_interface_t *src_variable;
															
 
																+	starpu_variable_interface_t *dst_variable;
															
 
																+
															
 
																+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
															
 
																+                                                  0, NULL);
															
 
																+
															
 
																+        if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+#endif
															
 
																+
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																 {
															
 
																 	starpu_variable_interface_t *src_variable;
															
--- a/src/datawizard/interfaces/vector_filters.c
+++ b/src/datawizard/interfaces/vector_filters.c
@@ -62,6 +62,8 @@ void starpu_block_filter_func_vector(starpu_filter *f, starpu_data_handle root_h
 
																 					starpu_data_get_interface_on_node(root_handle, node);
															
 
																 				local->ptr = local_root->ptr + offset;
															
 
																+                                local->dev_handle = local_root->dev_handle;
															
 
																+                                local->offset = local_root->offset + offset;
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
@@ -100,6 +102,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 
																 				starpu_data_get_interface_on_node(root_handle, node);
															
 
																 			local->ptr = local_root->ptr;
															
 
																+                        local->offset = local_root->offset;
															
 
																+                        local->dev_handle = local_root->dev_handle;
															
 
																 		}
															
 
																 	}
															
@@ -119,6 +123,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 
																 				starpu_data_get_interface_on_node(root_handle, node);
															
 
																 			local->ptr = local_root->ptr + length_first*elemsize;
															
 
																+                        local->offset = local_root->offset + length_first*elemsize;
															
 
																+                        local->dev_handle = local_root->dev_handle;
															
 
																 		}
															
 
																 	}
															
 
																 }
															
@@ -161,6 +167,8 @@ void starpu_list_filter_func_vector(starpu_filter *f, starpu_data_handle root_ha
 
																 					starpu_data_get_interface_on_node(root_handle, node);
															
 
																 				local->ptr = local_root->ptr + current_pos*elemsize;
															
 
																+                                local->offset = local_root->offset + current_pos*elemsize;
															
 
																+                                local->dev_handle = local_root->dev_handle;
															
 
																 			}
															
 
																 		}
															
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -1,6 +1,6 @@
 
																 /*
															
 
																  * StarPU
															
 
																- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																  *
															
 
																  * This program is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -25,6 +25,10 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -33,6 +37,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 
																 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
															
 
																+#endif
															
 
																 static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
															
 
																 	.ram_to_ram = dummy_copy_ram_to_ram,
															
@@ -43,6 +53,12 @@ static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 
																 	.ram_to_cuda_async = copy_ram_to_cuda_async,
															
 
																 	.cuda_to_ram_async = copy_cuda_to_ram_async,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.ram_to_opencl = copy_ram_to_opencl,
															
 
																+	.opencl_to_ram = copy_opencl_to_ram,
															
 
																+        .ram_to_opencl_async = copy_ram_to_opencl_async,
															
 
																+	.opencl_to_ram_async = copy_opencl_to_ram_async,
															
 
																+#endif
															
 
																 	.cuda_to_cuda = NULL,
															
 
																 	.cuda_to_spu = NULL,
															
 
																 	.spu_to_ram = NULL,
															
@@ -87,9 +103,13 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 
																 		if (node == home_node) {
															
 
																 			local_interface->ptr = vector_interface->ptr;
															
 
																+                        local_interface->dev_handle = vector_interface->dev_handle;
															
 
																+                        local_interface->offset = vector_interface->offset;
															
 
																 		}
															
 
																 		else {
															
 
																 			local_interface->ptr = 0;
															
 
																+                        local_interface->dev_handle = 0;
															
 
																+                        local_interface->offset = 0;
															
 
																 		}
															
 
																 		local_interface->nx = vector_interface->nx;
															
@@ -116,7 +136,9 @@ void starpu_register_vector_data(starpu_data_handle *handleptr, uint32_t home_no
 
																 	starpu_vector_interface_t vector = {
															
 
																 		.ptr = ptr,
															
 
																 		.nx = nx,
															
 
																-		.elemsize = elemsize
															
 
																+		.elemsize = elemsize,
															
 
																+                .dev_handle = ptr,
															
 
																+                .offset = 0
															
 
																 	};	
															
 
																 	_starpu_register_data_handle(handleptr, home_node, &vector, &interface_vector_ops); 
															
@@ -216,6 +238,19 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
																 			}
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	        case STARPU_OPENCL_RAM:
															
 
																+			{
															
 
																+                                int ret;
															
 
																+                                void *ptr;
															
 
																+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
															
 
																+                                addr = (uintptr_t)ptr;
															
 
																+				if (ret) {
															
 
																+					fail = 1;
															
 
																+				}
															
 
																+				break;
															
 
																+			}
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -228,6 +263,8 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
																 	/* update the data properly in consequence */
															
 
																 	interface->ptr = addr;
															
 
																+        interface->dev_handle = addr;
															
 
																+        interface->offset = 0;
															
 
																 	return allocated_memory;
															
 
																 }
															
@@ -246,6 +283,11 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 
																 			cudaFree((void*)vector_interface->ptr);
															
 
																 			break;
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+                case STARPU_OPENCL_RAM:
															
 
																+                        clReleaseMemObject((void *)vector_interface->ptr);
															
 
																+                        break;
															
 
																+#endif
															
 
																 		default:
															
 
																 			assert(0);
															
 
																 	}
															
@@ -349,6 +391,80 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
																 #endif // STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_vector_interface_t *src_vector;
															
 
																+	starpu_vector_interface_t *dst_vector;
															
 
																+
															
 
																+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
															
 
																+                                                dst_vector->offset, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
															
 
																+	starpu_vector_interface_t *src_vector;
															
 
																+	starpu_vector_interface_t *dst_vector;
															
 
																+
															
 
																+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
															
 
																+                                                  src_vector->offset, event);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
															
 
																+
															
 
																+	return EAGAIN;
															
 
																+}
															
 
																+
															
 
																+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_vector_interface_t *src_vector;
															
 
																+	starpu_vector_interface_t *dst_vector;
															
 
																+
															
 
																+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
															
 
																+                                                dst_vector->offset, NULL);
															
 
																+
															
 
																+	if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
															
 
																+	starpu_vector_interface_t *src_vector;
															
 
																+	starpu_vector_interface_t *dst_vector;
															
 
																+
															
 
																+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
															
 
																+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
															
 
																+
															
 
																+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
															
 
																+                                                  src_vector->offset, NULL);
															
 
																+
															
 
																+        if (STARPU_UNLIKELY(err))
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+#endif
															
 
																 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
															
 
																 {
															
--- a/src/datawizard/memory_nodes.h
+++ b/src/datawizard/memory_nodes.h
@@ -26,7 +26,8 @@ typedef enum {
 
																 	STARPU_UNUSED,
															
 
																 	STARPU_SPU_LS,
															
 
																 	STARPU_RAM,
															
 
																-	STARPU_CUDA_RAM
															
 
																+	STARPU_CUDA_RAM,
															
 
																+        STARPU_OPENCL_RAM,
															
 
																 } starpu_node_kind;
															
 
																 typedef struct {
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -0,0 +1,422 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <math.h>
															
 
																+#include <common/config.h>
															
 
																+#include <common/utils.h>
															
 
																+#include <core/debug.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+#include "driver_opencl.h"
															
 
																+#include "driver_opencl_utils.h"
															
 
																+#include <common/utils.h>
															
 
																+
															
 
																+static cl_context contexts[STARPU_MAXOPENCLDEVS];
															
 
																+static cl_device_id devices[STARPU_MAXOPENCLDEVS];
															
 
																+static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
															
 
																+static cl_uint nb_devices = -1;
															
 
																+static int init_done = 0;
															
 
																+extern char *_starpu_opencl_codelet_dir;
															
 
																+
															
 
																+void starpu_opencl_get_context(int devid, cl_context *context)
															
 
																+{
															
 
																+        *context = contexts[devid];
															
 
																+}
															
 
																+
															
 
																+void starpu_opencl_get_device(int devid, cl_device_id *device)
															
 
																+{
															
 
																+        *device = devices[devid];
															
 
																+}
															
 
																+
															
 
																+void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
															
 
																+{
															
 
																+        *queue = queues[devid];
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_init_context(int devid)
															
 
																+{
															
 
																+	cl_int err;
															
 
																+        cl_device_id device;
															
 
																+
															
 
																+        _STARPU_OPENCL_DEBUG("Initialising context for dev %d\n", devid);
															
 
																+
															
 
																+        // Create a compute context
															
 
																+        device = devices[devid];
															
 
																+        contexts[devid] = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        // Create queue for the given device
															
 
																+        queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        _starpu_opencl_init_programs(devid);
															
 
																+
															
 
																+	return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_deinit_context(int devid)
															
 
																+{
															
 
																+        int err;
															
 
																+
															
 
																+        _STARPU_OPENCL_DEBUG("De-initialising context for dev %d\n", devid);
															
 
																+
															
 
																+        err = clReleaseContext(contexts[devid]);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        err = clReleaseCommandQueue(queues[devid]);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        _starpu_opencl_release_programs(devid);
															
 
																+
															
 
																+        return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
															
 
																+{
															
 
																+	cl_int err;
															
 
																+        cl_mem address;
															
 
																+        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
															
 
																+
															
 
																+	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        *addr = address;
															
 
																+        return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
															
 
																+{
															
 
																+      int err;
															
 
																+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
															
 
																+
															
 
																+      if (event == NULL) {
															
 
																+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
															
 
																+      }
															
 
																+      else {
															
 
																+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
															
 
																+      }
															
 
																+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+      return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
															
 
																+{
															
 
																+      int err;
															
 
																+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
															
 
																+
															
 
																+      if (event == NULL) {
															
 
																+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
															
 
																+      }
															
 
																+      else {
															
 
																+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
															
 
																+      }
															
 
																+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+      return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+void _starpu_opencl_init()
															
 
																+{
															
 
																+   if (!init_done) {
															
 
																+           cl_platform_id platform_id[STARPU_OPENCL_PLATFORM_MAX];
															
 
																+           cl_uint nb_platforms;
															
 
																+           cl_device_type device_type = CL_DEVICE_TYPE_GPU;
															
 
																+           cl_int err;
															
 
																+
															
 
																+           _STARPU_OPENCL_DEBUG("Initialising OpenCL\n");
															
 
																+
															
 
																+           // Get Platforms
															
 
																+           err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
															
 
																+           if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+           _STARPU_OPENCL_DEBUG("Platforms detected: %d\n", nb_platforms);
															
 
																+
															
 
																+           // Get devices
															
 
																+           nb_devices = 0;
															
 
																+           {
															
 
																+                   unsigned int i;
															
 
																+                   for (i=0; i<nb_platforms; i++) {
															
 
																+                           cl_uint num;
															
 
																+
															
 
																+#ifdef STARPU_VERBOSE
															
 
																+                           {
															
 
																+                                   char name[1024], vendor[1024];
															
 
																+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
															
 
																+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
															
 
																+                                   _STARPU_OPENCL_DEBUG("Platform: %s - %s\n", name, vendor);
															
 
																+                           }
															
 
																+#endif
															
 
																+                           err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
															
 
																+                           if (err == CL_DEVICE_NOT_FOUND) {
															
 
																+                                   _STARPU_OPENCL_DEBUG("  No devices detected on this platform\n");
															
 
																+                           }
															
 
																+                           else {
															
 
																+                                   if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                                   _STARPU_OPENCL_DEBUG("  %d devices detected\n", num);
															
 
																+                                   nb_devices += num;
															
 
																+                           }
															
 
																+         }
															
 
																+      }
															
 
																+
															
 
																+      // Get location of OpenCl codelet source files
															
 
																+      _starpu_opencl_codelet_dir = getenv("STARPU_OPENCL_CODELET_DIR");
															
 
																+
															
 
																+      init_done=1;
															
 
																+   }
															
 
																+}
															
 
																+
															
 
																+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
															
 
																+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args);
															
 
																+
															
 
																+void *_starpu_opencl_worker(void *arg)
															
 
																+{
															
 
																+	struct starpu_worker_s* args = arg;
															
 
																+
															
 
																+	int devid = args->devid;
															
 
																+	unsigned memory_node = args->memory_node;
															
 
																+
															
 
																+#ifdef USE_FXT
															
 
																+	fxt_register_thread(args->bindid);
															
 
																+#endif
															
 
																+
															
 
																+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
															
 
																+
															
 
																+	_starpu_set_local_memory_node_key(&(args->memory_node));
															
 
																+
															
 
																+	_starpu_set_local_queue(args->jobq);
															
 
																+
															
 
																+	_starpu_set_local_worker_key(args);
															
 
																+
															
 
																+	/* this is only useful (and meaningful) is there is a single
															
 
																+	   memory node "related" to that queue */
															
 
																+	args->jobq->memory_node = memory_node;
															
 
																+
															
 
																+	args->jobq->total_computation_time = 0.0;
															
 
																+	args->jobq->total_communication_time = 0.0;
															
 
																+	args->jobq->total_computation_time_error = 0.0;
															
 
																+	args->jobq->total_job_performed = 0;
															
 
																+
															
 
																+	_starpu_opencl_init_context(devid);
															
 
																+
															
 
																+	/* one more time to avoid hacks from third party lib :) */
															
 
																+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
															
 
																+
															
 
																+	args->status = STATUS_UNKNOWN;
															
 
																+
															
 
																+	/* get the device's name */
															
 
																+	char devname[128];
															
 
																+	_starpu_opencl_get_device_name(devid, devname, 128);
															
 
																+	snprintf(args->name, 32, "OpenCL %d (%s)", args->devid, devname);
															
 
																+
															
 
																+	_STARPU_OPENCL_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
															
 
																+
															
 
																+	STARPU_TRACE_WORKER_INIT_END
															
 
																+
															
 
																+	/* tell the main thread that this one is ready */
															
 
																+	PTHREAD_MUTEX_LOCK(&args->mutex);
															
 
																+	args->worker_is_initialized = 1;
															
 
																+	PTHREAD_COND_SIGNAL(&args->ready_cond);
															
 
																+	PTHREAD_MUTEX_UNLOCK(&args->mutex);
															
 
																+
															
 
																+	struct starpu_job_s * j;
															
 
																+	int res;
															
 
																+
															
 
																+	struct starpu_sched_policy_s *policy = _starpu_get_sched_policy();
															
 
																+	struct starpu_jobq_s *queue = policy->starpu_get_local_queue(policy);
															
 
																+	unsigned memnode = args->memory_node;
															
 
																+
															
 
																+	while (_starpu_machine_is_running())
															
 
																+	{
															
 
																+		STARPU_TRACE_START_PROGRESS(memnode);
															
 
																+		_starpu_datawizard_progress(memnode, 1);
															
 
																+		STARPU_TRACE_END_PROGRESS(memnode);
															
 
																+
															
 
																+		_starpu_execute_registered_progression_hooks();
															
 
																+
															
 
																+		_starpu_jobq_lock(queue);
															
 
																+
															
 
																+		/* perhaps there is some local task to be executed first */
															
 
																+		j = _starpu_pop_local_task(args);
															
 
																+
															
 
																+		/* otherwise ask a task to the scheduler */
															
 
																+		if (!j)
															
 
																+			j = _starpu_pop_task();
															
 
																+
															
 
																+		if (j == NULL) {
															
 
																+			if (_starpu_worker_can_block(memnode))
															
 
																+				PTHREAD_COND_WAIT(&queue->activity_cond, &queue->activity_mutex);
															
 
																+			_starpu_jobq_unlock(queue);
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		_starpu_jobq_unlock(queue);
															
 
																+
															
 
																+		/* can OpenCL do that task ? */
															
 
																+		if (!STARPU_OPENCL_MAY_PERFORM(j))
															
 
																+		{
															
 
																+			/* this is not a OpenCL task */
															
 
																+			_starpu_push_task(j);
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		_starpu_set_current_task(j->task);
															
 
																+
															
 
																+		res = _starpu_opencl_execute_job(j, args);
															
 
																+
															
 
																+		_starpu_set_current_task(NULL);
															
 
																+
															
 
																+                if (res) {
															
 
																+			switch (res) {
															
 
																+				case -EAGAIN:
															
 
																+					fprintf(stderr, "ouch, put the codelet %p back ... \n", j);
															
 
																+					_starpu_push_task(j);
															
 
																+					STARPU_ABORT();
															
 
																+					continue;
															
 
																+				default:
															
 
																+					assert(0);
															
 
																+			}
															
 
																+		}
															
 
																+
															
 
																+		_starpu_handle_job_termination(j);
															
 
																+	}
															
 
																+
															
 
																+	STARPU_TRACE_WORKER_DEINIT_START
															
 
																+
															
 
																+          _starpu_opencl_deinit_context(devid);
															
 
																+
															
 
																+#ifdef DATA_STATS
															
 
																+	fprintf(stderr, "OpenCL #%d computation %le comm %le (%lf \%%)\n", args->id, args->jobq->total_computation_time, args->jobq->total_communication_time, args->jobq->total_communication_time*100.0/args->jobq->total_computation_time);
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_VERBOSE
															
 
																+	double ratio = 0;
															
 
																+	if (args->jobq->total_job_performed != 0)
															
 
																+	{
															
 
																+		ratio = args->jobq->total_computation_time_error/args->jobq->total_computation_time;
															
 
																+	}
															
 
																+
															
 
																+
															
 
																+	_starpu_print_to_logfile("MODEL ERROR: OpenCL %d ERROR %lf EXEC %lf RATIO %lf NTASKS %d\n", args->devid, args->jobq->total_computation_time_error, args->jobq->total_computation_time, ratio, args->jobq->total_job_performed);
															
 
																+#endif
															
 
																+
															
 
																+	pthread_exit(NULL);
															
 
																+
															
 
																+	return NULL;
															
 
																+}
															
 
																+
															
 
																+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
															
 
																+{
															
 
																+	int err;
															
 
																+
															
 
																+        if (!init_done) {
															
 
																+                _starpu_opencl_init();
															
 
																+        }
															
 
																+
															
 
																+	// Get device name
															
 
																+	err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	_STARPU_OPENCL_DEBUG("Device %d : [%s]\n", dev, name);
															
 
																+	return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+unsigned _starpu_opencl_get_device_count(void)
															
 
																+{
															
 
																+        if (!init_done) {
															
 
																+                _starpu_opencl_init();
															
 
																+        }
															
 
																+	return nb_devices;
															
 
																+}
															
 
																+
															
 
																+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args)
															
 
																+{
															
 
																+	int ret;
															
 
																+//	uint32_t mask = (1<<0);
															
 
																+	uint32_t mask = 0;
															
 
																+
															
 
																+	STARPU_ASSERT(j);
															
 
																+	struct starpu_task *task = j->task;
															
 
																+
															
 
																+	starpu_tick_t codelet_start, codelet_end;
															
 
																+	starpu_tick_t codelet_start_comm, codelet_end_comm;
															
 
																+
															
 
																+	unsigned calibrate_model = 0;
															
 
																+
															
 
																+	STARPU_ASSERT(task);
															
 
																+	struct starpu_codelet_t *cl = task->cl;
															
 
																+	STARPU_ASSERT(cl);
															
 
																+
															
 
																+	if (cl->model && cl->model->benchmarking)
															
 
																+		calibrate_model = 1;
															
 
																+
															
 
																+	/* we do not take communication into account when modeling the performance */
															
 
																+	if (STARPU_BENCHMARK_COMM)
															
 
																+	{
															
 
																+                //barrier(CLK_GLOBAL_MEM_FENCE);
															
 
																+		STARPU_GET_TICK(codelet_start_comm);
															
 
																+	}
															
 
																+
															
 
																+	ret = _starpu_fetch_task_input(task, mask);
															
 
																+	if (ret != 0) {
															
 
																+		/* there was not enough memory, so the input of
															
 
																+		 * the codelet cannot be fetched ... put the
															
 
																+		 * codelet back, and try it later */
															
 
																+		return -EAGAIN;
															
 
																+	}
															
 
																+
															
 
																+	if (calibrate_model || STARPU_BENCHMARK_COMM)
															
 
																+	{
															
 
																+                //barrier(CLK_GLOBAL_MEM_FENCE);
															
 
																+		STARPU_GET_TICK(codelet_end_comm);
															
 
																+	}
															
 
																+
															
 
																+	STARPU_TRACE_START_CODELET_BODY(j);
															
 
																+
															
 
																+	args->status = STATUS_EXECUTING;
															
 
																+	cl_func func = cl->opencl_func;
															
 
																+	STARPU_ASSERT(func);
															
 
																+	STARPU_GET_TICK(codelet_start);
															
 
																+	func(task->interface, task->cl_arg);
															
 
																+
															
 
																+	cl->per_worker_stats[args->workerid]++;
															
 
																+
															
 
																+	STARPU_GET_TICK(codelet_end);
															
 
																+
															
 
																+	args->status = STATUS_UNKNOWN;
															
 
																+
															
 
																+	STARPU_TRACE_END_CODELET_BODY(j);
															
 
																+
															
 
																+	if (calibrate_model || STARPU_BENCHMARK_COMM)
															
 
																+	{
															
 
																+		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
															
 
																+		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
															
 
																+
															
 
																+		args->jobq->total_computation_time += measured;
															
 
																+		args->jobq->total_communication_time += measured_comm;
															
 
																+
															
 
																+		double error;
															
 
																+		error = fabs(STARPU_MAX(measured, 0.0) - STARPU_MAX(j->predicted, 0.0));
															
 
																+		args->jobq->total_computation_time_error += error;
															
 
																+
															
 
																+		if (calibrate_model)
															
 
																+			_starpu_update_perfmodel_history(j, args->perf_arch, (unsigned)args->devid, measured);
															
 
																+	}
															
 
																+
															
 
																+	args->jobq->total_job_performed++;
															
 
																+
															
 
																+	_starpu_push_task_output(task, mask);
															
 
																+
															
 
																+	return EXIT_SUCCESS;
															
 
																+}
															
--- a/src/drivers/opencl/driver_opencl.h
+++ b/src/drivers/opencl/driver_opencl.h
@@ -0,0 +1,57 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __DRIVER_OPENCL_H__
															
 
																+#define __DRIVER_OPENCL_H__
															
 
																+
															
 
																+#ifndef _GNU_SOURCE
															
 
																+#define _GNU_SOURCE
															
 
																+#endif
															
 
																+
															
 
																+#include <CL/cl.h>
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_init_context(int devid);
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_deinit_context(int devid);
															
 
																+
															
 
																+extern
															
 
																+unsigned _starpu_opencl_get_device_count(void);
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
															
 
																+
															
 
																+extern
															
 
																+void _starpu_opencl_init(void);
															
 
																+
															
 
																+extern
															
 
																+void *_starpu_opencl_worker(void *);
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue,
															
 
																+                               char *program_name, char *kernel_name, int dev);
															
 
																+
															
 
																+extern
															
 
																+int _starpu_opencl_compile_source_to_opencl(char *source_file_name);
															
 
																+
															
 
																+#endif //  __DRIVER_OPENCL_H__
															
--- a/src/drivers/opencl/driver_opencl_utils.c
+++ b/src/drivers/opencl/driver_opencl_utils.c
@@ -0,0 +1,430 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <sys/stat.h>
															
 
																+#include <stdlib.h>
															
 
																+#include <stdio.h>
															
 
																+#include <string.h>
															
 
																+#include <unistd.h>
															
 
																+#include <sys/types.h>
															
 
																+#include <sys/wait.h>
															
 
																+
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <common/list.h>
															
 
																+#include <common/htable32.h>
															
 
																+#include <core/workers.h>
															
 
																+#include "driver_opencl_utils.h"
															
 
																+#include "driver_opencl.h"
															
 
																+
															
 
																+#define CRC32C_POLY_BE 0x1EDC6F41
															
 
																+
															
 
																+static
															
 
																+inline uint32_t __attribute__ ((pure)) crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+	uint32_t crc;
															
 
																+
															
 
																+	crc = inputcrc ^ (inputbyte << 24);
															
 
																+	for (i = 0; i < 8; i++)
															
 
																+		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE : 0);
															
 
																+
															
 
																+	return crc;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+uint32_t crc32_string(char *str)
															
 
																+{
															
 
																+	uint32_t hash = 0;
															
 
																+
															
 
																+	size_t len = strlen(str);
															
 
																+
															
 
																+	unsigned i;
															
 
																+	for (i = 0; i < len; i++)
															
 
																+	{
															
 
																+		hash = crc32_be_8((uint8_t)str[i], hash);
															
 
																+	}
															
 
																+
															
 
																+	return hash;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+cl_uint _starpu_opencl_device_uniqueid(cl_device_id id)
															
 
																+{
															
 
																+	char name[1024];
															
 
																+	cl_int  err;
															
 
																+
															
 
																+	err = clGetDeviceInfo(id, CL_DEVICE_NAME, 1024, name, NULL);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	//  fprintf(stderr, "name %s\n", name);
															
 
																+
															
 
																+	return crc32_string(name);
															
 
																+}
															
 
																+
															
 
																+char *_starpu_opencl_codelet_dir;
															
 
																+
															
 
																+static
															
 
																+int _starpu_opencl_locate_file(char *source_file_name, char *located_file_name) {
															
 
																+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", source_file_name);
															
 
																+        if (access(source_file_name, R_OK) == 0) {
															
 
																+                strcpy(located_file_name, source_file_name);
															
 
																+                return EXIT_SUCCESS;
															
 
																+        }
															
 
																+        if (_starpu_opencl_codelet_dir) {
															
 
																+                sprintf(located_file_name, "%s/%s", _starpu_opencl_codelet_dir, source_file_name);
															
 
																+                _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
															
 
																+                if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
															
 
																+        }
															
 
																+        sprintf(located_file_name, "%s/%s", STARPU_OPENCL_DATADIR, source_file_name);
															
 
																+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
															
 
																+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
															
 
																+        sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
															
 
																+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
															
 
																+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
															
 
																+
															
 
																+        strcpy(located_file_name, "");
															
 
																+        OPENCL_ERROR("Cannot locate file <%s>\n", source_file_name);
															
 
																+        return EXIT_FAILURE;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+unsigned char *_starpu_opencl_load_program_binary(char *filename, size_t *len)
															
 
																+{
															
 
																+	struct stat statbuf;
															
 
																+	FILE        *fh;
															
 
																+	unsigned char        *binary;
															
 
																+
															
 
																+	fh = fopen(filename, "r");
															
 
																+	if (fh == 0)
															
 
																+		return EXIT_SUCCESS;
															
 
																+
															
 
																+	stat(filename, &statbuf);
															
 
																+
															
 
																+	binary = (unsigned char *) malloc(statbuf.st_size);
															
 
																+	if(!binary)
															
 
																+		return binary;
															
 
																+
															
 
																+	fread(binary, statbuf.st_size, 1, fh);
															
 
																+
															
 
																+	*len = statbuf.st_size;
															
 
																+	return binary;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+cl_int _starpu_opencl_load_program(cl_context context, char *program_name, cl_device_id device, cl_program *program)
															
 
																+{
															
 
																+        //	cl_program     program;
															
 
																+        const unsigned char *binary;
															
 
																+	size_t         len;
															
 
																+	cl_int         err;
															
 
																+	cl_int         status;
															
 
																+        cl_device_type type;
															
 
																+
															
 
																+	cl_uint uniqueid;
															
 
																+	char     located_program_name[1024];
															
 
																+	char     binary_file_name[1024];
															
 
																+	char    *p;
															
 
																+
															
 
																+        // locate file
															
 
																+        _starpu_opencl_locate_file(program_name, located_program_name);
															
 
																+
															
 
																+        // Get type of device
															
 
																+        err = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
															
 
																+	if (err != CL_SUCCESS) {
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                return err;
															
 
																+        }
															
 
																+
															
 
																+        // Get the name of the binary file
															
 
																+	uniqueid = _starpu_opencl_device_uniqueid(device);
															
 
																+	strcpy(binary_file_name, located_program_name);
															
 
																+	p = strstr(binary_file_name, ".cl");
															
 
																+	if(p == NULL) OPENCL_ERROR("Program file name doesn't have the '.cl' extension!\n");
															
 
																+        strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
															
 
																+	sprintf(p + strlen(p), "%u", uniqueid);
															
 
																+
															
 
																+        // Load the binary file
															
 
																+	binary = _starpu_opencl_load_program_binary(binary_file_name, &len);
															
 
																+	if(binary == NULL)
															
 
																+		OPENCL_ERROR("Cannot load binary file %s\n", binary_file_name);
															
 
																+
															
 
																+	//_STARPU_OPENCL_DEBUG("[%s] binary file loaded.\n", binary_file_name);
															
 
																+	*program = clCreateProgramWithBinary(context, 1, &device, &len, &binary, &status, &err);
															
 
																+	if (err != CL_SUCCESS) {
															
 
																+                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                return err;
															
 
																+        }
															
 
																+
															
 
																+	// Build the program executable
															
 
																+	err = clBuildProgram(*program, 0, NULL, NULL, NULL, NULL);
															
 
																+	if (err != CL_SUCCESS) {
															
 
																+		size_t len;
															
 
																+		char buffer[2048];
															
 
																+
															
 
																+		fprintf(stderr, "Error: Failed to build program executable!\n");
															
 
																+		clGetProgramBuildInfo(*program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
															
 
																+		fprintf(stderr, "%s\n", buffer);
															
 
																+                return err;
															
 
																+	}
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+static struct starpu_htbl32_node_s *history_program_hash[STARPU_MAXOPENCLDEVS] = {NULL};
															
 
																+LIST_TYPE(program,
															
 
																+          char *program_name;
															
 
																+          cl_program program;
															
 
																+          );
															
 
																+program_list_t history_program_list[STARPU_MAXOPENCLDEVS];
															
 
																+
															
 
																+int _starpu_opencl_init_programs(int dev)
															
 
																+{
															
 
																+        history_program_list[dev] = program_list_new();
															
 
																+        return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_release_programs(int dev)
															
 
																+{
															
 
																+        while (!program_list_empty(history_program_list[dev])) {
															
 
																+                program_t pp = program_list_pop_front(history_program_list[dev]);
															
 
																+                _STARPU_OPENCL_DEBUG("Releasing program=<%s> on dev=<%d>\n", pp->program_name, dev);
															
 
																+                clReleaseProgram(pp->program);
															
 
																+        }
															
 
																+        program_list_delete(history_program_list[dev]);
															
 
																+        return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid)
															
 
																+{
															
 
																+        int err;
															
 
																+	cl_device_id device;
															
 
																+        cl_context context;
															
 
																+        uint32_t key;
															
 
																+        cl_program program;
															
 
																+
															
 
																+        starpu_opencl_get_device(devid, &device);
															
 
																+        starpu_opencl_get_context(devid, &context);
															
 
																+        starpu_opencl_get_queue(devid, queue);
															
 
																+
															
 
																+        key = crc32_string(program_name);
															
 
																+        program = _starpu_htbl_search_32(history_program_hash[devid], key);
															
 
																+        if (!program) {
															
 
																+                err = _starpu_opencl_load_program(context, program_name, device, &program);
															
 
																+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                _starpu_htbl_insert_32(&(history_program_hash[devid]), key, program);
															
 
																+                program_t pp = program_new();
															
 
																+                pp->program_name = program_name;
															
 
																+                pp->program = program;
															
 
																+                program_list_push_front(history_program_list[devid], pp);
															
 
																+        }
															
 
																+
															
 
																+        // Create the compute kernel in the program we wish to run
															
 
																+        *kernel = clCreateKernel(program, kernel_name, &err);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int starpu_opencl_release(cl_kernel kernel) {
															
 
																+	cl_int err;
															
 
																+
															
 
																+	err = clReleaseKernel(kernel);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+char *_starpu_opencl_load_program_source(const char *filename)
															
 
																+{
															
 
																+        struct stat statbuf;
															
 
																+        FILE        *fh;
															
 
																+        char        *source;
															
 
																+
															
 
																+        fh = fopen(filename, "r");
															
 
																+        if (fh == 0)
															
 
																+                return EXIT_SUCCESS;
															
 
																+
															
 
																+        stat(filename, &statbuf);
															
 
																+        source = (char *) malloc(statbuf.st_size + 1);
															
 
																+        fread(source, statbuf.st_size, 1, fh);
															
 
																+        source[statbuf.st_size] = '\0';
															
 
																+
															
 
																+        fclose(fh);
															
 
																+
															
 
																+        return source;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+int _starpu_opencl_store_program_binary(const char *filename, const char *binary, size_t len)
															
 
																+{
															
 
																+        FILE *fh;
															
 
																+
															
 
																+        fh = fopen(filename, "w");
															
 
																+        if(fh == NULL) {
															
 
																+                perror("fopen"); return EXIT_FAILURE;
															
 
																+        }
															
 
																+
															
 
																+        fwrite(binary, len, 1, fh);
															
 
																+        fclose(fh);
															
 
																+
															
 
																+        return EXIT_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+int _starpu_opencl_compile_source_to_opencl(char *source_file_name)
															
 
																+{
															
 
																+        int              err;
															
 
																+        int              device_type = CL_DEVICE_TYPE_ALL;
															
 
																+        cl_device_id     devices[STARPU_MAXOPENCLDEVS];
															
 
																+        unsigned         max = STARPU_MAXOPENCLDEVS;
															
 
																+        unsigned         nb_devices = 0;
															
 
																+        cl_uint          history[STARPU_MAXOPENCLDEVS]; // To track similar devices
															
 
																+        char             preproc_file_name[1024];
															
 
																+        char             located_file_name[1024];
															
 
																+        cl_platform_id   platform_ids[STARPU_OPENCL_PLATFORM_MAX];
															
 
																+        cl_uint          platform, nb_platforms;
															
 
																+
															
 
																+        // Locate source file
															
 
																+        _starpu_opencl_locate_file(source_file_name, located_file_name);
															
 
																+        _STARPU_OPENCL_DEBUG("Source file name : <%s>\n", located_file_name);
															
 
																+
															
 
																+        // Prepare preprocessor temporary filename
															
 
																+        {
															
 
																+                char *p;
															
 
																+                strcpy(preproc_file_name, located_file_name);
															
 
																+                p = strstr(preproc_file_name, ".cl");
															
 
																+                if(p == NULL)
															
 
																+                        OPENCL_ERROR("Kernel file name doesn't have the '.cl' extension!\n");
															
 
																+                strcpy(p, ".pre");
															
 
																+        }
															
 
																+
															
 
																+        // Get Platforms
															
 
																+        err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_ids, &nb_platforms);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+        // Iterate over each platform
															
 
																+        for(platform=0; platform<nb_platforms; platform++) {
															
 
																+                // Get devices
															
 
																+                err = clGetDeviceIDs(platform_ids[platform], device_type, max, devices, &nb_devices);
															
 
																+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+                if(nb_devices > max)
															
 
																+                        nb_devices = max;
															
 
																+
															
 
																+                // Iterate over each device
															
 
																+                unsigned int dev;
															
 
																+                for(dev = 0; dev < nb_devices; dev ++) {
															
 
																+                        cl_context       context;
															
 
																+                        cl_program       program;
															
 
																+                        cl_device_type   type;
															
 
																+                        cl_int           err;
															
 
																+                        cl_uint          uniqueid;
															
 
																+
															
 
																+                        uniqueid =_starpu_opencl_device_uniqueid(devices[dev]);
															
 
																+                        // Look up and update history (to avoid unuseful compilations in the case of identical devices)
															
 
																+                        {
															
 
																+                                unsigned int d;
															
 
																+                                for(d = 0; d < dev; d++)
															
 
																+                                        if(history[d] == uniqueid)
															
 
																+                                                break; // Just skip compiling for this device
															
 
																+                                if(d != dev)
															
 
																+                                        continue;
															
 
																+                                history[dev] = uniqueid;
															
 
																+                        }
															
 
																+
															
 
																+                        err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
															
 
																+                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+                        // Create a compute context
															
 
																+                        context = clCreateContext(0, 1, devices + dev, NULL, NULL, &err);
															
 
																+                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+                        // Run C preprocessor
															
 
																+                        {
															
 
																+                                pid_t pid;
															
 
																+                                pid = fork();
															
 
																+                                if(pid == 0) {
															
 
																+                                        execlp("cpp", "cpp", located_file_name, "-o", preproc_file_name, NULL);
															
 
																+                                        perror("execlp");
															
 
																+                                        exit(EXIT_FAILURE);
															
 
																+                                }
															
 
																+                                else {
															
 
																+                                        int status;
															
 
																+                                        waitpid(pid, &status, 0);
															
 
																+                                        if (WEXITSTATUS(status) != EXIT_SUCCESS)
															
 
																+                                                OPENCL_ERROR("Cannot preprocess file [%s]\n", located_file_name);
															
 
																+                                }
															
 
																+                        }
															
 
																+
															
 
																+                        // Load the compute program from disk into a cstring buffer
															
 
																+                        char *source = _starpu_opencl_load_program_source(preproc_file_name);
															
 
																+                        if(!source)
															
 
																+                                OPENCL_ERROR("Failed to load compute program from file <%s>!\n", preproc_file_name);
															
 
																+
															
 
																+                        // Create the compute program from the source buffer
															
 
																+                        program = clCreateProgramWithSource(context, 1, (const char **) & source, NULL, &err);
															
 
																+                        if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+                        // Build the program executable
															
 
																+                        err = clBuildProgram(program, 1, devices + dev, "-Werror -cl-mad-enable", NULL, NULL);
															
 
																+                        if (err != CL_SUCCESS) {
															
 
																+                                size_t len;
															
 
																+                                static char buffer[4096];
															
 
																+
															
 
																+                                fprintf(stderr, "Error: Failed to build program executable!\n");
															
 
																+                                clGetProgramBuildInfo(program, devices[dev], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
															
 
																+
															
 
																+                                fprintf(stderr, "%s\n", buffer);
															
 
																+                                return EXIT_FAILURE;
															
 
																+                        }
															
 
																+
															
 
																+                        // Store program binary
															
 
																+                        {
															
 
																+                                char     binary_file_name[1024];
															
 
																+                                char    *binary;
															
 
																+                                size_t   binary_len;
															
 
																+                                char    *p;
															
 
																+
															
 
																+                                strcpy(binary_file_name, located_file_name);
															
 
																+                                p = strstr(binary_file_name, ".cl");
															
 
																+                                if(p == NULL)
															
 
																+                                        OPENCL_ERROR("Input file name doesn't have the '.cl' extension!\n");
															
 
																+
															
 
																+                                strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
															
 
																+                                sprintf(p + strlen(p), "%u", uniqueid);
															
 
																+
															
 
																+                                err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
															
 
																+                                if(err != CL_SUCCESS)
															
 
																+                                        OPENCL_ERROR("Cannot get program binary size (err = %d)!\n", err);
															
 
																+
															
 
																+                                binary = malloc(binary_len);
															
 
																+
															
 
																+                                err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
															
 
																+                                if(err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+                                _starpu_opencl_store_program_binary(binary_file_name, binary, binary_len);
															
 
																+
															
 
																+                                free(binary);
															
 
																+
															
 
																+                                _STARPU_OPENCL_DEBUG("Binary file [%s] successfully built (%ld bytes).\n", binary_file_name, binary_len);
															
 
																+                        }
															
 
																+
															
 
																+                        clReleaseProgram(program);
															
 
																+                        clReleaseContext(context);
															
 
																+                }
															
 
																+        }
															
 
																+
															
 
																+        return EXIT_SUCCESS;
															
 
																+}
															
--- a/src/drivers/opencl/driver_opencl_utils.h
+++ b/src/drivers/opencl/driver_opencl_utils.h
@@ -0,0 +1,41 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __STARPU_OPENCL_UTILS_H__
															
 
																+#define __STARPU_OPENCL_UTILS_H__
															
 
																+
															
 
																+#include <config.h>
															
 
																+
															
 
																+#ifdef STARPU_VERBOSE
															
 
																+#  define _STARPU_OPENCL_DEBUG(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
															
 
																+#else
															
 
																+#  define _STARPU_OPENCL_DEBUG(fmt, args ...)
															
 
																+#endif
															
 
																+
															
 
																+#define _STARPU_OPENCL_DISP(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
															
 
																+
															
 
																+#define OPENCL_ERROR(fmt, args ...)                                                   \
															
 
																+	do {                                                                          \
															
 
																+                fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args); \
															
 
																+		assert(0);                                                            \
															
 
																+	} while (0)
															
 
																+
															
 
																+#define STARPU_OPENCL_PLATFORM_MAX 4
															
 
																+
															
 
																+int _starpu_opencl_init_programs(int dev);
															
 
																+int _starpu_opencl_release_programs(int dev);
															
 
																+
															
 
																+#endif /* __STARPU_OPENCL_UTILS_H__ */
															
--- a/src/util/execute_on_all.c
+++ b/src/util/execute_on_all.c
@@ -39,6 +39,7 @@ void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t whe
 
																 		.where = where,
															
 
																 		.cuda_func = wrapper_func,
															
 
																 		.cpu_func = wrapper_func,
															
 
																+		.opencl_func = wrapper_func,
															
 
																 		/* XXX we do not handle Cell .. */
															
 
																 		.nbuffers = 0,
															
 
																 		.model = NULL
															
--- a/src/util/malloc.c
+++ b/src/util/malloc.c
@@ -24,13 +24,28 @@
 
																 #include <cuda.h>
															
 
																 #endif
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <drivers/opencl/driver_opencl.h>
															
 
																+#endif
															
 
																+
															
 
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																 struct malloc_pinned_codelet_struct {
															
 
																 	void **ptr;
															
 
																 	size_t dim;
															
 
																 };
															
 
																+#endif
															
 
																+
															
 
																+//#ifdef STARPU_USE_OPENCL
															
 
																+//static void malloc_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
															
 
																+//{
															
 
																+//	struct malloc_pinned_codelet_struct *s = arg;
															
 
																+//        //        *(s->ptr) = malloc(s->dim);
															
 
																+//        _starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
															
 
																+//}
															
 
																+//#endif
															
 
																-static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+static void malloc_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																 	struct malloc_pinned_codelet_struct *s = arg;
															
@@ -39,10 +54,16 @@ static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void
 
																 	if (STARPU_UNLIKELY(cures))
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 }
															
 
																+#endif
															
 
																+#if defined(STARPU_USE_CUDA)// || defined(STARPU_USE_OPENCL)
															
 
																 static starpu_codelet malloc_pinned_cl = {
															
 
																-	.where = STARPU_CUDA,
															
 
																-	.cuda_func = malloc_pinned_codelet,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_func = malloc_pinned_cuda_codelet,
															
 
																+#endif
															
 
																+//#ifdef STARPU_USE_OPENCL
															
 
																+//	.opencl_func = malloc_pinned_opencl_codelet,
															
 
																+//#endif
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
@@ -59,24 +80,47 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 
																 	{
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		int push_res;
															
 
																-	
															
 
																+
															
 
																 		struct malloc_pinned_codelet_struct s = {
															
 
																 			.ptr = A,
															
 
																 			.dim = dim
															
 
																-		};	
															
 
																-	
															
 
																+		};
															
 
																+
															
 
																+                malloc_pinned_cl.where = STARPU_CUDA;
															
 
																 		struct starpu_task *task = starpu_task_create();
															
 
																-			task->callback_func = NULL; 
															
 
																+			task->callback_func = NULL;
															
 
																 			task->cl = &malloc_pinned_cl;
															
 
																 			task->cl_arg = &s;
															
 
																 		task->synchronous = 1;
															
 
																-	
															
 
																+
															
 
																 		push_res = starpu_submit_task(task);
															
 
																 		STARPU_ASSERT(push_res != -ENODEV);
															
 
																 #endif
															
 
																 	}
															
 
																-	else {
															
 
																+//	else if (_starpu_may_submit_opencl_task())
															
 
																+//	{
															
 
																+//#ifdef STARPU_USE_OPENCL
															
 
																+//		int push_res;
															
 
																+//
															
 
																+//		struct malloc_pinned_codelet_struct s = {
															
 
																+//			.ptr = A,
															
 
																+//			.dim = dim
															
 
																+//		};
															
 
																+//
															
 
																+//                malloc_pinned_cl.where = STARPU_OPENCL;
															
 
																+//		struct starpu_task *task = starpu_task_create();
															
 
																+//			task->callback_func = NULL;
															
 
																+//			task->cl = &malloc_pinned_cl;
															
 
																+//			task->cl_arg = &s;
															
 
																+//
															
 
																+//		task->synchronous = 1;
															
 
																+//
															
 
																+//		push_res = starpu_submit_task(task);
															
 
																+//		STARPU_ASSERT(push_res != -ENODEV);
															
 
																+//#endif
															
 
																+//        }
															
 
																+        else {
															
 
																 		*A = malloc(dim);
															
 
																 	}
															
@@ -86,17 +130,32 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 
																 }
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static void free_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
															
 
																+static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																 	cudaError_t cures;
															
 
																 	cures = cudaFreeHost(arg);
															
 
																 	if (STARPU_UNLIKELY(cures))
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 }
															
 
																+#endif
															
 
																+
															
 
																+//#ifdef STARPU_USE_OPENCL
															
 
																+//static void free_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
															
 
																+//{
															
 
																+//        //        free(arg);
															
 
																+//        int err = clReleaseMemObject(arg);
															
 
																+//        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+//}
															
 
																+//#endif
															
 
																+#if defined(STARPU_USE_CUDA) // || defined(STARPU_USE_OPENCL)
															
 
																 static starpu_codelet free_pinned_cl = {
															
 
																-	.where = STARPU_CUDA,
															
 
																-	.cuda_func = free_pinned_codelet,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_func = free_pinned_cuda_codelet,
															
 
																+#endif
															
 
																+//#ifdef STARPU_USE_OPENCL
															
 
																+//	.opencl_func = free_pinned_opencl_codelet,
															
 
																+//#endif
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
@@ -111,18 +170,36 @@ int starpu_free_pinned_if_possible(void *A)
 
																 	{
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		int push_res;
															
 
																-	
															
 
																+
															
 
																+                free_pinned_cl.where = STARPU_CUDA;
															
 
																 		struct starpu_task *task = starpu_task_create();
															
 
																-			task->callback_func = NULL; 
															
 
																+			task->callback_func = NULL;
															
 
																 			task->cl = &free_pinned_cl;
															
 
																 			task->cl_arg = A;
															
 
																 		task->synchronous = 1;
															
 
																-	
															
 
																+
															
 
																 		push_res = starpu_submit_task(task);
															
 
																 		STARPU_ASSERT(push_res != -ENODEV);
															
 
																 #endif
															
 
																 	}
															
 
																+//	else if (_starpu_may_submit_opencl_task())
															
 
																+//	{
															
 
																+//#ifdef STARPU_USE_OPENCL
															
 
																+//		int push_res;
															
 
																+//
															
 
																+//                free_pinned_cl.where = STARPU_OPENCL;
															
 
																+//		struct starpu_task *task = starpu_task_create();
															
 
																+//			task->callback_func = NULL;
															
 
																+//			task->cl = &free_pinned_cl;
															
 
																+//			task->cl_arg = A;
															
 
																+//
															
 
																+//		task->synchronous = 1;
															
 
																+//
															
 
																+//		push_res = starpu_submit_task(task);
															
 
																+//		STARPU_ASSERT(push_res != -ENODEV);
															
 
																+//#endif
															
 
																+//	}
															
 
																 	else {
															
 
																 		free(A);
															
 
																 	}
															
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -33,7 +33,7 @@ if STARPU_USE_CUDA
 
																 # TODO define NVCCFLAGS
															
 
																 NVCC ?= nvcc
															
 
																-NVCCFLAGS += -I$(top_srcdir)/include/
															
 
																+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
															
 
																 .cu.cubin:
															
 
																 	$(MKDIR_P) `dirname $@`
															
@@ -246,6 +246,11 @@ datawizard_sync_and_notify_data_SOURCES +=	\
 
																 	datawizard/sync_and_notify_data_kernels.cu
															
 
																 endif
															
 
																+if STARPU_USE_OPENCL
															
 
																+datawizard_sync_and_notify_data_SOURCES +=	\
															
 
																+	datawizard/sync_and_notify_data_opencl.c
															
 
																+endif
															
 
																+
															
 
																 if STARPU_USE_GORDON
															
 
																 datawizard_sync_and_notify_data_SOURCES +=	\
															
 
																 	datawizard/sync_and_notify_data_gordon_kernels.c
															
--- a/tests/core/empty_task_sync_point.c
+++ b/tests/core/empty_task_sync_point.c
@@ -34,9 +34,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/execute_on_a_specific_worker.c
+++ b/tests/core/execute_on_a_specific_worker.c
@@ -71,9 +71,10 @@ static starpu_access_mode select_random_mode(void)
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = codelet_null,
															
 
																 	.cuda_func = codelet_null,
															
 
																+        .opencl_func = codelet_null,
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/core/multithreaded.c
+++ b/tests/core/multithreaded.c
@@ -33,9 +33,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/regenerate.c
+++ b/tests/core/regenerate.c
@@ -51,9 +51,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/starpu_wait_all_tasks.c
+++ b/tests/core/starpu_wait_all_tasks.c
@@ -28,9 +28,10 @@ static void dummy_func(void *descr[], void *arg)
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 	.gordon_func = 0, /* this will be defined later */
															
 
																 #endif
															
@@ -70,9 +71,11 @@ static struct starpu_conf conf = {
 
																 	.sched_policy_name = NULL,
															
 
																 	.ncpus = -1,
															
 
																 	.ncuda = -1,
															
 
																+        .nopencl = -1,
															
 
																 	.nspus = -1,
															
 
																 	.use_explicit_workers_bindid = 0,
															
 
																-	.use_explicit_workers_gpuid = 0,
															
 
																+	.use_explicit_workers_cuda_gpuid = 0,
															
 
																+	.use_explicit_workers_opencl_gpuid = 0,
															
 
																 	.calibrate = 0
															
 
																 };
															
--- a/tests/core/starpu_wait_task.c
+++ b/tests/core/starpu_wait_task.c
@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
@@ -86,7 +87,8 @@ int main(int argc, char **argv)
 
																 		int ret = starpu_submit_task(task);
															
 
																 		STARPU_ASSERT(!ret);
															
 
																-		starpu_wait_task(task);
															
 
																+		ret = starpu_wait_task(task);
															
 
																+		STARPU_ASSERT(!ret);
															
 
																 		starpu_task_destroy(task);
															
 
																 	}
															
--- a/tests/core/static_restartable.c
+++ b/tests/core/static_restartable.c
@@ -28,10 +28,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																-	.model = NULL,
															
 
																+	.opencl_func = dummy_func,
															
 
																+        .model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/static_restartable_tag.c
+++ b/tests/core/static_restartable_tag.c
@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/static_restartable_using_initializer.c
+++ b/tests/core/static_restartable_using_initializer.c
@@ -31,9 +31,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/subgraph_repeat.c
+++ b/tests/core/subgraph_repeat.c
@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/subgraph_repeat_regenerate.c
+++ b/tests/core/subgraph_repeat_regenerate.c
@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/tag_wait_api.c
+++ b/tests/core/tag_wait_api.c
@@ -26,9 +26,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/core/task_wait_api.c
+++ b/tests/core/task_wait_api.c
@@ -26,10 +26,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																-	.model = NULL,
															
 
																+	.opencl_func = dummy_func,
															
 
																+        .model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/datawizard/dining_philosophers.c
+++ b/tests/datawizard/dining_philosophers.c
@@ -27,9 +27,10 @@ static void eat_kernel(void *descr[], void *arg)
 
																 }
															
 
																 static starpu_codelet eating_cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cuda_func = eat_kernel,
															
 
																 	.cpu_func = eat_kernel,
															
 
																+        .opencl_func = eat_kernel,
															
 
																 	.nbuffers = 2
															
 
																 };
															
--- a/tests/datawizard/dsm_stress.c
+++ b/tests/datawizard/dsm_stress.c
@@ -54,6 +54,10 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg
 
																 {
															
 
																 }
															
 
																+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+}
															
 
																+
															
 
																 static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																 {
															
 
																 }
															
@@ -74,9 +78,10 @@ static starpu_access_mode select_random_mode(void)
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = cpu_codelet_null,
															
 
																 	.cuda_func = cuda_codelet_null,
															
 
																+        .opencl_func = opencl_codelet_null,
															
 
																 	.nbuffers = 2
															
 
																 };
															
--- a/tests/datawizard/readers_and_writers.c
+++ b/tests/datawizard/readers_and_writers.c
@@ -24,9 +24,10 @@ static void dummy_kernel(void *descr[], void *arg)
 
																 }
															
 
																 static starpu_codelet rw_cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cuda_func = dummy_kernel,
															
 
																 	.cpu_func = dummy_kernel,
															
 
																+	.opencl_func = dummy_kernel,
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/datawizard/sync_and_notify_data.c
+++ b/tests/datawizard/sync_and_notify_data.c
@@ -46,6 +46,11 @@ void cuda_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
 
																 void cuda_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																+#endif
															
 
																+
															
 
																 #define VECTORSIZE	16
															
 
																 starpu_data_handle v_handle;
															
@@ -79,8 +84,12 @@ int main(int argc, char **argv)
 
																 	fprintf(stderr, "kernel incA %d incC %d elf %d\n", kernel_incA_id, kernel_incC_id, elf_id);
															
 
																 #endif
															
 
																-	
															
 
																-	starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        _starpu_opencl_compile_source_to_opencl("tests/datawizard/sync_and_notify_data_opencl_codelet.cl");
															
 
																+#endif
															
 
																+
															
 
																+        starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
															
 
																 	unsigned iter;
															
 
																 	for (iter = 0; iter < K; iter++)
															
@@ -91,11 +100,14 @@ int main(int argc, char **argv)
 
																 		{
															
 
																 			/* increment a = v[0] */
															
 
																 			starpu_codelet cl_inc_a = {
															
 
																-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
															
 
																+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
															
 
																 				.cpu_func = cpu_codelet_incA,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 				.cuda_func = cuda_codelet_incA,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+				.opencl_func = opencl_codelet_incA,
															
 
																+#endif
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 				.gordon_func = kernel_incA_id,
															
 
																 #endif
															
@@ -127,11 +139,14 @@ int main(int argc, char **argv)
 
																 		{
															
 
																 			/* increment c = v[2] */
															
 
																 			starpu_codelet cl_inc_c = {
															
 
																-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
															
 
																+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
															
 
																 				.cpu_func = cpu_codelet_incC,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 				.cuda_func = cuda_codelet_incC,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+				.opencl_func = opencl_codelet_incC,
															
 
																+#endif
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 				.gordon_func = kernel_incC_id,
															
 
																 #endif
															
--- a/tests/datawizard/sync_and_notify_data_opencl.c
+++ b/tests/datawizard/sync_and_notify_data_opencl.c
@@ -0,0 +1,77 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+#include <CL/cl.h>
															
 
																+
															
 
																+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+        unsigned *val = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err;
															
 
																+
															
 
																+	id = starpu_get_worker_id();
															
 
																+	devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incA", devid);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global=100;
															
 
																+		size_t local=100;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+	starpu_opencl_release(kernel);
															
 
																+}
															
 
																+
															
 
																+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	unsigned *val = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+	cl_kernel kernel;
															
 
																+	cl_command_queue queue;
															
 
																+	int id, devid, err;
															
 
																+
															
 
																+	id = starpu_get_worker_id();
															
 
																+	devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incC", devid);
															
 
																+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	err = 0;
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+	{
															
 
																+		size_t global=100;
															
 
																+		size_t local=100;
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+	}
															
 
																+
															
 
																+	clFinish(queue);
															
 
																+
															
 
																+	starpu_opencl_release(kernel);
															
 
																+}
															
--- a/tests/datawizard/sync_and_notify_data_opencl_codelet.cl
+++ b/tests/datawizard/sync_and_notify_data_opencl_codelet.cl
@@ -0,0 +1,30 @@
 
																+/*
															
 
																+ * StarPU
															
 
																+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
															
 
																+ *
															
 
																+ * This program is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * This program is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+__kernel void incA(__global unsigned* input) 
															
 
																+{
															
 
																+	const int i = get_global_id(0);
															
 
																+	if (i == 0)
															
 
																+		input[i] ++;
															
 
																+}
															
 
																+
															
 
																+__kernel void incC(__global unsigned* input) 
															
 
																+{
															
 
																+	const int i = get_global_id(0);
															
 
																+	if (i == 2)
															
 
																+		input[i] ++;
															
 
																+}
															
 
																+
															
--- a/tests/datawizard/sync_with_data_with_mem.c
+++ b/tests/datawizard/sync_with_data_with_mem.c
@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
																 }
															
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_codelet,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = dummy_codelet,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        .opencl_func = dummy_codelet,
															
 
																+#endif
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/datawizard/sync_with_data_with_mem_non_blocking.c
+++ b/tests/datawizard/sync_with_data_with_mem_non_blocking.c
@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
																 }
															
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_codelet,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = dummy_codelet,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_func = dummy_codelet,
															
 
																+#endif
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/datawizard/unpartition.c
+++ b/tests/datawizard/unpartition.c
@@ -32,11 +32,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
																 }
															
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_codelet,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = dummy_codelet,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_func = dummy_codelet,
															
 
																+#endif
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/datawizard/write_only_tmp_buffer.c
+++ b/tests/datawizard/write_only_tmp_buffer.c
@@ -24,6 +24,21 @@
 
																 starpu_data_handle v_handle;
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+#include <CL/cl.h>
															
 
																+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	cl_mem buf = (cl_mem)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																+        char ptr = 42;
															
 
																+        cl_command_queue queue;
															
 
																+        int id = starpu_get_worker_id();
															
 
																+        int devid = starpu_get_worker_devid(id);
															
 
																+
															
 
																+        starpu_opencl_get_queue(devid, &queue);
															
 
																+        clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, sizeof(char), &ptr, 0, NULL, NULL);
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																 {
															
@@ -51,11 +66,14 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 
																 }
															
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = cpu_codelet_null,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = cuda_codelet_null,
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_func = opencl_codelet_null,
															
 
																+#endif
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/errorcheck/invalid_blocking_calls.c
+++ b/tests/errorcheck/invalid_blocking_calls.c
@@ -38,9 +38,10 @@ static void wrong_func(void *descr[], void *arg)
 
																 static starpu_codelet wrong_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = wrong_func,
															
 
																 	.cuda_func = wrong_func,
															
 
																+        .opencl_func = wrong_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/errorcheck/starpu_init_noworker.c
+++ b/tests/errorcheck/starpu_init_noworker.c
@@ -29,9 +29,11 @@ int main(int argc, char **argv)
 
																 		.sched_policy_name = NULL, /* default */
															
 
																 		.ncpus = 0,
															
 
																 		.ncuda = 0,
															
 
																+                .nopencl = 0,
															
 
																 		.nspus = 0,
															
 
																 		.use_explicit_workers_bindid = 0,
															
 
																-		.use_explicit_workers_gpuid = 0,
															
 
																+		.use_explicit_workers_cuda_gpuid = 0,
															
 
																+		.use_explicit_workers_opencl_gpuid = 0,
															
 
																 		.calibrate = 0
															
 
																 	};
															
--- a/tests/helper/execute_on_all.c
+++ b/tests/helper/execute_on_all.c
@@ -32,12 +32,14 @@ int main(int argc, char **argv)
 
																 	int arg = 0x42;
															
 
																-	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA);
															
 
																+	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA|STARPU_OPENCL);
															
 
																 	starpu_execute_on_each_worker(func, &arg, STARPU_CPU);
															
 
																 	starpu_execute_on_each_worker(func, &arg, STARPU_CUDA);
															
 
																+        starpu_execute_on_each_worker(func, &arg, STARPU_OPENCL);
															
 
																+
															
 
																 	starpu_shutdown();
															
 
																 	return 0;
															
--- a/tests/helper/starpu_create_sync_task.c
+++ b/tests/helper/starpu_create_sync_task.c
@@ -25,9 +25,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet =
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+        .opencl_func = dummy_func,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/microbenchs/async_tasks_overhead.c
+++ b/tests/microbenchs/async_tasks_overhead.c
@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+        .opencl_func = dummy_func,
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 	.gordon_func = 0, /* this will be defined later */
															
 
																 #endif
															
@@ -91,9 +92,11 @@ static struct starpu_conf conf = {
 
																 	.sched_policy_name = NULL,
															
 
																 	.ncpus = -1,
															
 
																 	.ncuda = -1,
															
 
																+        .nopencl = -1,
															
 
																 	.nspus = -1,
															
 
																 	.use_explicit_workers_bindid = 0,
															
 
																-	.use_explicit_workers_gpuid = 0,
															
 
																+	.use_explicit_workers_cuda_gpuid = 0,
															
 
																+	.use_explicit_workers_opencl_gpuid = 0,
															
 
																 	.calibrate = 0
															
 
																 };
															
--- a/tests/microbenchs/prefetch_data_on_node.c
+++ b/tests/microbenchs/prefetch_data_on_node.c
@@ -75,9 +75,10 @@ static starpu_access_mode select_random_mode(void)
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = codelet_null,
															
 
																 	.cuda_func = codelet_null,
															
 
																+	.opencl_func = codelet_null,
															
 
																 	.nbuffers = 1
															
 
																 };
															
--- a/tests/microbenchs/redundant_buffer.c
+++ b/tests/microbenchs/redundant_buffer.c
@@ -27,6 +27,10 @@
 
																 starpu_data_handle v_handle;
															
 
																 static unsigned *v;
															
 
																+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+}
															
 
																+
															
 
																 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																 {
															
 
																 }
															
@@ -36,10 +40,11 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 
																 }
															
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = cpu_codelet_null,
															
 
																 	.cuda_func = cuda_codelet_null,
															
 
																-	.nbuffers = 2
															
 
																+	.opencl_func = opencl_codelet_null,
															
 
																+        .nbuffers = 2
															
 
																 };
															
--- a/tests/microbenchs/sync_tasks_overhead.c
+++ b/tests/microbenchs/sync_tasks_overhead.c
@@ -28,9 +28,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+        .opencl_func = dummy_func,
															
 
																 #ifdef STARPU_USE_GORDON
															
 
																 	.gordon_func = 0, /* this will be defined later */
															
 
																 #endif
															
--- a/tests/microbenchs/tasks_overhead.c
+++ b/tests/microbenchs/tasks_overhead.c
@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
																 static starpu_codelet dummy_codelet = 
															
 
																 {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = dummy_func,
															
 
																 	.cuda_func = dummy_func,
															
 
																+	.opencl_func = dummy_func,
															
 
																 	.model = NULL,
															
 
																 	.nbuffers = 0
															
 
																 };
															
--- a/tests/overlap/overlap.c
+++ b/tests/overlap/overlap.c
@@ -59,9 +59,10 @@ static struct starpu_perfmodel_t model = {
 
																 };
															
 
																 static starpu_codelet cl = {
															
 
																-	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
															
 
																 	.cpu_func = codelet_sleep,
															
 
																 	.cuda_func = codelet_sleep,
															
 
																+        .opencl_func = codelet_sleep,
															
 
																 	.nbuffers = 1,
															
 
																 	.model =  &model
															
 
																 };