Browse Source

merge branches/opencl --> trunk

Nathalie Furmento 15 years ago
parent
commit
59c29c44ec
90 changed files with 4205 additions and 192 deletions
  1. 1 0
      Makefile.am
  2. 79 6
      configure.ac
  3. 13 1
      doc/starpu.texi
  4. 91 5
      examples/Makefile.am
  5. 167 0
      examples/block/block.c
  6. 34 0
      examples/block/block_cuda.cu
  7. 23 0
      examples/block/block_kernel.cl
  8. 1 1
      examples/heat/Makefile.in
  9. 23 10
      examples/incrementer/incrementer.c
  10. 49 0
      examples/incrementer/incrementer_kernels_opencl.c
  11. 23 0
      examples/incrementer/incrementer_kernels_opencl_codelet.cl
  12. 193 0
      examples/matvecmult/matvecmult.c
  13. 48 0
      examples/matvecmult/matvecmult_kernel.cl
  14. 2 2
      examples/ppm_downscaler/yuv_downscaler.c
  15. 76 2
      examples/spmv/dw_spmv.c
  16. 0 1
      examples/spmv/dw_spmv.h
  17. 42 0
      examples/spmv/spmv_opencl.cl
  18. 147 0
      examples/stencil/stencil.c
  19. 43 0
      examples/stencil/stencil.h
  20. 66 0
      examples/stencil/stencil_opencl.c
  21. 100 0
      examples/variable/variable.c
  22. 30 0
      examples/variable/variable_kernels.cu
  23. 48 0
      examples/variable/variable_kernels_opencl.c
  24. 23 0
      examples/variable/variable_kernels_opencl_codelet.cl
  25. 14 2
      include/starpu.h
  26. 4 2
      include/starpu_config.h.in
  27. 6 0
      include/starpu_data_interfaces.h
  28. 192 0
      include/starpu_opencl.h
  29. 4 2
      include/starpu_perfmodel.h
  30. 2 0
      include/starpu_task.h
  31. 5 0
      src/Makefile.am
  32. 1 0
      src/core/jobs.h
  33. 3 0
      src/core/perfmodel/perfmodel.c
  34. 6 3
      src/core/perfmodel/perfmodel.h
  35. 335 66
      src/core/perfmodel/perfmodel_bus.c
  36. 173 21
      src/core/topology.c
  37. 28 0
      src/core/workers.c
  38. 16 3
      src/core/workers.h
  39. 6 4
      src/datawizard/coherency.c
  40. 67 0
      src/datawizard/copy_driver.c
  41. 23 0
      src/datawizard/copy_driver.h
  42. 121 0
      src/datawizard/interfaces/bcsr_interface.c
  43. 122 0
      src/datawizard/interfaces/block_interface.c
  44. 114 0
      src/datawizard/interfaces/csr_interface.c
  45. 4 0
      src/datawizard/interfaces/matrix_filters.c
  46. 120 1
      src/datawizard/interfaces/matrix_interface.c
  47. 109 0
      src/datawizard/interfaces/variable_interface.c
  48. 8 0
      src/datawizard/interfaces/vector_filters.c
  49. 118 2
      src/datawizard/interfaces/vector_interface.c
  50. 2 1
      src/datawizard/memory_nodes.h
  51. 422 0
      src/drivers/opencl/driver_opencl.c
  52. 57 0
      src/drivers/opencl/driver_opencl.h
  53. 430 0
      src/drivers/opencl/driver_opencl_utils.c
  54. 41 0
      src/drivers/opencl/driver_opencl_utils.h
  55. 1 0
      src/util/execute_on_all.c
  56. 93 16
      src/util/malloc.c
  57. 6 1
      tests/Makefile.am
  58. 2 1
      tests/core/empty_task_sync_point.c
  59. 2 1
      tests/core/execute_on_a_specific_worker.c
  60. 2 1
      tests/core/multithreaded.c
  61. 2 1
      tests/core/regenerate.c
  62. 5 2
      tests/core/starpu_wait_all_tasks.c
  63. 4 2
      tests/core/starpu_wait_task.c
  64. 3 2
      tests/core/static_restartable.c
  65. 2 1
      tests/core/static_restartable_tag.c
  66. 2 1
      tests/core/static_restartable_using_initializer.c
  67. 2 1
      tests/core/subgraph_repeat.c
  68. 2 1
      tests/core/subgraph_repeat_regenerate.c
  69. 2 1
      tests/core/tag_wait_api.c
  70. 3 2
      tests/core/task_wait_api.c
  71. 2 1
      tests/datawizard/dining_philosophers.c
  72. 6 1
      tests/datawizard/dsm_stress.c
  73. 2 1
      tests/datawizard/readers_and_writers.c
  74. 19 4
      tests/datawizard/sync_and_notify_data.c
  75. 77 0
      tests/datawizard/sync_and_notify_data_opencl.c
  76. 30 0
      tests/datawizard/sync_and_notify_data_opencl_codelet.cl
  77. 4 1
      tests/datawizard/sync_with_data_with_mem.c
  78. 4 1
      tests/datawizard/sync_with_data_with_mem_non_blocking.c
  79. 4 1
      tests/datawizard/unpartition.c
  80. 19 1
      tests/datawizard/write_only_tmp_buffer.c
  81. 2 1
      tests/errorcheck/invalid_blocking_calls.c
  82. 3 1
      tests/errorcheck/starpu_init_noworker.c
  83. 3 1
      tests/helper/execute_on_all.c
  84. 2 1
      tests/helper/starpu_create_sync_task.c
  85. 5 2
      tests/microbenchs/async_tasks_overhead.c
  86. 2 1
      tests/microbenchs/prefetch_data_on_node.c
  87. 7 2
      tests/microbenchs/redundant_buffer.c
  88. 2 1
      tests/microbenchs/sync_tasks_overhead.c
  89. 2 1
      tests/microbenchs/tasks_overhead.c
  90. 2 1
      tests/overlap/overlap.c

+ 1 - 0
Makefile.am

@@ -34,4 +34,5 @@ include_HEADERS = 				\
 	include/starpu_data.h			\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
 	include/starpu_util.h			\
+	include/starpu_opencl.h			\
 	include/starpu_expert.h
 	include/starpu_expert.h

+ 79 - 6
configure.ac

@@ -52,8 +52,10 @@ esac
 
 
 # This will be useful for program which use CUDA (and .cubin files) which need
 # This will be useful for program which use CUDA (and .cubin files) which need
 # some path to the CUDA code at runtime.
 # some path to the CUDA code at runtime.
-AC_DEFINE_UNQUOTED(STARPU_DIR, "$PWD", [location of StarPU sources])
-AC_SUBST(STARPU_DIR, $PWD)
+AC_DEFINE_UNQUOTED(STARPU_BUILD_DIR, "$PWD", [location of StarPU build directory])
+AC_SUBST(STARPU_BUILD_DIR, $PWD)
+AC_DEFINE_UNQUOTED(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})", [location of StarPU sources])
+AC_SUBST(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})")
 
 
 AC_CHECK_LIB([pthread], [pthread_create])
 AC_CHECK_LIB([pthread], [pthread_create])
 case "$target" in
 case "$target" in
@@ -300,6 +302,71 @@ fi
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
+#                                 OpenCL settings                             #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of OpenCL devices)
+AC_ARG_ENABLE(maxopencldev, [AS_HELP_STRING([--enable-maxopencldev=<number>],
+			[maximum number of OPENCL devices])],
+			nmaxopencldev=$enableval, nmaxopencldev=8)
+AC_MSG_RESULT($nmaxopencldev)
+AC_DEFINE_UNQUOTED(STARPU_MAXOPENCLDEVS, [$nmaxopencldev], 
+		[maximum number of OPENCL devices])
+AC_ARG_ENABLE(opencl, [AS_HELP_STRING([--disable-opencl],
+		[do not use OpenCL device(s)])],, [enable_opencl=maybe])
+
+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
+	#AC_MSG_CHECKING(whether OpenCL is available)
+	AC_ARG_WITH(opencl-dir, 
+		[AS_HELP_STRING([--with-opencl-dir=<path>],
+		[specify OpenCL installation directory (default is /usr/)])],
+		[
+			opencl_dir=$withval
+			# in case this was not explicit yet
+			enable_opencl=yes
+		], opencl_dir=/usr/)
+	
+	if test -d "$opencl_dir/include/"; then
+		CPPFLAGS="${CPPFLAGS} -I$opencl_dir/include/ "
+	fi
+
+	# do we have a valid OpenCL setup ?
+	have_valid_opencl=yes
+	AC_CHECK_HEADER([CL/cl.h],,[have_valid_opencl=no])
+
+	# we are looking for the proper option in LDFLAGS, so we save the
+	# current value of LDFLAGS so that we can add new things in it and
+	# restore it in case it's not working.
+	SAVED_LDFLAGS="${LDFLAGS}"
+
+	found_opencllib=no
+	if test -d "$opencl_dir/lib/"; then
+		LDFLAGS="${SAVED_LDFLAGS} -L$opencl_dir/lib/ "
+		AC_SEARCH_LIBS([clCreateKernel],[OpenCL],[found_opencllib=yes],[found_opencllib=no])
+	fi
+
+	# in case OpenCL was explicitely required, but is not available, this is an error
+	if test x$enable_opencl = xyes -a x$have_valid_opencl = no; then
+		AC_MSG_ERROR([cannot find OpenCL])
+	fi
+
+	# now we enable OpenCL if and only if a proper setup is available
+	enable_opencl=$have_valid_opencl
+fi
+
+AC_MSG_CHECKING(whether OpenCL should be used)
+AC_MSG_RESULT($enable_opencl)
+AC_SUBST(STARPU_USE_OPENCL, $enable_opencl)
+AM_CONDITIONAL(STARPU_USE_OPENCL, test x$enable_opencl = xyes)
+if test x$enable_opencl = xyes; then
+	AC_DEFINE(STARPU_USE_OPENCL, [1], [OpenCL support is activated])
+        AC_DEFINE_UNQUOTED(STARPU_OPENCL_DATADIR, "$(eval echo ${datarootdir}/starpu/opencl)", [Path to OpenCL codelets])
+        AC_SUBST(STARPU_OPENCL_DATAdir, "$(eval echo ${datarootdir}/starpu/opencl/examples)")
+fi
+
+###############################################################################
+#                                                                             #
 #                                 Cell settings                               #
 #                                 Cell settings                               #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
@@ -506,13 +573,19 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 
 
 # We have one memory node shared by all CPU workers, one node per GPU, and
 # We have one memory node shared by all CPU workers, one node per GPU, and
 # currently the Cell driver is using the same memory node as the CPU.
 # currently the Cell driver is using the same memory node as the CPU.
-if test x$enable_cuda = xyes; then
+maxnodes=1
+if test x$enable_cuda = xyes ; then
 	# we could have used nmaxcudadev + 1, but this would certainly give an
 	# we could have used nmaxcudadev + 1, but this would certainly give an
 	# odd number.
 	# odd number.
-	maxnodes=`expr 2 \* $nmaxcudadev`
-else
-	maxnodes=1	
+	maxnodes=`expr $maxnodes + $nmaxcudadev`
 fi
 fi
+if test x$enable_opencl = xyes ; then
+	# we could have used nmaxcudadev + 1, but this would certainly give an
+	# odd number.
+	maxnodes=`expr $maxnodes + $nmaxopencldev`
+fi
+# todo: set maxnodes to the next power of 2 greater than maxnodes
+
 AC_MSG_CHECKING(maximum number of memory nodes)
 AC_MSG_CHECKING(maximum number of memory nodes)
 AC_MSG_RESULT($maxnodes)
 AC_MSG_RESULT($maxnodes)
 AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],
 AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],

+ 13 - 1
doc/starpu.texi

@@ -476,6 +476,10 @@ specified with the @code{STARPU_NCPUS} environment variable.
 This is the maximum number of CUDA devices that StarPU can use. This can also be
 This is the maximum number of CUDA devices that StarPU can use. This can also be
 specified with the @code{STARPU_NCUDA} environment variable.
 specified with the @code{STARPU_NCUDA} environment variable.
 
 
+@item @code{nopencl} (default = -1):
+This is the maximum number of OpenCL devices that StarPU can use. This can also be
+specified with the @code{STARPU_NOPENCL} environment variable.
+
 @item @code{nspus} (default = -1):
 @item @code{nspus} (default = -1):
 This is the maximum number of Cell SPUs that StarPU can use. This can also be
 This is the maximum number of Cell SPUs that StarPU can use. This can also be
 specified with the @code{STARPU_NGORDON} environment variable.
 specified with the @code{STARPU_NGORDON} environment variable.
@@ -600,7 +604,8 @@ an integer between 0 and @code{starpu_get_worker_count() - 1}.
 This function returns the type of worker associated to an identifier (as
 This function returns the type of worker associated to an identifier (as
 returned by the @code{starpu_get_worker_id} function). The returned value
 returned by the @code{starpu_get_worker_id} function). The returned value
 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
-core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
+core, @code{STARPU_CUDA_WORKER} for a CUDA device,
+@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
 @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
 @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
 identifier is unspecified.
 identifier is unspecified.
 
 
@@ -716,6 +721,13 @@ be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
 field is ignored if @code{STARPU_CUDA} does not appear in the @code{.where}
 field is ignored if @code{STARPU_CUDA} does not appear in the @code{.where}
 field, it must be non-null otherwise.
 field, it must be non-null otherwise.
 
 
+@item @code{opencl_func} (optionnal):
+Is a function pointer to the OpenCL implementation of the codelet. Its
+prototype must be:
+@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
+This pointer is ignored if @code{OPENCL} does not appear in the
+@code{.where} field, it must be non-null otherwise.
+
 @item @code{gordon_func} (optionnal):
 @item @code{gordon_func} (optionnal):
 This is the index of the Cell SPU implementation within the Gordon library.
 This is the index of the Cell SPU implementation within the Gordon library.
 TODO
 TODO

+ 91 - 5
examples/Makefile.am

@@ -431,11 +431,97 @@ check_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 	incrementer/incrementer
 	incrementer/incrementer
 
 
-if STARPU_USE_CUDA
-incrementer_incrementer_SOURCES =	\
-	incrementer/incrementer.c	\
-	incrementer/incrementer_kernels.cu
-else
 incrementer_incrementer_SOURCES =	\
 incrementer_incrementer_SOURCES =	\
 	incrementer/incrementer.c
 	incrementer/incrementer.c
+if STARPU_USE_CUDA
+incrementer_incrementer_SOURCES +=	\
+	incrementer/incrementer_kernels.cu
+endif
+if STARPU_USE_OPENCL
+incrementer_incrementer_SOURCES +=	\
+	incrementer/incrementer_kernels_opencl.c
+endif
+
+nobase_STARPU_OPENCL_DATA_DATA = \
+	incrementer/incrementer_kernels_opencl_codelet.cl
+
+###################
+# Stencil example #
+###################
+
+if STARPU_USE_OPENCL
+check_PROGRAMS +=		\
+	stencil/stencil
+examplebin_PROGRAMS +=		\
+	stencil/stencil
+stencil_stencil_SOURCES =	\
+	stencil/stencil.c       \
+	stencil/stencil_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	stencil/stencil_opencl_codelet.cl \
+	stencil/stencil.h
+endif
+
+####################
+# Variable example #
+####################
+
+check_PROGRAMS +=				\
+	variable/variable
+
+examplebin_PROGRAMS +=				\
+	variable/variable
+
+variable_variable_SOURCES =	\
+	variable/variable.c
+if STARPU_USE_CUDA
+variable_variable_SOURCES +=	\
+	variable/variable_kernels.cu
+endif
+if STARPU_USE_OPENCL
+variable_variable_SOURCES +=	\
+	variable/variable_kernels_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	variable/variable_kernels_opencl_codelet.cl
+endif
+
+######################
+# matVecMult example #
+######################
+
+check_PROGRAMS +=				\
+	matvecmult/matvecmult
+
+examplebin_PROGRAMS +=				\
+	matvecmult/matvecmult
+
+matvecmult_matvecmult_SOURCES =	\
+	matvecmult/matvecmult.c
+
+if STARPU_USE_OPENCL
+nobase_STARPU_OPENCL_DATA_DATA += \
+	matvecmult/matvecmult_kernel.cl
+endif
+
+#################
+# block example #
+#################
+
+check_PROGRAMS +=				\
+	block/block
+
+examplebin_PROGRAMS +=				\
+	block/block
+
+block_block_SOURCES =	\
+	block/block.c
+
+if STARPU_USE_CUDA
+block_block_SOURCES +=				\
+	block/block_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+nobase_STARPU_OPENCL_DATA_DATA += \
+	block/block_kernel.cl
 endif
 endif

+ 167 - 0
examples/block/block.c

@@ -0,0 +1,167 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <pthread.h>
+#include <math.h>
+
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
+        int i;
+
+        for(i=0 ; i<nx*ny*nz ; i++) block[i] *= *multiplier;
+}
+
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err, n;
+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/block/block_kernel.cl", "block", devid);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+        n=0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
+	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
+	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
+	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
+	err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &multiplier);
+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+                size_t global=1024;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+        starpu_opencl_release(kernel);
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+typedef void (*device_func)(void **, void *);
+
+int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny, int pnz, float multiplier)
+{
+	starpu_codelet cl;
+	starpu_data_handle block_handle;
+        starpu_data_handle multiplier_handle;
+        int i, j, k;
+
+	starpu_register_block_data(&block_handle, 0, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
+	starpu_register_variable_data(&multiplier_handle, 0, (uintptr_t)&multiplier, sizeof(float));
+
+	cl.where = where;
+        cl.cuda_func = func;
+        cl.cpu_func = func;
+        cl.opencl_func = func;
+        cl.nbuffers = 2;
+        cl.model = NULL;
+
+        struct starpu_task *task = starpu_task_create();
+        task->cl = &cl;
+        task->callback_func = NULL;
+        task->buffers[0].handle = block_handle;
+        task->buffers[0].mode = STARPU_RW;
+        task->buffers[1].handle = multiplier_handle;
+        task->buffers[1].mode = STARPU_RW;
+
+        int ret = starpu_submit_task(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                fprintf(stderr, "No worker may execute this task\n");
+                return 1;
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+        starpu_sync_data_with_mem(block_handle, STARPU_R);
+
+        for(i=0 ; i<pnx*pny*pnz; i++) {
+          fprintf(stderr, "%f ", block[i]);
+        }
+        fprintf(stderr, "\n");
+
+        starpu_release_data_from_mem(block_handle);
+
+        return 0;
+}
+
+int main(int argc, char **argv)
+{
+	starpu_codelet cl;
+        float *block;
+        int i, ret;
+        int nx=3;
+        int ny=2;
+        int nz=4;
+        float multiplier=1.0;
+
+        starpu_init(NULL);
+
+        block = (float*)malloc(nx*ny*nz*sizeof(float));
+        assert(block);
+        for(i=0 ; i<nx*ny*nz ; i++) block[i] = i+1;
+
+        ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
+        if (!ret) multiplier *= 1.0;
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("examples/block/block_kernel.cl");
+        ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
+        if (!ret) multiplier *= 2.0;
+#endif
+#ifdef STARPU_USE_CUDA
+        ret = execute_on(STARPU_CUDA, cuda_codelet, block, nx, ny, nz, 3.0);
+        if (!ret) multiplier *= 3.0;
+#endif
+
+        // Check result is correct
+        ret=1;
+        for(i=0 ; i<nx*ny*nz ; i++) {
+          if (block[i] != (i+1) * multiplier) {
+            ret=0;
+            break;
+          }
+        }
+
+        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
+        starpu_shutdown();
+
+	return 0;
+}

+ 34 - 0
examples/block/block_cuda.cu

@@ -0,0 +1,34 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+static __global__ void cuda_block(float *block, int nx, int ny, int nz, float *multiplier)
+{
+        int i;
+        for(i=0 ; i<nx*ny*nz ; i++) block[i] *= *multiplier;
+}
+
+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
+	int nx = STARPU_GET_BLOCK_NX(descr[0]);
+	int ny = STARPU_GET_BLOCK_NY(descr[0]);
+	int nz = STARPU_GET_BLOCK_NZ(descr[0]);
+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
+
+        cuda_block<<<1,1>>>(block, nx, ny, nz, multiplier);
+}

+ 23 - 0
examples/block/block_kernel.cl

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void block(__global float *b, int nx, int ny, int nz, __global float *multiplier)
+{
+        const int i = get_global_id(0);
+        if (i < nx*ny*nz) {
+                b[i] *= *multiplier;
+        }
+}

+ 1 - 1
examples/heat/Makefile.in

@@ -14,7 +14,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 #
 
 
-export PKG_CONFIG_PATH=@STARPU_DIR@
+export PKG_CONFIG_PATH=@STARPU_BUILD_DIR@
 
 
 LIBS+=$$(pkg-config --libs libstarpu)
 LIBS+=$$(pkg-config --libs libstarpu)
 CFLAGS+=$$(pkg-config --cflags libstarpu)
 CFLAGS+=$$(pkg-config --cflags libstarpu)

+ 23 - 10
examples/incrementer/incrementer.c

@@ -23,6 +23,10 @@ static unsigned niter = 50000;
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 #endif
 
 
+#ifdef STARPU_USE_OPENCL
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
 extern void cuda_codelet_host(float *tab);
 extern void cuda_codelet_host(float *tab);
 
 
 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
@@ -39,20 +43,27 @@ int main(int argc, char **argv)
 	if (argc == 2)
 	if (argc == 2)
 		niter = atoi(argv[1]);
 		niter = atoi(argv[1]);
 
 
-	float float_array[3] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
+	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
 
 
 	starpu_data_handle float_array_handle;
 	starpu_data_handle float_array_handle;
 	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
 	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
-			(uintptr_t)&float_array, 3, sizeof(float));
+			(uintptr_t)&float_array, 4, sizeof(float));
+
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("examples/incrementer/incrementer_kernels_opencl_codelet.cl");
+#endif
 
 
 	starpu_codelet cl =
 	starpu_codelet cl =
 	{
 	{
 		/* CUBLAS stands for CUDA kernels controlled from the host */
 		/* CUBLAS stands for CUDA kernels controlled from the host */
-		.where = STARPU_CPU|STARPU_CUDA,
+		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 		.cpu_func = cpu_codelet,
 		.cpu_func = cpu_codelet,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		.cuda_func = cuda_codelet,
 		.cuda_func = cuda_codelet,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_func = opencl_codelet,
+#endif
 		.nbuffers = 1
 		.nbuffers = 1
 	};
 	};
 
 
@@ -67,7 +78,7 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
 
 
 		task->cl = &cl;
 		task->cl = &cl;
-		
+
 		task->callback_func = NULL;
 		task->callback_func = NULL;
 
 
 		task->buffers[0].handle = float_array_handle;
 		task->buffers[0].handle = float_array_handle;
@@ -85,15 +96,17 @@ int main(int argc, char **argv)
 
 
 	/* update the array in RAM */
 	/* update the array in RAM */
 	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
 	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
-	
+
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
-	fprintf(stderr, "array -> %f, %f, %f\n", float_array[0], 
-			float_array[1], float_array[2]);
-	
-	if (float_array[0] != float_array[1] + float_array[2])
+	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
+                float_array[1], float_array[2], float_array[3]);
+
+	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
+		fprintf(stderr, "Incorrect result\n");
 		return 1;
 		return 1;
-	
+	}
+
 	starpu_release_data_from_mem(float_array_handle);
 	starpu_release_data_from_mem(float_array_handle);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +

+ 49 - 0
examples/incrementer/incrementer_kernels_opencl.c

@@ -0,0 +1,49 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <CL/cl.h>
+
+void opencl_codelet(void *descr[], void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/incrementer/incrementer_kernels_opencl_codelet.cl", "incrementer", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=4;
+		size_t local=4;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 23 - 0
examples/incrementer/incrementer_kernels_opencl_codelet.cl

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void incrementer(__global float* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 0 || i == 3)
+		input[i] = input[i] + 1.0;
+}
+

+ 193 - 0
examples/matvecmult/matvecmult.c

@@ -0,0 +1,193 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <pthread.h>
+#include <math.h>
+
+//static int width=1100;
+//static int height=244021;
+static int width=20;
+static int height=4;
+
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err, n;
+	float *matrix = (float *)STARPU_GET_MATRIX_PTR(descr[0]);
+	float *vector = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
+	float *mult = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/matvecmult/matvecmult_kernel.cl", "matVecMult", devid);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+        n=0;
+        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&width);
+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&height);
+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+                size_t global=512;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}
+
+void fillArray(float* pfData, int iSize) {
+    int i;
+    const float fScale = 1.0f / (float)RAND_MAX;
+    for (i = 0; i < iSize; ++i) {
+            pfData[i] = fScale * rand();
+    }
+}
+
+void printArray(float* pfData, int iSize) {
+    int i;
+    for (i = 0; i < iSize; ++i) {
+            fprintf(stderr, "%f ", pfData[i]);
+    }
+    fprintf(stderr, "\n");
+}
+
+void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+        double sum = 0;
+        for (j = 0; j < width; ++j) {
+            double a = matrix[i * width + j];
+            double b = vector[j];
+            sum += a * b;
+        }
+        mult[i] = (float)sum;
+    }
+}
+
+int compareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon) {
+    float error = 0;
+    float ref = 0;
+    unsigned int i;
+
+    for(i = 0; i < len; ++i) {
+        float diff = reference[i] - data[i];
+        error += diff * diff;
+        ref += reference[i] * reference[i];
+    }
+
+    float normRef = sqrtf(ref);
+    if (fabs(ref) < 1e-7) return 1;
+
+    float normError = sqrtf(error);
+    error = normError / normRef;
+
+    return error < epsilon ? 0 : 1;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	starpu_codelet cl;
+        starpu_init(NULL);
+
+#ifdef STARPU_USE_OPENCL
+        float *matrix, *vector, *mult;
+        float *correctResult;
+        unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
+
+	starpu_data_handle matrix_handle, vector_handle, mult_handle;
+
+        mem_size_matrix = width * height * sizeof(float);
+        matrix = (float*)malloc(mem_size_matrix);
+        mem_size_vector = width * sizeof(float);
+        vector = (float*)malloc(mem_size_vector);
+        mem_size_mult = height * sizeof(float);
+        mult = (float*)malloc(mem_size_mult);
+        correctResult = (float*)malloc(mem_size_mult);
+
+        assert(matrix);
+        assert(vector);
+        assert(mult);
+        assert(correctResult);
+
+        fillArray(matrix, width*height);
+        fillArray(vector, width);
+        fillArray(mult, height);
+        matVecMult(matrix, vector, width, height, correctResult);
+
+	starpu_register_matrix_data(&matrix_handle, 0, (uintptr_t)matrix, width, width, height, sizeof(float));
+	starpu_register_vector_data(&vector_handle, 0, (uintptr_t)vector, width, sizeof(float));
+	starpu_register_vector_data(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
+
+        _starpu_opencl_compile_source_to_opencl("examples/matvecmult/matvecmult_kernel.cl");
+
+	cl.where = STARPU_OPENCL;
+        cl.opencl_func = opencl_codelet;
+        cl.nbuffers = 3;
+        cl.model = NULL;
+
+        struct starpu_task *task = starpu_task_create();
+        task->cl = &cl;
+        task->callback_func = NULL;
+        task->buffers[0].handle = matrix_handle;
+        task->buffers[0].mode = STARPU_R;
+        task->buffers[1].handle = vector_handle;
+        task->buffers[1].mode = STARPU_R;
+        task->buffers[2].handle = mult_handle;
+        task->buffers[2].mode = STARPU_RW;
+
+        int ret = starpu_submit_task(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                fprintf(stderr, "No worker may execute this task\n");
+                exit(0);
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+        starpu_sync_data_with_mem(matrix_handle, STARPU_R);
+        starpu_sync_data_with_mem(vector_handle, STARPU_R);
+        starpu_sync_data_with_mem(mult_handle, STARPU_R);
+
+        int res = compareL2fe(correctResult, mult, height, 1e-6f);
+        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
+#if 0
+        printArray(matrix, width*height);
+        printArray(vector, width);
+        printArray(mult, height);
+#endif
+        starpu_release_data_from_mem(matrix_handle);
+        starpu_release_data_from_mem(vector_handle);
+        starpu_release_data_from_mem(mult_handle);
+
+        starpu_shutdown();
+#endif
+
+	return 0;
+}

+ 48 - 0
examples/matvecmult/matvecmult_kernel.cl

@@ -0,0 +1,48 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+/* Matrix-vector multiplication: W = M * V.
+ * Device code.
+ *
+ * This sample implements matrix-vector multiplication.
+ * It has been written for clarity of exposition to illustrate various OpenCL
+ * programming principles and optimizatoins, not with the goal of providing
+ * the most performant generic kernel for matrix-vector multiplication.
+ *
+ * CUBLAS provides high-performance matrix-vector multiplication on GPU.
+ */
+
+__kernel void matVecMult(
+                         __global float* M,
+                         __global float* V,
+                         int width, int height,
+                         __global float* W
+                         )
+{
+        // Row index
+        uint y = get_global_id(0);
+        if (y < height) {
+                // Row pointer
+                const __global float* row = M + y * width;
+
+                // Compute dot product
+                float dotProduct = 0;
+                for (int x = 0; x < width; ++x)
+                        dotProduct += row[x] * V[x];
+
+                // Write result to global memory
+                W[y] = dotProduct;
+        }
+}

+ 2 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -40,8 +40,8 @@ void parse_args(int argc, char **argv)
 		strcpy(filename_out, argv[2]);
 		strcpy(filename_out, argv[2]);
 	}
 	}
 	else {
 	else {
-		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_in_default);
-		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_out_default);
+		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_in_default);
+		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_out_default);
 	}
 	}
 }
 }
 
 

+ 76 - 2
examples/spmv/dw_spmv.c

@@ -27,6 +27,60 @@ extern void spmv_kernel_cuda(void *descr[], void *args);
 struct timeval start;
 struct timeval start;
 struct timeval end;
 struct timeval end;
 
 
+#ifdef STARPU_USE_OPENCL
+#include "starpu_opencl.h"
+void spmv_kernel_opencl(void *descr[], void *args)
+{
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err, n;
+
+	uint32_t nnz = STARPU_GET_CSR_NNZ(descr[0]);
+	uint32_t nrow = STARPU_GET_CSR_NROW(descr[0]);
+	float *nzval = (float *)STARPU_GET_CSR_NZVAL(descr[0]);
+	uint32_t *colind = STARPU_GET_CSR_COLIND(descr[0]);
+	uint32_t *rowptr = STARPU_GET_CSR_ROWPTR(descr[0]);
+	uint32_t firstentry = STARPU_GET_CSR_FIRSTENTRY(descr[0]);
+
+	float *vecin = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
+	uint32_t nx_in = STARPU_GET_VECTOR_NX(descr[1]);
+
+	float *vecout = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
+	uint32_t nx_out = STARPU_GET_VECTOR_NX(descr[2]);
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/spmv/spmv_opencl.cl", "spvm", devid);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+        n=0;
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nnz);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nrow);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &nzval);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &colind);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &rowptr);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &firstentry);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecin);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_in);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecout);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_out);
+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+                size_t global=1024;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+        starpu_opencl_release(kernel);
+}
+#endif
+
 unsigned nblocks = 2;
 unsigned nblocks = 2;
 uint32_t size = 4194304;
 uint32_t size = 4194304;
 
 
@@ -189,14 +243,28 @@ void call_spmv_codelet_filters(void)
 	starpu_partition_data(sparse_matrix, &csr_f);
 	starpu_partition_data(sparse_matrix, &csr_f);
 	starpu_partition_data(vector_out, &vector_f);
 	starpu_partition_data(vector_out, &vector_f);
 
 
+#ifdef STARPU_USE_OPENCL
+        {
+                int ret = _starpu_opencl_compile_source_to_opencl("examples/spmv/spmv_opencl.cl");
+                if (ret)
+		{
+			fprintf(stderr, "Failed to compile OpenCL codelet\n");
+			exit(ret);
+		}
+        }
+#endif
+
 	starpu_codelet cl;
 	starpu_codelet cl;
 	memset(&cl, 0, sizeof(starpu_codelet));
 	memset(&cl, 0, sizeof(starpu_codelet));
 
 
-	cl.where = STARPU_CPU|STARPU_CUDA;
+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
 	cl.cpu_func =  cpu_spmv;
 	cl.cpu_func =  cpu_spmv;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	cl.cuda_func = spmv_kernel_cuda;
 	cl.cuda_func = spmv_kernel_cuda;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+        cl.opencl_func = spmv_kernel_opencl;
+#endif
 	cl.nbuffers = 3;
 	cl.nbuffers = 3;
 	cl.model = NULL;
 	cl.model = NULL;
 
 
@@ -206,6 +274,7 @@ void call_spmv_codelet_filters(void)
 	for (part = 0; part < nblocks; part++)
 	for (part = 0; part < nblocks; part++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
+                int ret;
 
 
 		task->callback_func = NULL;
 		task->callback_func = NULL;
 
 
@@ -219,7 +288,12 @@ void call_spmv_codelet_filters(void)
 		task->buffers[2].handle = starpu_get_sub_data(vector_out, 1, part);
 		task->buffers[2].handle = starpu_get_sub_data(vector_out, 1, part);
 		task->buffers[2].mode = STARPU_W;
 		task->buffers[2].mode = STARPU_W;
 	
 	
-		starpu_submit_task(task);
+		ret = starpu_submit_task(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			fprintf(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
 	}
 	}
 
 
 	starpu_wait_all_tasks();
 	starpu_wait_all_tasks();

+ 0 - 1
examples/spmv/dw_spmv.h

@@ -24,7 +24,6 @@
 #include <sys/types.h>
 #include <sys/types.h>
 #include <pthread.h>
 #include <pthread.h>
 #include <signal.h>
 #include <signal.h>
-#include <cblas.h>
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 

+ 42 - 0
examples/spmv/spmv_opencl.cl

@@ -0,0 +1,42 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void spvm(unsigned nnz, unsigned nrow,
+                   __global float* nzval, __global unsigned* colind,
+                   __global unsigned* rowptr, unsigned firstentry,
+                   __global float *vecin, unsigned nx_in,
+                   __global float *vecout, unsigned nx_out)
+{
+	unsigned row;
+	for (row = 0; row < nrow; row++)
+	{
+		float tmp = 0.0f;
+		unsigned index;
+
+		unsigned firstindex = rowptr[row] - firstentry;
+		unsigned lastindex = rowptr[row+1] - firstentry;
+
+		for (index = firstindex; index < lastindex; index++)
+		{
+			unsigned col;
+
+			col = colind[index];
+			tmp += nzval[index]*vecin[col];
+		}
+
+		vecout[row] = tmp;
+	}
+}

+ 147 - 0
examples/stencil/stencil.c

@@ -0,0 +1,147 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "stencil.h"
+
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+
+static int verbose = 0;
+
+static
+void display_non_zero_values(TYPE *ptr, char *msg)
+{
+        if(verbose) {
+                int x, y, z;
+
+                for(z = 0; z < DIM; z++)
+                        for(y = 0; y < DIM; y++)
+                                for(x = 0; x < DIM; x++) {
+                                        TYPE r = ptr[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD];
+                                        if(r != 0.0)
+                                                printf("%s[%d, %d, %d] == %f\n", msg, z, y, x, r);
+                                }
+        }
+}
+
+int main(int argc, char **argv)
+{
+        TYPE *data;                         // original data set given to device
+        TYPE *results;                      // results returned from device
+        TYPE C0 = 0.25;
+        TYPE C1 = 0.75;
+        starpu_data_handle data_handle;
+        starpu_data_handle results_handle;
+        starpu_data_handle C0_handle;
+        starpu_data_handle C1_handle;
+
+	starpu_init(NULL);
+
+        // Filter args
+        argv++;
+        while (argc > 1) {
+                if(!strcmp(*argv, "--verbose")) {
+                        verbose = 1;
+                } else
+                        break;
+                argc--; argv++;
+        }
+
+        // Fill our data set with random float values
+        {
+                long i, x, y, z;
+
+                data = (TYPE *)malloc(SIZE * sizeof(TYPE));
+                results = (TYPE *)malloc(SIZE * sizeof(TYPE));
+                if (!data || !results) {
+                        fprintf(stderr, "Malloc failed!\n");
+                        return;
+                }
+
+                for(i = 0; i < SIZE; i++) {
+                        data[i] = 0.0;
+                        results[i] = 0.0;
+                }
+
+                z = ZBLOCK-1;
+                y = YBLOCK-1;
+                x = XBLOCK-1;
+
+                data[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD] = 2.0;
+        }
+
+        display_non_zero_values(data, "data");
+
+        starpu_register_vector_data(&data_handle, 0 /* home node */,
+                                    (uintptr_t)data, SIZE, sizeof(TYPE));
+        starpu_register_vector_data(&results_handle, 0 /* home node */,
+                                    (uintptr_t)results, SIZE, sizeof(TYPE));
+        starpu_register_vector_data(&C0_handle, 0 /* home node */,
+                                    (uintptr_t)&C0, 1, sizeof(TYPE));
+        starpu_register_vector_data(&C1_handle, 0 /* home node */,
+                                    (uintptr_t)&C1, 1, sizeof(TYPE));
+
+        _starpu_opencl_compile_source_to_opencl("examples/stencil/stencil_opencl_codelet.cl");
+
+	starpu_codelet cl =
+	{
+		.where = STARPU_OPENCL,
+		.opencl_func = opencl_codelet,
+		.nbuffers = 4
+	};
+
+	{
+		struct starpu_task *task = starpu_task_create();
+
+                task->cl = &cl;
+                task->callback_func = NULL;
+
+		task->buffers[0].handle = data_handle;
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = results_handle;
+		task->buffers[1].mode = STARPU_W;
+		task->buffers[2].handle = C0_handle;
+		task->buffers[2].mode = STARPU_R;
+		task->buffers[3].handle = C1_handle;
+		task->buffers[3].mode = STARPU_R;
+
+		int ret = starpu_submit_task(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			fprintf(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+	starpu_sync_data_with_mem(data_handle, STARPU_R);
+	starpu_sync_data_with_mem(results_handle, STARPU_R);
+	starpu_sync_data_with_mem(C0_handle, STARPU_R);
+	starpu_sync_data_with_mem(C1_handle, STARPU_R);
+
+	display_non_zero_values(results, "results");
+
+	starpu_release_data_from_mem(data_handle);
+	starpu_release_data_from_mem(results_handle);
+	starpu_release_data_from_mem(C0_handle);
+	starpu_release_data_from_mem(C1_handle);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 43 - 0
examples/stencil/stencil.h

@@ -0,0 +1,43 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STENCIL_H__
+#define __STENCIL_H__
+
+#define TYPE    float
+
+#define DIM     (128)
+#define BORDER  (1)
+
+#define PADDING (64 / sizeof(TYPE) - 2*BORDER)
+#define FIRST_PAD (PADDING+1)
+
+#define REALDIM (DIM + 2 * BORDER + PADDING)
+
+#define SURFACE   ((DIM + 2 * BORDER) * REALDIM)
+
+#define SIZE    ((DIM + 2 * BORDER) * SURFACE + 1)
+
+#define XBLOCK  (16)
+#define YBLOCK  (16)
+#define ZBLOCK  (64)
+
+#define X_PER_THREAD (1)
+#define Y_PER_THREAD (4)
+#define Z_PER_THREAD (ZBLOCK)
+
+#endif
+

+ 66 - 0
examples/stencil/stencil_opencl.c

@@ -0,0 +1,66 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <starpu_util.h>
+#include "stencil.h"
+
+void opencl_codelet(void *descr[], void *_args)
+{
+	float *data = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+	float *results = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
+	float *C0 = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
+	float *C1 = (float *)STARPU_GET_VECTOR_PTR(descr[3]);
+
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue, "examples/stencil/stencil_opencl_codelet.cl", "stencil", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
+	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &results);
+	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &C0);
+	err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &C1);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global[3];
+		size_t local[3];
+
+                // Execute the kernel over the entire range of our 3d input data set
+                local[0] = XBLOCK / X_PER_THREAD;   // threads along the X axis
+                local[1] = YBLOCK / Y_PER_THREAD;   // threads along the Y axis
+                local[2] = ZBLOCK / Z_PER_THREAD;   // threads along the Z axis
+
+                global[0] = DIM / X_PER_THREAD;  // virtual size of global X axis
+                global[1] = DIM / Y_PER_THREAD;  // virtual size of global Y axis
+                global[2] = DIM / Z_PER_THREAD;  // virtual size of global Z axis
+
+                err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 100 - 0
examples/variable/variable.c

@@ -0,0 +1,100 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <pthread.h>
+
+static unsigned niter = 50000;
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+extern void cuda_codelet_host(float *tab);
+
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+
+	*val += 1.0f;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned i;
+        float foo;
+	starpu_data_handle float_array_handle;
+	starpu_codelet cl;
+
+	starpu_init(NULL);
+        if (argc == 2) niter = atoi(argv[1]);
+        foo = 0.0f;
+
+	starpu_register_variable_data(&float_array_handle, 0 /* home node */,
+                                      (uintptr_t)&foo, sizeof(float));
+
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("examples/variable/variable_kernels_opencl_codelet.cl");
+#endif
+
+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
+        cl.cpu_func = cpu_codelet;
+#ifdef STARPU_USE_CUDA
+        cl.cuda_func = cuda_codelet;
+#endif
+#ifdef STARPU_USE_OPENCL
+        cl.opencl_func = opencl_codelet;
+#endif
+        cl.nbuffers = 1;
+        cl.model = NULL;
+
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+                int ret;
+
+		task->cl = &cl;
+
+		task->callback_func = NULL;
+
+		task->buffers[0].handle = float_array_handle;
+		task->buffers[0].mode = STARPU_RW;
+
+		ret = starpu_submit_task(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			fprintf(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
+
+	fprintf(stderr, "variable -> %f\n", foo);
+
+	starpu_release_data_from_mem(float_array_handle);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 30 - 0
examples/variable/variable_kernels.cu

@@ -0,0 +1,30 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+static __global__ void cuda_variable(float * tab)
+{
+	*tab += 2;
+	return;
+}
+
+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+
+	cuda_variable<<<1,1>>>(val);
+}

+ 48 - 0
examples/variable/variable_kernels_opencl.c

@@ -0,0 +1,48 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+void opencl_codelet(void *descr[], void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/variable/variable_kernels_opencl_codelet.cl", "variable", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local=1;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 23 - 0
examples/variable/variable_kernels_opencl_codelet.cl

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void variable(__global float* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 0)
+		input[i] = input[i] + 4.0;
+}
+

+ 14 - 2
include/starpu.h

@@ -46,14 +46,19 @@ struct starpu_conf {
 	int ncpus;
 	int ncpus;
 	/* maximum number of CUDA GPUs (-1 for default) */
 	/* maximum number of CUDA GPUs (-1 for default) */
 	int ncuda;
 	int ncuda;
+	/* maximum number of OpenCL GPUs (-1 for default) */
+	int nopencl;
 	/* maximum number of Cell's SPUs (-1 for default) */
 	/* maximum number of Cell's SPUs (-1 for default) */
 	int nspus;
 	int nspus;
 
 
 	unsigned use_explicit_workers_bindid;
 	unsigned use_explicit_workers_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 
 
-	unsigned use_explicit_workers_gpuid;
-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
+	unsigned use_explicit_workers_cuda_gpuid;
+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
+
+	unsigned use_explicit_workers_opencl_gpuid;
+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 
 
 	/* calibrate performance models, if any */
 	/* calibrate performance models, if any */
 	unsigned calibrate;
 	unsigned calibrate;
@@ -75,6 +80,7 @@ unsigned starpu_get_worker_count(void);
 unsigned starpu_get_cpu_worker_count(void);
 unsigned starpu_get_cpu_worker_count(void);
 unsigned starpu_get_cuda_worker_count(void);
 unsigned starpu_get_cuda_worker_count(void);
 unsigned starpu_get_spu_worker_count(void);
 unsigned starpu_get_spu_worker_count(void);
+unsigned starpu_get_opencl_worker_count(void);
 
 
 /* Return the identifier of the thread in case this is associated to a worker.
 /* Return the identifier of the thread in case this is associated to a worker.
  * This will return -1 if this function is called directly from the application
  * This will return -1 if this function is called directly from the application
@@ -84,6 +90,7 @@ int starpu_get_worker_id(void);
 enum starpu_archtype {
 enum starpu_archtype {
 	STARPU_CPU_WORKER, /* CPU core */
 	STARPU_CPU_WORKER, /* CPU core */
 	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
 	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
+	STARPU_OPENCL_WORKER, /* OpenCL CUDA device */
 	STARPU_GORDON_WORKER /* Cell SPU */
 	STARPU_GORDON_WORKER /* Cell SPU */
 };
 };
 
 
@@ -103,6 +110,11 @@ enum starpu_archtype starpu_get_worker_type(int id);
  * behaviour. */
  * behaviour. */
 void starpu_get_worker_name(int id, char *dst, size_t maxlen);
 void starpu_get_worker_name(int id, char *dst, size_t maxlen);
 
 
+/* This functions returns the device id of the worker associated to an
+ *  identifier (as returned by the starpu_get_worker_id() function)
+ */
+int starpu_get_worker_devid(int id);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 4 - 2
include/starpu_config.h.in

@@ -3,16 +3,17 @@
 
 
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CUDA
 #undef STARPU_USE_CUDA
+#undef STARPU_USE_OPENCL
 #undef STARPU_USE_GORDON
 #undef STARPU_USE_GORDON
 
 
 #undef STARPU_ATLAS
 #undef STARPU_ATLAS
 #undef STARPU_GOTO
 #undef STARPU_GOTO
 #undef STARPU_SYSTEM_BLAS
 #undef STARPU_SYSTEM_BLAS
 
 
+#undef STARPU_BUILD_DIR
+#undef STARPU_OPENCL_DATADIR
 #undef STARPU_HAVE_MAGMA
 #undef STARPU_HAVE_MAGMA
 
 
-#undef STARPU_DIR
-
 #undef STARPU_OPENGL_RENDER
 #undef STARPU_OPENGL_RENDER
 
 
 #undef STARPU_USE_GTK
 #undef STARPU_USE_GTK
@@ -37,6 +38,7 @@
 
 
 #undef STARPU_NMAXBUFS
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXCUDADEVS
+#undef STARPU_MAXOPENCLDEVS
 
 
 #undef STARPU_HAVE_LIBNUMA
 #undef STARPU_HAVE_LIBNUMA
 
 

+ 6 - 0
include/starpu_data_interfaces.h

@@ -36,6 +36,8 @@ void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memo
 /* Matrix interface for dense matrices */
 /* Matrix interface for dense matrices */
 typedef struct starpu_matrix_interface_s {
 typedef struct starpu_matrix_interface_s {
 	uintptr_t ptr;
 	uintptr_t ptr;
+        uintptr_t dev_handle;
+        size_t offset;
 	uint32_t nx;
 	uint32_t nx;
 	uint32_t ny;
 	uint32_t ny;
 	uint32_t ld;
 	uint32_t ld;
@@ -62,6 +64,8 @@ size_t starpu_get_matrix_elemsize(starpu_data_handle handle);
 /* BLOCK interface for 3D dense blocks */
 /* BLOCK interface for 3D dense blocks */
 typedef struct starpu_block_interface_s {
 typedef struct starpu_block_interface_s {
 	uintptr_t ptr;
 	uintptr_t ptr;
+        uintptr_t dev_handle;
+        size_t offset;
 	uint32_t nx;
 	uint32_t nx;
 	uint32_t ny;
 	uint32_t ny;
 	uint32_t nz;
 	uint32_t nz;
@@ -93,6 +97,8 @@ size_t starpu_get_block_elemsize(starpu_data_handle handle);
 /* vector interface for contiguous (non-strided) buffers */
 /* vector interface for contiguous (non-strided) buffers */
 typedef struct starpu_vector_interface_s {
 typedef struct starpu_vector_interface_s {
 	uintptr_t ptr;
 	uintptr_t ptr;
+        uintptr_t dev_handle;
+        size_t offset;
 	uint32_t nx;
 	uint32_t nx;
 	size_t elemsize;
 	size_t elemsize;
 } starpu_vector_interface_t;
 } starpu_vector_interface_t;

+ 192 - 0
include/starpu_opencl.h

@@ -0,0 +1,192 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_OPENCL_H__
+#define __STARPU_OPENCL_H__
+
+#ifdef STARPU_USE_OPENCL
+#include <CL/cl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define STARPU_OPENCL_REPORT_ERROR(status)                                     \
+	do {                                                            \
+		char *errormsg;                                         \
+		switch (status) {                                       \
+		case CL_SUCCESS:                                        \
+			errormsg = "success";                           \
+			break;                                          \
+		case CL_DEVICE_NOT_FOUND:				\
+			errormsg = "Device not found";                  \
+			break;                                          \
+		case CL_DEVICE_NOT_AVAILABLE:				\
+			errormsg = "Device not available";              \
+			break;                                          \
+		case CL_COMPILER_NOT_AVAILABLE:				\
+			errormsg = "Compiler not available";            \
+			break;                                          \
+		case CL_MEM_OBJECT_ALLOCATION_FAILURE:			\
+			errormsg = "Memory object allocation failure";  \
+			break;                                          \
+		case CL_OUT_OF_RESOURCES:				\
+			errormsg = "Out of resources";                  \
+			break;                                          \
+		case CL_OUT_OF_HOST_MEMORY:				\
+			errormsg = "Out of host memory";                \
+			break;                                          \
+		case CL_PROFILING_INFO_NOT_AVAILABLE:			\
+			errormsg = "Profiling info not available";      \
+			break;                                          \
+		case CL_MEM_COPY_OVERLAP:				\
+			errormsg = "Memory copy overlap";               \
+			break;                                          \
+		case CL_IMAGE_FORMAT_MISMATCH:				\
+			errormsg = "Image format mismatch";             \
+			break;                                          \
+		case CL_IMAGE_FORMAT_NOT_SUPPORTED:			\
+			errormsg = "Image format not supported";        \
+			break;                                          \
+		case CL_BUILD_PROGRAM_FAILURE:				\
+			errormsg = "Build program failure";             \
+			break;                                          \
+		case CL_MAP_FAILURE:				        \
+			errormsg = "Map failure";                       \
+			break;                                          \
+		case CL_INVALID_VALUE:				        \
+			errormsg = "Invalid value";                     \
+			break;                                          \
+		case CL_INVALID_DEVICE_TYPE:				\
+			errormsg = "Invalid device type";               \
+			break;                                          \
+		case CL_INVALID_PLATFORM:				\
+			errormsg = "Invalid platform";                  \
+			break;                                          \
+		case CL_INVALID_DEVICE:				        \
+			errormsg = "Invalid device";                    \
+			break;                                          \
+		case CL_INVALID_CONTEXT:				\
+			errormsg = "Invalid context";                   \
+			break;                                          \
+		case CL_INVALID_QUEUE_PROPERTIES:			\
+			errormsg = "Invalid queue properties";          \
+			break;                                          \
+		case CL_INVALID_COMMAND_QUEUE:				\
+			errormsg = "Invalid command queue";             \
+			break;                                          \
+		case CL_INVALID_HOST_PTR:				\
+			errormsg = "Invalid host pointer";              \
+			break;                                          \
+		case CL_INVALID_MEM_OBJECT:				\
+			errormsg = "Invalid memory object";             \
+			break;                                          \
+		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:		\
+			errormsg = "Invalid image format descriptor";   \
+			break;                                          \
+		case CL_INVALID_IMAGE_SIZE:				\
+			errormsg = "Invalid image size";                \
+			break;                                          \
+		case CL_INVALID_SAMPLER:				\
+			errormsg = "Invalid sampler";                   \
+			break;                                          \
+		case CL_INVALID_BINARY:				        \
+			errormsg = "Invalid binary";                    \
+			break;                                          \
+		case CL_INVALID_BUILD_OPTIONS:				\
+			errormsg = "Invalid build options";             \
+			break;                                          \
+		case CL_INVALID_PROGRAM:				\
+			errormsg = "Invalid program";                   \
+			break;                                          \
+		case CL_INVALID_PROGRAM_EXECUTABLE:			\
+			errormsg = "Invalid program executable";        \
+			break;                                          \
+		case CL_INVALID_KERNEL_NAME:				\
+			errormsg = "Invalid kernel name";               \
+			break;                                          \
+		case CL_INVALID_KERNEL_DEFINITION:			\
+			errormsg = "Invalid kernel definition";         \
+			break;                                          \
+		case CL_INVALID_KERNEL:				        \
+			errormsg = "Invalid kernel";                    \
+			break;                                          \
+		case CL_INVALID_ARG_INDEX:				\
+			errormsg = "Invalid argument index";            \
+			break;                                          \
+		case CL_INVALID_ARG_VALUE:				\
+			errormsg = "Invalid argument value";            \
+			break;                                          \
+		case CL_INVALID_ARG_SIZE:				\
+			errormsg = "Invalid argument size";             \
+			break;                                          \
+		case CL_INVALID_KERNEL_ARGS:				\
+			errormsg = "Invalid kernel arguments";          \
+			break;                                          \
+		case CL_INVALID_WORK_DIMENSION:				\
+			errormsg = "Invalid work dimension";            \
+			break;                                          \
+		case CL_INVALID_WORK_GROUP_SIZE:			\
+			errormsg = "Invalid work group size";           \
+			break;                                          \
+		case CL_INVALID_WORK_ITEM_SIZE:				\
+			errormsg = "Invalid work item size";            \
+			break;                                          \
+		case CL_INVALID_GLOBAL_OFFSET:				\
+			errormsg = "Invalid global offset";             \
+			break;                                          \
+		case CL_INVALID_EVENT_WAIT_LIST:			\
+			errormsg = "Invalid event wait list";           \
+			break;                                          \
+		case CL_INVALID_EVENT:				        \
+			errormsg = "Invalid event";                     \
+			break;                                          \
+		case CL_INVALID_OPERATION:				\
+			errormsg = "Invalid operation";                 \
+			break;                                          \
+		case CL_INVALID_GL_OBJECT:				\
+			errormsg = "Invalid GL object";                 \
+			break;                                          \
+		case CL_INVALID_BUFFER_SIZE:				\
+			errormsg = "Invalid buffer size";               \
+			break;                                          \
+		case CL_INVALID_MIP_LEVEL:				\
+			errormsg = "Invalid MIP level";                 \
+			break;                                          \
+		case CL_INVALID_GLOBAL_WORK_SIZE:			\
+			errormsg = "Invalid global work size";          \
+			break;                                          \
+		default:						\
+			errormsg = "unknown error";			\
+			break;			                        \
+		}                                                       \
+		printf("oops in %s ... <%s> \n", __func__, errormsg);	\
+		assert(0);	                                        \
+	} while (0)
+
+void starpu_opencl_get_context(int devid, cl_context *context);
+void starpu_opencl_get_device(int devid, cl_device_id *device);
+void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid);
+int starpu_opencl_release(cl_kernel kernel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STARPU_USE_OPENCL
+#endif // __STARPU_OPENCL_H__
+

+ 4 - 2
include/starpu_perfmodel.h

@@ -21,6 +21,7 @@
 #include <pthread.h>
 #include <pthread.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_config.h>
 #include <starpu_config.h>
+#include <starpu_task.h>
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C" {
 extern "C" {
@@ -39,8 +40,9 @@ struct starpu_buffer_descr_t;
 enum starpu_perf_archtype {
 enum starpu_perf_archtype {
 	STARPU_CPU_DEFAULT = 0,
 	STARPU_CPU_DEFAULT = 0,
 	STARPU_CUDA_DEFAULT = 1,
 	STARPU_CUDA_DEFAULT = 1,
-	/* STARPU_CUDA_DEFAULT + devid */
-	STARPU_GORDON_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS
+	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
+	/* STARPU_OPENCL_DEFAULT + devid */
+	STARPU_GORDON_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS
 };
 };
 
 
 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)
 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)

+ 2 - 0
include/starpu_task.h

@@ -31,6 +31,7 @@
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_SPU	((1ULL)<<4)
 #define STARPU_SPU	((1ULL)<<4)
 #define STARPU_GORDON	((1ULL)<<5)
 #define STARPU_GORDON	((1ULL)<<5)
+#define STARPU_OPENCL	((1ULL)<<6)
 
 
 #define STARPU_MIN_PRIO        (-4)
 #define STARPU_MIN_PRIO        (-4)
 #define STARPU_MAX_PRIO        5
 #define STARPU_MAX_PRIO        5
@@ -53,6 +54,7 @@ typedef struct starpu_codelet_t {
 	/* the different implementations of the codelet */
 	/* the different implementations of the codelet */
 	void (*cuda_func)(void **, void *);
 	void (*cuda_func)(void **, void *);
 	void (*cpu_func)(void **, void *);
 	void (*cpu_func)(void **, void *);
+	void (*opencl_func)(void **, void *);
 	uint8_t gordon_func;
 	uint8_t gordon_func;
 
 
 	/* how many buffers do the codelet takes as argument ? */
 	/* how many buffers do the codelet takes as argument ? */

+ 5 - 0
src/Makefile.am

@@ -157,3 +157,8 @@ endif
 if STARPU_USE_GORDON
 if STARPU_USE_GORDON
 libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
 libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
 endif
 endif
+
+if STARPU_USE_OPENCL
+libstarpu_la_SOURCES += drivers/opencl/driver_opencl.c
+libstarpu_la_SOURCES += drivers/opencl/driver_opencl_utils.c
+endif

+ 1 - 0
src/core/jobs.h

@@ -49,6 +49,7 @@ typedef void (*callback)(void *);
 #define STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
 #define STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
 #define STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
 #define STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
+#define STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
 
 
 /* a job is the internal representation of a task */
 /* a job is the internal representation of a task */
 LIST_TYPE(starpu_job,
 LIST_TYPE(starpu_job,

+ 3 - 0
src/core/perfmodel/perfmodel.c

@@ -82,6 +82,9 @@ static double common_task_expected_length(struct starpu_perfmodel_t *model, uint
 			case STARPU_CUDA:
 			case STARPU_CUDA:
 				alpha = STARPU_CUDA_ALPHA;
 				alpha = STARPU_CUDA_ALPHA;
 				break;
 				break;
+  		        case STARPU_OPENCL:
+	                        alpha = STARPU_OPENCL_ALPHA;
+                                break;
 			default:
 			default:
 				/* perhaps there are various worker types on that queue */
 				/* perhaps there are various worker types on that queue */
 				alpha = 1.0; // this value is not significant ...
 				alpha = 1.0; // this value is not significant ...

+ 6 - 3
src/core/perfmodel/perfmodel.h

@@ -106,8 +106,11 @@ double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_
 void _starpu_set_calibrate_flag(unsigned val);
 void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);
 unsigned _starpu_get_calibrate_flag(void);
 
 
-#ifdef STARPU_USE_CUDA
-int *_starpu_get_gpu_affinity_vector(unsigned gpuid);
+#if defined(STARPU_USE_CUDA)
+int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 #endif
 #endif
- 
+#if defined(STARPU_USE_OPENCL)
+int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
+#endif
+
 #endif // __PERFMODEL_H__
 #endif // __PERFMODEL_H__

+ 335 - 66
src/core/perfmodel/perfmodel_bus.c

@@ -23,8 +23,10 @@
 #include <unistd.h>
 #include <unistd.h>
 #include <sys/time.h>
 #include <sys/time.h>
 #include <stdlib.h>
 #include <stdlib.h>
+#include <math.h>
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <core/workers.h>
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/perfmodel/perfmodel.h>
@@ -34,7 +36,7 @@
 
 
 #define MAXCPUS	32
 #define MAXCPUS	32
 
 
-struct cudadev_timing {
+struct dev_timing {
 	int cpu_id;
 	int cpu_id;
 	double timing_htod;
 	double timing_htod;
 	double timing_dtoh;
 	double timing_dtoh;
@@ -43,19 +45,29 @@ struct cudadev_timing {
 static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
 static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
 static unsigned was_benchmarked = 0;
 static unsigned was_benchmarked = 0;
+static unsigned ncpus = 0;
 static int ncuda = 0;
 static int ncuda = 0;
-
-static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
+static int nopencl = 0;
 
 
 /* Benchmarking the performance of the bus */
 /* Benchmarking the performance of the bus */
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+#endif
+#ifdef STARPU_USE_OPENCL
+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
+static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
+static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+#endif
 
 
-static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 
 
-static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
+#ifdef STARPU_USE_CUDA
+static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 {
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -84,7 +96,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
 
 	/* Allocate a buffer on the host */
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
 	unsigned char *h_buffer;
-	cudaHostAlloc((void **)&h_buffer, SIZE, 0); 
+	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
 	assert(h_buffer);
 	assert(h_buffer);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
@@ -104,7 +116,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	struct timeval start;
 	struct timeval start;
 	struct timeval end;
 	struct timeval end;
 
 
-	cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
 
 
 	/* Measure upload bandwidth */
 	/* Measure upload bandwidth */
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
@@ -116,7 +128,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
 
 
 	/* Measure download bandwidth */
 	/* Measure download bandwidth */
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
@@ -128,7 +140,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
 
 
 	/* Free buffers */
 	/* Free buffers */
 	cudaFreeHost(h_buffer);
 	cudaFreeHost(h_buffer);
@@ -137,18 +149,101 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	cudaThreadExit();
 	cudaThreadExit();
 
 
 }
 }
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
+{
+        cl_context context;
+        cl_command_queue queue;
+
+        struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+	/* Initialize OpenCL context on the device */
+        _starpu_opencl_init_context(dev);
+        starpu_opencl_get_context(dev, &context);
+        starpu_opencl_get_queue(dev, &queue);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+	/* Allocate a buffer on the device */
+        int err;
+	cl_mem d_buffer;
+	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, SIZE, NULL, &err);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+        /* Allocate a buffer on the host */
+	unsigned char *h_buffer;
+        h_buffer = malloc(SIZE);
+	assert(h_buffer);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+        /* Fill them */
+	memset(h_buffer, 0, SIZE);
+        err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+        unsigned iter;
+	double timing;
+	struct timeval start;
+	struct timeval end;
+
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
+
+	/* Measure upload bandwidth */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+                err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
+
+	/* Measure download bandwidth */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+                err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
+
+	/* Free buffers */
+	clReleaseMemObject(d_buffer);
+	free(h_buffer);
+
+	/* Uninitiliaze OpenCL context on the device */
+        _starpu_opencl_deinit_context(dev);
+}
+#endif
 
 
 /* NB: we want to sort the bandwidth by DECREASING order */
 /* NB: we want to sort the bandwidth by DECREASING order */
-static int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
+static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
 {
 {
-	const struct cudadev_timing *left = left_cudadev_timing;
-	const struct cudadev_timing *right = right_cudadev_timing;
-	
+	const struct dev_timing *left = left_dev_timing;
+	const struct dev_timing *right = right_dev_timing;
+
 	double left_dtoh = left->timing_dtoh;
 	double left_dtoh = left->timing_dtoh;
 	double left_htod = left->timing_htod;
 	double left_htod = left->timing_htod;
 	double right_dtoh = right->timing_dtoh;
 	double right_dtoh = right->timing_dtoh;
 	double right_htod = right->timing_htod;
 	double right_htod = right->timing_htod;
-	
+
 	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
 	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
 	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
 	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
 
 
@@ -156,47 +251,55 @@ static int compar_cudadev_timing(const void *left_cudadev_timing, const void *ri
 	return (bandwidth_sum2_left < bandwidth_sum2_right);
 	return (bandwidth_sum2_left < bandwidth_sum2_right);
 }
 }
 
 
-static void measure_bandwidth_between_host_and_dev(int dev, unsigned ncpus)
+static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
+                                                   struct dev_timing *dev_timing_per_cpu, char type)
 {
 {
 	unsigned cpu;
 	unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
 	{
-		measure_bandwidth_between_host_and_dev_on_cpu(dev, cpu);
-	}
+#ifdef STARPU_USE_CUDA
+                if (type == 'C')
+                        measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
+#endif
+#ifdef STARPU_USE_OPENCL
+                if (type == 'O')
+                        measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
+#endif
+        }
 
 
 	/* sort the results */
 	/* sort the results */
-	qsort(cudadev_timing_per_cpu[dev+1], ncpus,
-			sizeof(struct cudadev_timing),
-			compar_cudadev_timing);
-	
+	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
+              sizeof(struct dev_timing),
+			compar_dev_timing);
+
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 	for (cpu = 0; cpu < ncpus; cpu++)
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
 	{
-		unsigned current_cpu = cudadev_timing_per_cpu[dev+1][cpu].cpu_id;
-		double bandwidth_dtoh = cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh;
-		double bandwidth_htod = cudadev_timing_per_cpu[dev+1][cpu].timing_htod;
+		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
+		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
+		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
 
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
 
 		fprintf(stderr, "BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 		fprintf(stderr, "BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 	}
 
 
-	unsigned best_cpu = cudadev_timing_per_cpu[dev+1][0].cpu_id;
+	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
 
 
 	fprintf(stderr, "BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
 	fprintf(stderr, "BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
 #endif
 #endif
 
 
 	/* The results are sorted in a decreasing order, so that the best
 	/* The results are sorted in a decreasing order, so that the best
 	 * measurement is currently the first entry. */
 	 * measurement is currently the first entry. */
-	cudadev_timing_dtoh[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_dtoh;
-	cudadev_timing_htod[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_htod;
+	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
+	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
 }
 }
-#endif
+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
 
-static void benchmark_all_cuda_devices(void)
+static void benchmark_all_gpu_devices(void)
 {
 {
-#ifdef STARPU_USE_CUDA
-	int ret;
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+	int i, ret;
 
 
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 	fprintf(stderr, "Benchmarking the speed of the bus\n");
 	fprintf(stderr, "Benchmarking the speed of the bus\n");
@@ -213,15 +316,24 @@ static void benchmark_all_cuda_devices(void)
 	}
 	}
 
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = _starpu_topology_get_nhwcpu(config);
 
 
+#ifdef STARPU_USE_CUDA
         cudaGetDeviceCount(&ncuda);
         cudaGetDeviceCount(&ncuda);
-	int i;
 	for (i = 0; i < ncuda; i++)
 	for (i = 0; i < ncuda; i++)
 	{
 	{
 		/* measure bandwidth between Host and Device i */
 		/* measure bandwidth between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, ncpus);
+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+        nopencl = _starpu_opencl_get_device_count();
+	for (i = 0; i < nopencl; i++)
+	{
+		/* measure bandwith between Host and Device i */
+		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
 	}
 	}
+#endif
 
 
 	/* FIXME: use hwloc */
 	/* FIXME: use hwloc */
 	/* Restore the former affinity */
 	/* Restore the former affinity */
@@ -235,7 +347,7 @@ static void benchmark_all_cuda_devices(void)
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
 	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
 #endif
 #endif
-#endif
+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
 
 	was_benchmarked = 1;
 	was_benchmarked = 1;
 }
 }
@@ -244,7 +356,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 {
 {
 	_starpu_get_perf_model_dir_bus(path, maxlen);
 	_starpu_get_perf_model_dir_bus(path, maxlen);
 	strncat(path, type, maxlen);
 	strncat(path, type, maxlen);
-	
+
 	char hostname[32];
 	char hostname[32];
 	gethostname(hostname, 32);
 	gethostname(hostname, 32);
 	strncat(path, ".", maxlen);
 	strncat(path, ".", maxlen);
@@ -270,13 +382,13 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 	STARPU_ASSERT(f);
 
 
-#ifdef STARPU_USE_CUDA
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = _starpu_topology_get_nhwcpu(config);
+        int gpu;
 
 
+#ifdef STARPU_USE_CUDA
         cudaGetDeviceCount(&ncuda);
         cudaGetDeviceCount(&ncuda);
-
-	int gpu;
 	for (gpu = 0; gpu < ncuda; gpu++)
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 	{
 		int ret;
 		int ret;
@@ -292,7 +404,7 @@ static void load_bus_affinity_file_content(void)
 		unsigned cpu;
 		unsigned cpu;
 		for (cpu = 0; cpu < ncpus; cpu++)
 		for (cpu = 0; cpu < ncpus; cpu++)
 		{
 		{
-			ret = fscanf(f, "%d\t", &affinity_matrix[gpu][cpu]);
+			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
 			STARPU_ASSERT(ret == 1);
 			STARPU_ASSERT(ret == 1);
 		}
 		}
 
 
@@ -300,6 +412,32 @@ static void load_bus_affinity_file_content(void)
 		STARPU_ASSERT(ret == 0);
 		STARPU_ASSERT(ret == 0);
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+        nopencl = _starpu_opencl_get_device_count();
+	for (gpu = 0; gpu < nopencl; gpu++)
+	{
+		int ret;
+
+		int dummy;
+
+		starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &dummy);
+		STARPU_ASSERT(ret == 1);
+
+		STARPU_ASSERT(dummy == gpu);
+
+		unsigned cpu;
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
+			STARPU_ASSERT(ret == 1);
+		}
+
+		ret = fscanf(f, "\n");
+		STARPU_ASSERT(ret == 0);
+	}
+#endif
+#endif
 
 
 	fclose(f);
 	fclose(f);
 }
 }
@@ -320,24 +458,36 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
 
 
-#ifdef STARPU_USE_CUDA
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	unsigned cpu;
 	unsigned cpu;
+        int gpu;
 
 
-	fprintf(f, "# GPU\t");
+        fprintf(f, "# GPU\t");
 	for (cpu = 0; cpu < ncpus; cpu++)
 	for (cpu = 0; cpu < ncpus; cpu++)
 		fprintf(f, "CPU%d\t", cpu);
 		fprintf(f, "CPU%d\t", cpu);
 	fprintf(f, "\n");
 	fprintf(f, "\n");
 
 
-	int gpu;
+#ifdef STARPU_USE_CUDA
 	for (gpu = 0; gpu < ncuda; gpu++)
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 	{
 		fprintf(f, "%d\t", gpu);
 		fprintf(f, "%d\t", gpu);
 
 
 		for (cpu = 0; cpu < ncpus; cpu++)
 		for (cpu = 0; cpu < ncpus; cpu++)
 		{
 		{
-			fprintf(f, "%d\t", cudadev_timing_per_cpu[gpu+1][cpu].cpu_id);
+			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
+		}
+
+		fprintf(f, "\n");
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	for (gpu = 0; gpu < nopencl; gpu++)
+	{
+		fprintf(f, "%d\t", gpu);
+
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
 		}
 		}
 
 
 		fprintf(f, "\n");
 		fprintf(f, "\n");
@@ -345,12 +495,13 @@ static void write_bus_affinity_file_content(void)
 #endif
 #endif
 
 
 	fclose(f);
 	fclose(f);
+#endif
 }
 }
 
 
 static void generate_bus_affinity_file(void)
 static void generate_bus_affinity_file(void)
 {
 {
 	if (!was_benchmarked)
 	if (!was_benchmarked)
-		benchmark_all_cuda_devices();
+		benchmark_all_gpu_devices();
 
 
 	write_bus_affinity_file_content();
 	write_bus_affinity_file_content();
 }
 }
@@ -372,10 +523,19 @@ static void load_bus_affinity_file(void)
 	load_bus_affinity_file_content();
 	load_bus_affinity_file_content();
 }
 }
 
 
-int *_starpu_get_gpu_affinity_vector(unsigned gpuid)
+#ifdef STARPU_USE_CUDA
+int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
+{
+        return cuda_affinity_matrix[gpuid];
+}
+#endif /* STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
 {
 {
-	return affinity_matrix[gpuid];
+        return opencl_affinity_matrix[gpuid];
 }
 }
+#endif /* STARPU_USE_OPENCL */
 
 
 /*
 /*
  *	Latency
  *	Latency
@@ -420,7 +580,7 @@ static void load_bus_latency_file_content(void)
 
 
 static void write_bus_latency_file_content(void)
 static void write_bus_latency_file_content(void)
 {
 {
-	int src, dst;
+        int src, dst, maxnode;
 	FILE *f;
 	FILE *f;
 
 
 	STARPU_ASSERT(was_benchmarked);
 	STARPU_ASSERT(was_benchmarked);
@@ -440,13 +600,17 @@ static void write_bus_latency_file_content(void)
 		fprintf(f, "to %d\t\t", dst);
 		fprintf(f, "to %d\t\t", dst);
 	fprintf(f, "\n");
 	fprintf(f, "\n");
 
 
-	for (src = 0; src < STARPU_MAXNODES; src++)
+        maxnode = ncuda;
+#ifdef STARPU_USE_OPENCL
+        maxnode += nopencl;
+#endif
+        for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		{
 		{
 			double latency;
 			double latency;
 
 
-			if ((src > ncuda) || (dst > ncuda))
+			if ((src > maxnode) || (dst > maxnode))
 			{
 			{
 				/* convention */
 				/* convention */
 				latency = -1.0;
 				latency = -1.0;
@@ -471,7 +635,7 @@ static void write_bus_latency_file_content(void)
 static void generate_bus_latency_file(void)
 static void generate_bus_latency_file(void)
 {
 {
 	if (!was_benchmarked)
 	if (!was_benchmarked)
-		benchmark_all_cuda_devices();
+		benchmark_all_gpu_devices();
 
 
 	write_bus_latency_file_content();
 	write_bus_latency_file_content();
 }
 }
@@ -494,7 +658,7 @@ static void load_bus_latency_file(void)
 }
 }
 
 
 
 
-/* 
+/*
  *	Bandwidth
  *	Bandwidth
  */
  */
 static void get_bandwidth_path(char *path, size_t maxlen)
 static void get_bandwidth_path(char *path, size_t maxlen)
@@ -540,7 +704,7 @@ static void load_bus_bandwidth_file_content(void)
 
 
 static void write_bus_bandwidth_file_content(void)
 static void write_bus_bandwidth_file_content(void)
 {
 {
-	int src, dst;
+	int src, dst, maxnode;
 	FILE *f;
 	FILE *f;
 
 
 	STARPU_ASSERT(was_benchmarked);
 	STARPU_ASSERT(was_benchmarked);
@@ -556,25 +720,38 @@ static void write_bus_bandwidth_file_content(void)
 		fprintf(f, "to %d\t\t", dst);
 		fprintf(f, "to %d\t\t", dst);
 	fprintf(f, "\n");
 	fprintf(f, "\n");
 
 
+        maxnode = ncuda;
+#ifdef STARPU_USE_OPENCL
+        maxnode += nopencl;
+#endif
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		{
 		{
 			double bandwidth;
 			double bandwidth;
-			
-			if ((src > ncuda) || (dst > ncuda))
+
+			if ((src > maxnode) || (dst > maxnode))
 			{
 			{
 				bandwidth = -1.0;
 				bandwidth = -1.0;
 			}
 			}
-#ifdef STARPU_USE_CUDA
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 			else if (src != dst)
 			else if (src != dst)
 			{
 			{
-			/* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
-				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
-				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
-				
+                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
+                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
+#ifdef STARPU_USE_CUDA
+				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
+                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
+#endif
+#ifdef STARPU_USE_OPENCL
+                                if (src > ncuda)
+                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
+                                if (dst > ncuda)
+                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
+#endif
+
 				double timing =time_src_to_ram + time_ram_to_dst;
 				double timing =time_src_to_ram + time_ram_to_dst;
-				
+
 				bandwidth = 1.0*SIZE/timing;
 				bandwidth = 1.0*SIZE/timing;
 			}
 			}
 #endif
 #endif
@@ -582,7 +759,7 @@ static void write_bus_bandwidth_file_content(void)
 			        /* convention */
 			        /* convention */
 			        bandwidth = 0.0;
 			        bandwidth = 0.0;
 			}
 			}
-			
+
 			fprintf(f, "%lf\t", bandwidth);
 			fprintf(f, "%lf\t", bandwidth);
 		}
 		}
 
 
@@ -595,7 +772,7 @@ static void write_bus_bandwidth_file_content(void)
 static void generate_bus_bandwidth_file(void)
 static void generate_bus_bandwidth_file(void)
 {
 {
 	if (!was_benchmarked)
 	if (!was_benchmarked)
-		benchmark_all_cuda_devices();
+		benchmark_all_gpu_devices();
 
 
 	write_bus_bandwidth_file_content();
 	write_bus_bandwidth_file_content();
 }
 }
@@ -618,6 +795,96 @@ static void load_bus_bandwidth_file(void)
 }
 }
 
 
 /*
 /*
+ *	Config
+ */
+static void get_config_path(char *path, size_t maxlen)
+{
+	get_bus_path("config", path, maxlen);
+}
+
+static void check_bus_config_file()
+{
+        int res;
+        char path[256];
+
+        get_config_path(path, 256);
+        res = access(path, F_OK);
+        if (res) {
+                starpu_force_bus_sampling();
+        }
+        else {
+                FILE *f;
+                int ret, read_cuda, read_opencl;
+                unsigned read_cpus;
+                struct starpu_machine_config_s *config = _starpu_get_machine_config();
+
+                // Loading configuration from file
+                f = fopen(path, "r");
+                STARPU_ASSERT(f);
+                starpu_drop_comments(f);
+                ret = fscanf(f, "%d\t", &read_cpus);
+		STARPU_ASSERT(ret == 1);
+                starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &read_cuda);
+		STARPU_ASSERT(ret == 1);
+                starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &read_opencl);
+		STARPU_ASSERT(ret == 1);
+                starpu_drop_comments(f);
+                fclose(f);
+
+                // Loading current configuration
+                ncpus = _starpu_topology_get_nhwcpu(config);
+#ifdef STARPU_USE_CUDA
+                cudaGetDeviceCount(&ncuda);
+#endif
+#ifdef STARPU_USE_OPENCL
+                nopencl = _starpu_opencl_get_device_count();
+#endif
+
+                // Checking if both configurations match
+                if (read_cpus != ncpus) {
+                        fprintf(stderr, "Current configuration does not match the performance model (CPUS: %d != %d)\n", read_cpus, ncpus);
+                        starpu_force_bus_sampling();
+                }
+                else if (read_cuda != ncuda) {
+                        fprintf(stderr, "Current configuration does not match the performance model (CUDA: %d != %d)\n", read_cuda, ncuda);
+                        starpu_force_bus_sampling();
+                }
+                else if (read_opencl != nopencl) {
+                        fprintf(stderr, "Current configuration does not match the performance model (OpenCL: %d != %d)\n", read_opencl, nopencl);
+                        starpu_force_bus_sampling();
+                }
+        }
+}
+
+static void write_bus_config_file_content(void)
+{
+	FILE *f;
+	char path[256];
+
+	STARPU_ASSERT(was_benchmarked);
+        get_config_path(path, 256);
+        f = fopen(path, "w+");
+	STARPU_ASSERT(f);
+
+        fprintf(f, "# Current configuration\n");
+        fprintf(f, "%d # Number of CPUs\n", ncpus);
+        fprintf(f, "%d # Number of CUDA devices\n", ncuda);
+        fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
+
+        fclose(f);
+}
+
+static void generate_bus_config_file()
+{
+	if (!was_benchmarked)
+		benchmark_all_gpu_devices();
+
+	write_bus_config_file_content();
+}
+
+/*
  *	Generic
  *	Generic
  */
  */
 
 
@@ -628,12 +895,14 @@ void starpu_force_bus_sampling(void)
 	generate_bus_affinity_file();
 	generate_bus_affinity_file();
 	generate_bus_latency_file();
 	generate_bus_latency_file();
 	generate_bus_bandwidth_file();
 	generate_bus_bandwidth_file();
+        generate_bus_config_file();
 }
 }
 
 
 void _starpu_load_bus_performance_files(void)
 void _starpu_load_bus_performance_files(void)
 {
 {
 	_starpu_create_sampling_directory_if_needed();
 	_starpu_create_sampling_directory_if_needed();
 
 
+        check_bus_config_file();
 	load_bus_affinity_file();
 	load_bus_affinity_file();
 	load_bus_latency_file();
 	load_bus_latency_file();
 	load_bus_bandwidth_file();
 	load_bus_bandwidth_file();

+ 173 - 21
src/core/topology.c

@@ -21,6 +21,7 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/topology.h>
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
+#include <common/hash.h>
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 #include <hwloc.h>
@@ -37,8 +38,16 @@ static unsigned topology_is_initialized = 0;
 
 
 static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config);
 static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config);
 
 
-#ifdef STARPU_USE_CUDA
-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config);
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+#  ifdef STARPU_USE_CUDA
+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config);
+static struct starpu_htbl32_node_s *devices_using_cuda = NULL;
+#  endif
+#  ifdef STARPU_USE_OPENCL
+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config);
+#  endif
+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus);
 static unsigned may_bind_automatically = 0;
 static unsigned may_bind_automatically = 0;
 #endif
 #endif
 
 
@@ -47,12 +56,70 @@ static unsigned may_bind_automatically = 0;
  */
  */
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config)
+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config)
+{
+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_cuda_gpuid,
+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_cuda_gpuid,
+                                         &(config->current_cuda_gpuid), (int *)config->workers_cuda_gpuid, "STARPU_WORKERS_CUDAID",
+                                         config->nhwcudagpus);
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config)
+{
+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_opencl_gpuid,
+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_opencl_gpuid,
+                                         &(config->current_opencl_gpuid), (int *)config->workers_opencl_gpuid, "STARPU_WORKERS_OPENCLID",
+                                         config->nhwopenclgpus);
+
+#ifdef STARPU_USE_CUDA
+        // Detect devices which are already used with CUDA
+        {
+                unsigned tmp[STARPU_NMAXWORKERS];
+                unsigned nb=0;
+                int i;
+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
+                        if (_starpu_htbl_search_32(devices_using_cuda, key) == NULL) {
+                                tmp[nb] = config->workers_opencl_gpuid[i];
+                                nb++;
+                        }
+                }
+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
+        }
+#endif /* STARPU_USE_CUDA */
+        {
+                // Detect identical devices
+                struct starpu_htbl32_node_s *devices_already_used = NULL;
+                unsigned tmp[STARPU_NMAXWORKERS];
+                unsigned nb=0;
+                int i;
+
+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
+                        if (_starpu_htbl_search_32(devices_already_used, key) == NULL) {
+                                _starpu_htbl_insert_32(&devices_already_used, key, config);
+                                tmp[nb] = config->workers_opencl_gpuid[i];
+                                nb ++;
+                        }
+                }
+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
+        }
+}
+#endif
+
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus)
 {
 {
 	char *strval;
 	char *strval;
 	unsigned i;
 	unsigned i;
 
 
-	config->current_gpuid = 0;
+	*current = 0;
 
 
 	/* conf->workers_bindid indicates the successive cpu identifier that
 	/* conf->workers_bindid indicates the successive cpu identifier that
 	 * should be used to bind the workers. It should be either filled
 	 * should be used to bind the workers. It should be either filled
@@ -62,14 +129,14 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 	 * cpus. */
 	 * cpus. */
 
 
 	/* what do we use, explicit value, env. variable, or round-robin ? */
 	/* what do we use, explicit value, env. variable, or round-robin ? */
-	if (config->user_conf && config->user_conf->use_explicit_workers_gpuid)
+	if (use_explicit_workers_gpuid)
 	{
 	{
 		/* we use the explicit value from the user */
 		/* we use the explicit value from the user */
-		memcpy(config->workers_gpuid,
-			config->user_conf->workers_gpuid,
-			STARPU_NMAXWORKERS*sizeof(unsigned));
+		memcpy(workers_gpuid,
+                       explicit_workers_gpuid,
+                       STARPU_NMAXWORKERS*sizeof(unsigned));
 	}
 	}
-	else if ((strval = getenv("STARPU_WORKERS_CUDAID")))
+	else if ((strval = getenv(varname)))
 	{
 	{
 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
@@ -86,7 +153,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 				val = strtol(strval, &endptr, 10);
 				val = strtol(strval, &endptr, 10);
 				if (endptr != strval)
 				if (endptr != strval)
 				{
 				{
-					config->workers_gpuid[i] = (unsigned)val;
+					workers_gpuid[i] = (unsigned)val;
 					strval = endptr;
 					strval = endptr;
 				}
 				}
 				else {
 				else {
@@ -97,11 +164,11 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 					/* there is no more values in the string */
 					/* there is no more values in the string */
 					wrap = 1;
 					wrap = 1;
 
 
-					config->workers_gpuid[i] = config->workers_gpuid[0];
+					workers_gpuid[i] = workers_gpuid[0];
 				}
 				}
 			}
 			}
 			else {
 			else {
-				config->workers_gpuid[i] = config->workers_gpuid[i % number_of_entries];
+				workers_gpuid[i] = workers_gpuid[i % number_of_entries];
 			}
 			}
 		}
 		}
 	}
 	}
@@ -109,7 +176,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 	{
 	{
 		/* by default, we take a round robin policy */
 		/* by default, we take a round robin policy */
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
-			config->workers_gpuid[i] = (unsigned)i;
+			workers_gpuid[i] = (unsigned)(i % nhwgpus);
 
 
 		/* StarPU can use sampling techniques to bind threads correctly */
 		/* StarPU can use sampling techniques to bind threads correctly */
 		may_bind_automatically = 1;
 		may_bind_automatically = 1;
@@ -117,11 +184,18 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 }
 }
 #endif
 #endif
 
 
-static inline int _starpu_get_next_gpuid(struct starpu_machine_config_s *config)
+static inline int _starpu_get_next_cuda_gpuid(struct starpu_machine_config_s *config)
 {
 {
-	unsigned i = ((config->current_gpuid++) % config->ncudagpus);
+	unsigned i = ((config->current_cuda_gpuid++) % config->ncudagpus);
 
 
-	return (int)config->workers_gpuid[i];
+	return (int)config->workers_cuda_gpuid[i];
+}
+
+static inline int _starpu_get_next_opencl_gpuid(struct starpu_machine_config_s *config)
+{
+	unsigned i = ((config->current_opencl_gpuid++) % config->nopenclgpus);
+
+	return (int)config->workers_opencl_gpuid[i];
 }
 }
 
 
 static void _starpu_init_topology(struct starpu_machine_config_s *config)
 static void _starpu_init_topology(struct starpu_machine_config_s *config)
@@ -152,7 +226,14 @@ static void _starpu_init_topology(struct starpu_machine_config_s *config)
 #warning no way to know number of cores, assuming 1
 #warning no way to know number of cores, assuming 1
 		config->nhwcpus = 1;
 		config->nhwcpus = 1;
 #endif
 #endif
-	
+
+#ifdef STARPU_USE_CUDA
+                config->nhwcudagpus = _starpu_get_cuda_device_count();
+#endif
+#ifdef STARPU_USE_OPENCL
+                config->nhwopenclgpus = _starpu_opencl_get_device_count();
+#endif
+
 		topology_is_initialized = 1;
 		topology_is_initialized = 1;
 	}
 	}
 }
 }
@@ -208,22 +289,79 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 	if (config->ncudagpus > 0)
 	if (config->ncudagpus > 0)
 		use_accelerator = 1;
 		use_accelerator = 1;
 
 
-	_starpu_initialize_workers_gpuid(config);
+	_starpu_initialize_workers_cuda_gpuid(config);
 
 
 	unsigned cudagpu;
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < config->ncudagpus; cudagpu++)
 	for (cudagpu = 0; cudagpu < config->ncudagpus; cudagpu++)
 	{
 	{
 		config->workers[config->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
 		config->workers[config->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
-		int devid = _starpu_get_next_gpuid(config);
+		int devid = _starpu_get_next_cuda_gpuid(config);
 		enum starpu_perf_archtype arch = STARPU_CUDA_DEFAULT + devid;
 		enum starpu_perf_archtype arch = STARPU_CUDA_DEFAULT + devid;
 		config->workers[config->nworkers + cudagpu].devid = devid;
 		config->workers[config->nworkers + cudagpu].devid = devid;
 		config->workers[config->nworkers + cudagpu].perf_arch = arch; 
 		config->workers[config->nworkers + cudagpu].perf_arch = arch; 
 		config->workers[config->nworkers + cudagpu].worker_mask = STARPU_CUDA;
 		config->workers[config->nworkers + cudagpu].worker_mask = STARPU_CUDA;
 		config->worker_mask |= STARPU_CUDA;
 		config->worker_mask |= STARPU_CUDA;
-	}
+
+                uint32_t key = _starpu_crc32_be(devid, 0);
+                _starpu_htbl_insert_32(&devices_using_cuda, key, config);
+        }
 
 
 	config->nworkers += config->ncudagpus;
 	config->nworkers += config->ncudagpus;
 #endif
 #endif
+
+#ifdef STARPU_USE_OPENCL
+	if (user_conf && (user_conf->nopencl == 0))
+	{
+		/* the user explicitely disabled OpenCL */
+		config->nopenclgpus = 0;
+	}
+	else {
+		/* we need to initialize OpenCL early to count the number of devices */
+		_starpu_opencl_init();
+
+		if (user_conf && (user_conf->nopencl != -1))
+		{
+			explicitval = user_conf->nopencl;
+		}
+		else {
+			explicitval = starpu_get_env_number("STARPU_NOPENCL");
+		}
+
+		if (explicitval < 0) {
+			config->nopenclgpus =
+				STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
+		} else {
+			/* use the specified value */
+			config->nopenclgpus = (unsigned)explicitval;
+			STARPU_ASSERT(config->nopenclgpus <= STARPU_MAXOPENCLDEVS);
+		}
+		STARPU_ASSERT(config->nopenclgpus + config->nworkers <= STARPU_NMAXWORKERS);
+	}
+
+	if (config->nopenclgpus > 0)
+		use_accelerator = 1;
+	// TODO: use_accelerator pour les OpenCL?
+
+	_starpu_initialize_workers_opencl_gpuid(config);
+
+	unsigned openclgpu;
+	for (openclgpu = 0; openclgpu < config->nopenclgpus; openclgpu++)
+	{
+		int devid = _starpu_get_next_opencl_gpuid(config);
+                if (devid == -1) { // There is no more devices left
+                  config->nopenclgpus = openclgpu;
+                  break;
+                }
+		config->workers[config->nworkers + openclgpu].arch = STARPU_OPENCL_WORKER;
+		enum starpu_perf_archtype arch = STARPU_OPENCL_DEFAULT + devid;
+		config->workers[config->nworkers + openclgpu].devid = devid;
+		config->workers[config->nworkers + openclgpu].perf_arch = arch; 
+		config->workers[config->nworkers + openclgpu].worker_mask = STARPU_OPENCL;
+		config->worker_mask |= STARPU_OPENCL;
+	}
+
+	config->nworkers += config->nopenclgpus;
+#endif
 	
 	
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 	if (user_conf && (user_conf->ncuda != -1)) {
 	if (user_conf && (user_conf->ncuda != -1)) {
@@ -500,13 +638,27 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				if (may_bind_automatically)
 				if (may_bind_automatically)
 				{
 				{
 					/* StarPU is allowed to bind threads automatically */
 					/* StarPU is allowed to bind threads automatically */
-					preferred_binding = _starpu_get_gpu_affinity_vector(workerarg->devid);
+					preferred_binding = _starpu_get_cuda_affinity_vector(workerarg->devid);
 					npreferred = config->nhwcpus;
 					npreferred = config->nhwcpus;
 				}
 				}
 				is_a_set_of_accelerators = 0;
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
 				break;
 				break;
 #endif
 #endif
+
+#ifdef STARPU_USE_OPENCL
+		        case STARPU_OPENCL_WORKER:
+				if (may_bind_automatically)
+				{
+					/* StarPU is allowed to bind threads automatically */
+					preferred_binding = _starpu_get_opencl_affinity_vector(workerarg->devid);
+					npreferred = config->nhwcpus;
+				}
+				is_a_set_of_accelerators = 0;
+				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
+				break;
+#endif
+
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}

+ 28 - 0
src/core/workers.c

@@ -59,6 +59,11 @@ inline uint32_t _starpu_may_submit_cpu_task(void)
 	return (STARPU_CPU & config.worker_mask);
 	return (STARPU_CPU & config.worker_mask);
 }
 }
 
 
+inline uint32_t _starpu_may_submit_opencl_task(void)
+{
+	return (STARPU_OPENCL & config.worker_mask);
+}
+
 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where)
 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where)
 {
 {
 	return (where & config.workers[workerid].worker_mask);
 	return (where & config.workers[workerid].worker_mask);
@@ -90,6 +95,9 @@ static void _starpu_init_worker_queue(struct starpu_worker_s *workerarg)
 		case STARPU_CUDA_WORKER:
 		case STARPU_CUDA_WORKER:
 			jobq->alpha = STARPU_CUDA_ALPHA;
 			jobq->alpha = STARPU_CUDA_ALPHA;
 			break;
 			break;
+		case STARPU_OPENCL_WORKER:
+			jobq->alpha = STARPU_OPENCL_ALPHA;
+			break;
 		case STARPU_GORDON_WORKER:
 		case STARPU_GORDON_WORKER:
 			jobq->alpha = STARPU_GORDON_ALPHA;
 			jobq->alpha = STARPU_GORDON_ALPHA;
 			break;
 			break;
@@ -151,6 +159,15 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 
 
 				break;
 				break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+			case STARPU_OPENCL_WORKER:
+				workerarg->set = NULL;
+				workerarg->worker_is_initialized = 0;
+				pthread_create(&workerarg->worker_thread, 
+						NULL, _starpu_opencl_worker, workerarg);
+
+				break;
+#endif
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 			case STARPU_GORDON_WORKER:
 				/* we will only launch gordon once, but it will handle 
 				/* we will only launch gordon once, but it will handle 
@@ -192,6 +209,7 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 		switch (workerarg->arch) {
 		switch (workerarg->arch) {
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
+			case STARPU_OPENCL_WORKER:			  
 				PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 				PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 				while (!workerarg->worker_is_initialized)
 				while (!workerarg->worker_is_initialized)
 					PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
 					PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
@@ -523,6 +541,11 @@ unsigned starpu_get_cuda_worker_count(void)
 	return config.ncudagpus;
 	return config.ncudagpus;
 }
 }
 
 
+unsigned starpu_get_opencl_worker_count(void)
+{
+	return config.nopenclgpus;
+}
+
 unsigned starpu_get_spu_worker_count(void)
 unsigned starpu_get_spu_worker_count(void)
 {
 {
 	return config.ngordon_spus;
 	return config.ngordon_spus;
@@ -549,6 +572,11 @@ int starpu_get_worker_id(void)
 	}
 	}
 }
 }
 
 
+int starpu_get_worker_devid(int id)
+{
+	return config.workers[id].devid;
+}
+
 struct starpu_worker_s *_starpu_get_worker_struct(unsigned id)
 struct starpu_worker_s *_starpu_get_worker_struct(unsigned id)
 {
 {
 	return &config.workers[id];
 	return &config.workers[id];

+ 16 - 3
src/core/workers.h

@@ -42,6 +42,10 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
 #endif
 #endif
 
 
+#ifdef STARPU_USE_OPENCL
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 #include <drivers/gordon/driver_gordon.h>
 #include <drivers/gordon/driver_gordon.h>
 #endif
 #endif
@@ -52,6 +56,7 @@
 
 
 #define STARPU_CPU_ALPHA	1.0f
 #define STARPU_CPU_ALPHA	1.0f
 #define STARPU_CUDA_ALPHA	13.33f
 #define STARPU_CUDA_ALPHA	13.33f
+#define STARPU_OPENCL_ALPHA	12.22f
 #define STARPU_GORDON_ALPHA	6.0f /* XXX this is a random value ... */
 #define STARPU_GORDON_ALPHA	6.0f /* XXX this is a random value ... */
 
 
 #ifdef STARPU_DATA_STATS
 #ifdef STARPU_DATA_STATS
@@ -105,18 +110,25 @@ struct starpu_machine_config_s {
 #endif
 #endif
 
 
 	unsigned nhwcpus;
 	unsigned nhwcpus;
+        unsigned nhwcudagpus;
+        unsigned nhwopenclgpus;
 
 
 	unsigned ncpus;
 	unsigned ncpus;
 	unsigned ncudagpus;
 	unsigned ncudagpus;
+	unsigned nopenclgpus;
 	unsigned ngordon_spus;
 	unsigned ngordon_spus;
 
 
 	/* Where to bind workers ? */
 	/* Where to bind workers ? */
 	int current_bindid;
 	int current_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	
 	
-	/* Which GPU(s) do we use ? */
-	int current_gpuid;
-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
+	/* Which GPU(s) do we use for CUDA ? */
+	int current_cuda_gpuid;
+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
+
+	/* Which GPU(s) do we use for OpenCL ? */
+	int current_opencl_gpuid;
+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 	
 	
 	struct starpu_worker_s workers[STARPU_NMAXWORKERS];
 	struct starpu_worker_s workers[STARPU_NMAXWORKERS];
 	uint32_t worker_mask;
 	uint32_t worker_mask;
@@ -138,6 +150,7 @@ unsigned _starpu_machine_is_running(void);
 inline uint32_t _starpu_worker_exists(uint32_t task_mask);
 inline uint32_t _starpu_worker_exists(uint32_t task_mask);
 inline uint32_t _starpu_may_submit_cuda_task(void);
 inline uint32_t _starpu_may_submit_cuda_task(void);
 inline uint32_t _starpu_may_submit_cpu_task(void);
 inline uint32_t _starpu_may_submit_cpu_task(void);
+inline uint32_t _starpu_may_submit_opencl_task(void);
 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where);
 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where);
 unsigned _starpu_worker_can_block(unsigned memnode);
 unsigned _starpu_worker_can_block(unsigned memnode);
 
 

+ 6 - 4
src/datawizard/coherency.c

@@ -25,8 +25,8 @@ uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_n
 	/* in case one of the node is a GPU, it needs to perform the transfer,
 	/* in case one of the node is a GPU, it needs to perform the transfer,
 	 * if both of them are GPU, it's a bit more complicated (TODO !) */
 	 * if both of them are GPU, it's a bit more complicated (TODO !) */
 
 
-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM);
+	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
+	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
 
 
 	/* we do not handle GPU->GPU transfers yet ! */
 	/* we do not handle GPU->GPU transfers yet ! */
 	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
 	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
@@ -77,6 +77,8 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 			 * 	other should be ok */
 			 * 	other should be ok */
 			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
 			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
 				break;
 				break;
+			if (_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
+				break;
 
 
 			/* XXX do a better algorithm to distribute the memory copies */
 			/* XXX do a better algorithm to distribute the memory copies */
 			/* TODO : use the "requesting_node" as an argument to do so */
 			/* TODO : use the "requesting_node" as an argument to do so */
@@ -181,8 +183,8 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, uint32_t requesting_no
 			STARPU_ASSERT(src_node != requesting_node);
 			STARPU_ASSERT(src_node != requesting_node);
 		}
 		}
 	
 	
-		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
-		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM);
+		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
+		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(requesting_node) == STARPU_OPENCL_RAM);
 
 
 		/* we have to perform 2 successive requests for GPU->GPU transfers */
 		/* we have to perform 2 successive requests for GPU->GPU transfers */
 		if (read && (src_is_a_gpu && dst_is_a_gpu)) {
 		if (read && (src_is_a_gpu && dst_is_a_gpu)) {

+ 67 - 0
src/datawizard/copy_driver.c

@@ -135,6 +135,28 @@ cudaStream_t *stream;
 				}
 				}
 				break;
 				break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+    		        case STARPU_OPENCL_RAM:
+				/* OpenCL -> RAM */
+				if (_starpu_get_local_memory_node() == src_node)
+				{
+					STARPU_ASSERT(copy_methods->opencl_to_ram);
+					if (!req || !copy_methods->opencl_to_ram_async)
+					{
+						/* this is not associated to a request so it's synchronous */
+                                                copy_methods->opencl_to_ram(handle, src_node, dst_node);
+                                        }
+                                        else {
+                                                ret = copy_methods->opencl_to_ram_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
+                                        }
+				}
+				else
+				{
+					/* we should not have a blocking call ! */
+					STARPU_ABORT();
+				}
+				break;
+#endif
 			case STARPU_SPU_LS:
 			case STARPU_SPU_LS:
 				STARPU_ABORT(); // TODO
 				STARPU_ABORT(); // TODO
 				break;
 				break;
@@ -180,6 +202,34 @@ cudaStream_t *stream;
 		}
 		}
 		break;
 		break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_RAM:
+		switch (src_kind) {
+		        case STARPU_RAM:
+				/* STARPU_RAM -> STARPU_OPENCL_RAM */
+				STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
+				STARPU_ASSERT(copy_methods->ram_to_opencl);
+				if (!req || !copy_methods->ram_to_opencl_async)
+				{
+					/* this is not associated to a request so it's synchronous */
+					copy_methods->ram_to_opencl(handle, src_node, dst_node);
+				}
+				else {
+                                        ret = copy_methods->ram_to_opencl_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
+				}
+				break;
+			case STARPU_CUDA_RAM:
+			case STARPU_OPENCL_RAM:
+			case STARPU_SPU_LS:
+				STARPU_ABORT(); // TODO 
+				break;
+			case STARPU_UNUSED:
+			default:
+				STARPU_ABORT();
+				break;
+		}
+		break;
+#endif
 	case STARPU_SPU_LS:
 	case STARPU_SPU_LS:
 		STARPU_ABORT(); // TODO
 		STARPU_ABORT(); // TODO
 		break;
 		break;
@@ -275,6 +325,12 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        fprintf(stderr, "not implemented yet\n");
+			STARPU_ABORT();
+                        break;
+#endif
 		case STARPU_RAM:
 		case STARPU_RAM:
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
@@ -301,6 +357,17 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        {
+                                cl_int event_status;
+                                cl_event opencl_event = (*async_channel).opencl_event;
+                                if (opencl_event == NULL) STARPU_ABORT();
+                                clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+                                success = (event_status == CL_COMPLETE);
+                                break;
+                        }
+#endif
 		case STARPU_RAM:
 		case STARPU_RAM:
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();

+ 23 - 0
src/datawizard/copy_driver.h

@@ -28,6 +28,10 @@
 #include <cublas.h>
 #include <cublas.h>
 #endif
 #endif
 
 
+#ifdef STARPU_USE_OPENCL
+#include <CL/cl.h>
+#endif
+
 struct starpu_data_request_s;
 struct starpu_data_request_s;
 
 
 /* this is a structure that can be queried to see whether an asynchronous
 /* this is a structure that can be queried to see whether an asynchronous
@@ -37,24 +41,36 @@ typedef union {
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t cuda_event;
 	cudaEvent_t cuda_event;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+        cl_event opencl_event;
+#endif
 } starpu_async_channel;
 } starpu_async_channel;
 
 
 struct starpu_copy_data_methods_s {
 struct starpu_copy_data_methods_s {
 	/* src type is ram */
 	/* src type is ram */
 	int (*ram_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*ram_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
 
 	/* src type is cuda */
 	/* src type is cuda */
 	int (*cuda_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*cuda_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
 
 	/* src type is spu */
 	/* src type is spu */
 	int (*spu_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*spu_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
 
+	/* src type is opencl */
+	int (*opencl_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*opencl_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*opencl_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*opencl_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
 	/* for asynchronous CUDA transfers */
 	int (*ram_to_cuda_async)(starpu_data_handle handle, uint32_t src,
 	int (*ram_to_cuda_async)(starpu_data_handle handle, uint32_t src,
@@ -64,6 +80,13 @@ struct starpu_copy_data_methods_s {
 	int (*cuda_to_cuda_async)(starpu_data_handle handle, uint32_t src,
 	int (*cuda_to_cuda_async)(starpu_data_handle handle, uint32_t src,
 					uint32_t dst, cudaStream_t *stream);
 					uint32_t dst, cudaStream_t *stream);
 #endif
 #endif
+
+#ifdef STARPU_USE_OPENCL
+	/* for asynchronous OpenCL transfers */
+        int (*ram_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
+	int (*opencl_to_ram_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
+	int (*opencl_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
+#endif
 };
 };
 
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);

+ 121 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -22,6 +22,11 @@
 #include <datawizard/filters.h>
 #include <datawizard/filters.h>
 #include <common/hash.h>
 #include <common/hash.h>
 
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
 /*
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
  */
@@ -31,6 +36,10 @@ static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, u
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+#endif
 
 
 static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -39,6 +48,10 @@ static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.cuda_to_ram = copy_cuda_to_ram,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_ram = NULL,
@@ -274,6 +287,27 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+                        {
+                                int ret;
+                                void *ptr;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
+                                addr_nzval = (uintptr_t)ptr;
+                                if (ret) goto fail_nzval;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_colind = ptr;
+				if (ret) goto fail_colind;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_rowptr = ptr;
+				if (ret) goto fail_rowptr;
+
+                                break;
+                        }
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -298,6 +332,11 @@ fail_rowptr:
 			cudaFree((void*)addr_colind);
 			cudaFree((void*)addr_colind);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_colind);
+			break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -311,6 +350,11 @@ fail_colind:
 			cudaFree((void*)addr_nzval);
 			cudaFree((void*)addr_nzval);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_nzval);
+			break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -341,6 +385,13 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)bcsr_interface->rowptr);
 			cudaFree((void*)bcsr_interface->rowptr);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)bcsr_interface->nzval);
+			clReleaseMemObject((void*)bcsr_interface->colind);
+			clReleaseMemObject((void*)bcsr_interface->rowptr);
+			break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -420,6 +471,76 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 }
 }
 #endif // STARPU_USE_CUDA
 #endif // STARPU_USE_CUDA
 
 
+#ifdef STARPU_USE_OPENCL
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+        int err;
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+        int err;
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+#endif // STARPU_USE_OPENCL
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 {

+ 122 - 0
src/datawizard/interfaces/block_interface.c

@@ -22,6 +22,11 @@
 
 
 #include <common/hash.h>
 #include <common/hash.h>
 
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
@@ -29,6 +34,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 
 static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -39,6 +50,12 @@ static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_ram = NULL,
@@ -94,11 +111,15 @@ static void register_block_handle(starpu_data_handle handle, uint32_t home_node,
 
 
 		if (node == home_node) {
 		if (node == home_node) {
 			local_interface->ptr = block_interface->ptr;
 			local_interface->ptr = block_interface->ptr;
+                        local_interface->dev_handle = block_interface->dev_handle;
+                        local_interface->offset = block_interface->offset;
 			local_interface->ldy  = block_interface->ldy;
 			local_interface->ldy  = block_interface->ldy;
 			local_interface->ldz  = block_interface->ldz;
 			local_interface->ldz  = block_interface->ldz;
 		}
 		}
 		else {
 		else {
 			local_interface->ptr = 0;
 			local_interface->ptr = 0;
+                        local_interface->dev_handle = 0;
+                        local_interface->offset = 0;
 			local_interface->ldy  = 0;
 			local_interface->ldy  = 0;
 			local_interface->ldz  = 0;
 			local_interface->ldz  = 0;
 		}
 		}
@@ -117,6 +138,8 @@ void starpu_register_block_data(starpu_data_handle *handleptr, uint32_t home_nod
 {
 {
 	starpu_block_interface_t interface = {
 	starpu_block_interface_t interface = {
 		.ptr = ptr,
 		.ptr = ptr,
+                .dev_handle = ptr,
+                .offset = 0,
 		.ldy = ldy,
 		.ldy = ldy,
 		.ldz = ldz,
 		.ldz = ldz,
 		.nx = nx,
 		.nx = nx,
@@ -278,6 +301,19 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -288,6 +324,8 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
 
 		/* update the data properly in consequence */
 		/* update the data properly in consequence */
 		dst_block->ptr = addr;
 		dst_block->ptr = addr;
+                dst_block->dev_handle = addr;
+                dst_block->offset = 0;
 		dst_block->ldy = nx;
 		dst_block->ldy = nx;
 		dst_block->ldz = nx*ny;
 		dst_block->ldz = nx*ny;
 	} else {
 	} else {
@@ -319,6 +357,11 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void *)block_interface->ptr);
+                        break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -660,6 +703,85 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 }
 }
 #endif // STARPU_USE_CUDA
 #endif // STARPU_USE_CUDA
 
 
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                dst_block->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                  src_block->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                dst_block->offset, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                  src_block->offset, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return 0;
+}
+
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 {

+ 114 - 0
src/datawizard/interfaces/csr_interface.c

@@ -22,12 +22,20 @@
 
 
 #include <common/hash.h>
 #include <common/hash.h>
 
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+#endif
 
 
 static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -36,6 +44,10 @@ static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.cuda_to_ram = copy_cuda_to_ram,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_ram = NULL,
@@ -246,6 +258,27 @@ static size_t allocate_csr_buffer_on_node(starpu_data_handle handle, uint32_t ds
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
+                                addr_nzval = (uintptr_t)ptr;
+				if (ret) goto fail_nzval;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_colind = ptr;
+				if (ret) goto fail_colind;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_rowptr = ptr;
+				if (ret) goto fail_rowptr;
+
+				break;
+			}
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -270,6 +303,11 @@ fail_rowptr:
 			cudaFree((void*)addr_colind);
 			cudaFree((void*)addr_colind);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_colind);
+			break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -283,6 +321,11 @@ fail_colind:
 			cudaFree((void*)addr_nzval);
 			cudaFree((void*)addr_nzval);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_nzval);
+			break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -313,6 +356,13 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)csr_interface->rowptr);
 			cudaFree((void*)csr_interface->rowptr);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)csr_interface->nzval);
+			clReleaseMemObject((void*)csr_interface->colind);
+			clReleaseMemObject((void*)csr_interface->rowptr);
+			break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -386,6 +436,70 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 }
 }
 #endif // STARPU_USE_CUDA
 #endif // STARPU_USE_CUDA
 
 
+#ifdef STARPU_USE_OPENCL
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_csr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+        int err;
+
+        err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+        err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_csr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+        int err;
+
+        err = _starpu_opencl_copy_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+        err = _starpu_opencl_copy_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+#endif // STARPU_USE_OPENCL
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 {

+ 4 - 0
src/datawizard/interfaces/matrix_filters.c

@@ -69,6 +69,8 @@ void starpu_block_filter_func(starpu_filter *f, starpu_data_handle root_handle)
 
 
 				local->ptr = local_root->ptr + offset;
 				local->ptr = local_root->ptr + offset;
 				local->ld = local_root->ld;
 				local->ld = local_root->ld;
+                                local->dev_handle = local_root->dev_handle;
+                                local->offset = local_root->offset + offset;
 			}
 			}
 		}
 		}
 	}
 	}
@@ -122,6 +124,8 @@ void starpu_vertical_block_filter_func(starpu_filter *f, starpu_data_handle root
 					(size_t)chunk*chunk_size*local_root->ld*elemsize;
 					(size_t)chunk*chunk_size*local_root->ld*elemsize;
 				local->ptr = local_root->ptr + offset;
 				local->ptr = local_root->ptr + offset;
 				local->ld = local_root->ld;
 				local->ld = local_root->ld;
+                                local->dev_handle = local_root->dev_handle;
+                                local->offset = local_root->offset + offset;
 			}
 			}
 		}
 		}
 	}
 	}

+ 120 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -26,6 +26,10 @@
 #include <cuda.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime.h>
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 
 static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_ram = NULL,
@@ -106,10 +122,14 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 
 
 		if (node == home_node) {
 		if (node == home_node) {
 			local_interface->ptr = matrix_interface->ptr;
 			local_interface->ptr = matrix_interface->ptr;
+                        local_interface->dev_handle = matrix_interface->dev_handle;
+                        local_interface->offset = matrix_interface->offset;
 			local_interface->ld  = matrix_interface->ld;
 			local_interface->ld  = matrix_interface->ld;
 		}
 		}
 		else {
 		else {
 			local_interface->ptr = 0;
 			local_interface->ptr = 0;
+			local_interface->dev_handle = 0;
+			local_interface->offset = 0;
 			local_interface->ld  = 0;
 			local_interface->ld  = 0;
 		}
 		}
 
 
@@ -129,7 +149,9 @@ void starpu_register_matrix_data(starpu_data_handle *handleptr, uint32_t home_no
 		.ld = ld,
 		.ld = ld,
 		.nx = nx,
 		.nx = nx,
 		.ny = ny,
 		.ny = ny,
-		.elemsize = elemsize
+		.elemsize = elemsize,
+                .dev_handle = ptr,
+                .offset = 0
 	};
 	};
 
 
 	_starpu_register_data_handle(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
 	_starpu_register_data_handle(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
@@ -256,6 +278,19 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -266,6 +301,8 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
 
 		/* update the data properly in consequence */
 		/* update the data properly in consequence */
 		interface->ptr = addr;
 		interface->ptr = addr;
+                interface->dev_handle = addr;
+                interface->offset = 0;
 		interface->ld = ld;
 		interface->ld = ld;
 	} else {
 	} else {
 		/* allocation failed */
 		/* allocation failed */
@@ -296,6 +333,11 @@ static void liberate_matrix_buffer_on_node(void *interface, uint32_t node)
 
 
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void *)matrix_interface->ptr);
+                        break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -423,6 +465,83 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
 
 #endif // STARPU_USE_CUDA
 #endif // STARPU_USE_CUDA
 
 
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle, src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                dst_matrix->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+                                                  src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                  src_matrix->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle, src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                dst_matrix->offset, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+                                                  src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                  src_matrix->offset, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return 0;
+}
+
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 {

+ 109 - 0
src/datawizard/interfaces/variable_interface.c

@@ -26,6 +26,10 @@
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 
 static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_ram = NULL,
@@ -192,6 +208,19 @@ static size_t allocate_variable_buffer_on_node(starpu_data_handle handle, uint32
 			}
 			}
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -220,6 +249,11 @@ static void liberate_variable_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)STARPU_GET_VARIABLE_PTR(interface));
 			cudaFree((void*)STARPU_GET_VARIABLE_PTR(interface));
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void*)STARPU_GET_VARIABLE_PTR(interface));
+                        break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -324,6 +358,81 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
 
 #endif // STARPU_USE_CUDA
 #endif // STARPU_USE_CUDA
 
 
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+                                                0, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+                                                  0, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+                                                0, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+                                                  0, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return 0;
+}
+
+#endif
+
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 {
 	starpu_variable_interface_t *src_variable;
 	starpu_variable_interface_t *src_variable;

+ 8 - 0
src/datawizard/interfaces/vector_filters.c

@@ -62,6 +62,8 @@ void starpu_block_filter_func_vector(starpu_filter *f, starpu_data_handle root_h
 					starpu_data_get_interface_on_node(root_handle, node);
 					starpu_data_get_interface_on_node(root_handle, node);
 
 
 				local->ptr = local_root->ptr + offset;
 				local->ptr = local_root->ptr + offset;
+                                local->dev_handle = local_root->dev_handle;
+                                local->offset = local_root->offset + offset;
 			}
 			}
 		}
 		}
 	}
 	}
@@ -100,6 +102,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 				starpu_data_get_interface_on_node(root_handle, node);
 				starpu_data_get_interface_on_node(root_handle, node);
 
 
 			local->ptr = local_root->ptr;
 			local->ptr = local_root->ptr;
+                        local->offset = local_root->offset;
+                        local->dev_handle = local_root->dev_handle;
 		}
 		}
 	}
 	}
 
 
@@ -119,6 +123,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 				starpu_data_get_interface_on_node(root_handle, node);
 				starpu_data_get_interface_on_node(root_handle, node);
 
 
 			local->ptr = local_root->ptr + length_first*elemsize;
 			local->ptr = local_root->ptr + length_first*elemsize;
+                        local->offset = local_root->offset + length_first*elemsize;
+                        local->dev_handle = local_root->dev_handle;
 		}
 		}
 	}
 	}
 }
 }
@@ -161,6 +167,8 @@ void starpu_list_filter_func_vector(starpu_filter *f, starpu_data_handle root_ha
 					starpu_data_get_interface_on_node(root_handle, node);
 					starpu_data_get_interface_on_node(root_handle, node);
 
 
 				local->ptr = local_root->ptr + current_pos*elemsize;
 				local->ptr = local_root->ptr + current_pos*elemsize;
+                                local->offset = local_root->offset + current_pos*elemsize;
+                                local->dev_handle = local_root->dev_handle;
 			}
 			}
 		}
 		}
 
 

+ 118 - 2
src/datawizard/interfaces/vector_interface.c

@@ -1,6 +1,6 @@
 /*
 /*
  * StarPU
  * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
  *
  *
  * This program is free software; you can redistribute it and/or modify
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,10 @@
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda.h>
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -33,6 +37,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 
 static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -43,6 +53,12 @@ static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_ram = NULL,
@@ -87,9 +103,13 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 
 
 		if (node == home_node) {
 		if (node == home_node) {
 			local_interface->ptr = vector_interface->ptr;
 			local_interface->ptr = vector_interface->ptr;
+                        local_interface->dev_handle = vector_interface->dev_handle;
+                        local_interface->offset = vector_interface->offset;
 		}
 		}
 		else {
 		else {
 			local_interface->ptr = 0;
 			local_interface->ptr = 0;
+                        local_interface->dev_handle = 0;
+                        local_interface->offset = 0;
 		}
 		}
 
 
 		local_interface->nx = vector_interface->nx;
 		local_interface->nx = vector_interface->nx;
@@ -116,7 +136,9 @@ void starpu_register_vector_data(starpu_data_handle *handleptr, uint32_t home_no
 	starpu_vector_interface_t vector = {
 	starpu_vector_interface_t vector = {
 		.ptr = ptr,
 		.ptr = ptr,
 		.nx = nx,
 		.nx = nx,
-		.elemsize = elemsize
+		.elemsize = elemsize,
+                .dev_handle = ptr,
+                .offset = 0
 	};	
 	};	
 
 
 	_starpu_register_data_handle(handleptr, home_node, &vector, &interface_vector_ops); 
 	_starpu_register_data_handle(handleptr, home_node, &vector, &interface_vector_ops); 
@@ -216,6 +238,19 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 			}
 			}
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -228,6 +263,8 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
 
 	/* update the data properly in consequence */
 	/* update the data properly in consequence */
 	interface->ptr = addr;
 	interface->ptr = addr;
+        interface->dev_handle = addr;
+        interface->offset = 0;
 	
 	
 	return allocated_memory;
 	return allocated_memory;
 }
 }
@@ -246,6 +283,11 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)vector_interface->ptr);
 			cudaFree((void*)vector_interface->ptr);
 			break;
 			break;
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void *)vector_interface->ptr);
+                        break;
+#endif
 		default:
 		default:
 			assert(0);
 			assert(0);
 	}
 	}
@@ -349,6 +391,80 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
 
 
 
 #endif // STARPU_USE_CUDA
 #endif // STARPU_USE_CUDA
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
+                                                dst_vector->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+                                                  src_vector->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
+                                                dst_vector->offset, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+                                                  src_vector->offset, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return 0;
+}
+
+#endif
 
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 {

+ 2 - 1
src/datawizard/memory_nodes.h

@@ -26,7 +26,8 @@ typedef enum {
 	STARPU_UNUSED,
 	STARPU_UNUSED,
 	STARPU_SPU_LS,
 	STARPU_SPU_LS,
 	STARPU_RAM,
 	STARPU_RAM,
-	STARPU_CUDA_RAM
+	STARPU_CUDA_RAM,
+        STARPU_OPENCL_RAM,
 } starpu_node_kind;
 } starpu_node_kind;
 
 
 typedef struct {
 typedef struct {

+ 422 - 0
src/drivers/opencl/driver_opencl.c

@@ -0,0 +1,422 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <common/config.h>
+#include <common/utils.h>
+#include <core/debug.h>
+#include <starpu_opencl.h>
+#include "driver_opencl.h"
+#include "driver_opencl_utils.h"
+#include <common/utils.h>
+
+static cl_context contexts[STARPU_MAXOPENCLDEVS];
+static cl_device_id devices[STARPU_MAXOPENCLDEVS];
+static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
+static cl_uint nb_devices = -1;
+static int init_done = 0;
+extern char *_starpu_opencl_codelet_dir;
+
+void starpu_opencl_get_context(int devid, cl_context *context)
+{
+        *context = contexts[devid];
+}
+
+void starpu_opencl_get_device(int devid, cl_device_id *device)
+{
+        *device = devices[devid];
+}
+
+void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
+{
+        *queue = queues[devid];
+}
+
+int _starpu_opencl_init_context(int devid)
+{
+	cl_int err;
+        cl_device_id device;
+
+        _STARPU_OPENCL_DEBUG("Initialising context for dev %d\n", devid);
+
+        // Create a compute context
+        device = devices[devid];
+        contexts[devid] = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        // Create queue for the given device
+        queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        _starpu_opencl_init_programs(devid);
+
+	return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_deinit_context(int devid)
+{
+        int err;
+
+        _STARPU_OPENCL_DEBUG("De-initialising context for dev %d\n", devid);
+
+        err = clReleaseContext(contexts[devid]);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        err = clReleaseCommandQueue(queues[devid]);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        _starpu_opencl_release_programs(devid);
+
+        return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
+{
+	cl_int err;
+        cl_mem address;
+        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+
+	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        *addr = address;
+        return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
+{
+      int err;
+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+
+      if (event == NULL) {
+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
+      }
+      else {
+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
+      }
+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+      return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
+{
+      int err;
+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+
+      if (event == NULL) {
+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
+      }
+      else {
+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
+      }
+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+      return EXIT_SUCCESS;
+}
+
+void _starpu_opencl_init()
+{
+   if (!init_done) {
+           cl_platform_id platform_id[STARPU_OPENCL_PLATFORM_MAX];
+           cl_uint nb_platforms;
+           cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+           cl_int err;
+
+           _STARPU_OPENCL_DEBUG("Initialising OpenCL\n");
+
+           // Get Platforms
+           err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
+           if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+           _STARPU_OPENCL_DEBUG("Platforms detected: %d\n", nb_platforms);
+
+           // Get devices
+           nb_devices = 0;
+           {
+                   unsigned int i;
+                   for (i=0; i<nb_platforms; i++) {
+                           cl_uint num;
+
+#ifdef STARPU_VERBOSE
+                           {
+                                   char name[1024], vendor[1024];
+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
+                                   _STARPU_OPENCL_DEBUG("Platform: %s - %s\n", name, vendor);
+                           }
+#endif
+                           err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
+                           if (err == CL_DEVICE_NOT_FOUND) {
+                                   _STARPU_OPENCL_DEBUG("  No devices detected on this platform\n");
+                           }
+                           else {
+                                   if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                                   _STARPU_OPENCL_DEBUG("  %d devices detected\n", num);
+                                   nb_devices += num;
+                           }
+         }
+      }
+
+      // Get location of OpenCl codelet source files
+      _starpu_opencl_codelet_dir = getenv("STARPU_OPENCL_CODELET_DIR");
+
+      init_done=1;
+   }
+}
+
+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args);
+
+void *_starpu_opencl_worker(void *arg)
+{
+	struct starpu_worker_s* args = arg;
+
+	int devid = args->devid;
+	unsigned memory_node = args->memory_node;
+
+#ifdef USE_FXT
+	fxt_register_thread(args->bindid);
+#endif
+
+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+
+	_starpu_set_local_memory_node_key(&(args->memory_node));
+
+	_starpu_set_local_queue(args->jobq);
+
+	_starpu_set_local_worker_key(args);
+
+	/* this is only useful (and meaningful) is there is a single
+	   memory node "related" to that queue */
+	args->jobq->memory_node = memory_node;
+
+	args->jobq->total_computation_time = 0.0;
+	args->jobq->total_communication_time = 0.0;
+	args->jobq->total_computation_time_error = 0.0;
+	args->jobq->total_job_performed = 0;
+
+	_starpu_opencl_init_context(devid);
+
+	/* one more time to avoid hacks from third party lib :) */
+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+
+	args->status = STATUS_UNKNOWN;
+
+	/* get the device's name */
+	char devname[128];
+	_starpu_opencl_get_device_name(devid, devname, 128);
+	snprintf(args->name, 32, "OpenCL %d (%s)", args->devid, devname);
+
+	_STARPU_OPENCL_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+
+	STARPU_TRACE_WORKER_INIT_END
+
+	/* tell the main thread that this one is ready */
+	PTHREAD_MUTEX_LOCK(&args->mutex);
+	args->worker_is_initialized = 1;
+	PTHREAD_COND_SIGNAL(&args->ready_cond);
+	PTHREAD_MUTEX_UNLOCK(&args->mutex);
+
+	struct starpu_job_s * j;
+	int res;
+
+	struct starpu_sched_policy_s *policy = _starpu_get_sched_policy();
+	struct starpu_jobq_s *queue = policy->starpu_get_local_queue(policy);
+	unsigned memnode = args->memory_node;
+
+	while (_starpu_machine_is_running())
+	{
+		STARPU_TRACE_START_PROGRESS(memnode);
+		_starpu_datawizard_progress(memnode, 1);
+		STARPU_TRACE_END_PROGRESS(memnode);
+
+		_starpu_execute_registered_progression_hooks();
+
+		_starpu_jobq_lock(queue);
+
+		/* perhaps there is some local task to be executed first */
+		j = _starpu_pop_local_task(args);
+
+		/* otherwise ask a task to the scheduler */
+		if (!j)
+			j = _starpu_pop_task();
+
+		if (j == NULL) {
+			if (_starpu_worker_can_block(memnode))
+				PTHREAD_COND_WAIT(&queue->activity_cond, &queue->activity_mutex);
+			_starpu_jobq_unlock(queue);
+			continue;
+		}
+
+		_starpu_jobq_unlock(queue);
+
+		/* can OpenCL do that task ? */
+		if (!STARPU_OPENCL_MAY_PERFORM(j))
+		{
+			/* this is not a OpenCL task */
+			_starpu_push_task(j);
+			continue;
+		}
+
+		_starpu_set_current_task(j->task);
+
+		res = _starpu_opencl_execute_job(j, args);
+
+		_starpu_set_current_task(NULL);
+
+                if (res) {
+			switch (res) {
+				case -EAGAIN:
+					fprintf(stderr, "ouch, put the codelet %p back ... \n", j);
+					_starpu_push_task(j);
+					STARPU_ABORT();
+					continue;
+				default:
+					assert(0);
+			}
+		}
+
+		_starpu_handle_job_termination(j);
+	}
+
+	STARPU_TRACE_WORKER_DEINIT_START
+
+          _starpu_opencl_deinit_context(devid);
+
+#ifdef DATA_STATS
+	fprintf(stderr, "OpenCL #%d computation %le comm %le (%lf \%%)\n", args->id, args->jobq->total_computation_time, args->jobq->total_communication_time, args->jobq->total_communication_time*100.0/args->jobq->total_computation_time);
+#endif
+
+#ifdef STARPU_VERBOSE
+	double ratio = 0;
+	if (args->jobq->total_job_performed != 0)
+	{
+		ratio = args->jobq->total_computation_time_error/args->jobq->total_computation_time;
+	}
+
+
+	_starpu_print_to_logfile("MODEL ERROR: OpenCL %d ERROR %lf EXEC %lf RATIO %lf NTASKS %d\n", args->devid, args->jobq->total_computation_time_error, args->jobq->total_computation_time, ratio, args->jobq->total_job_performed);
+#endif
+
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
+{
+	int err;
+
+        if (!init_done) {
+                _starpu_opencl_init();
+        }
+
+	// Get device name
+	err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	_STARPU_OPENCL_DEBUG("Device %d : [%s]\n", dev, name);
+	return EXIT_SUCCESS;
+}
+
+unsigned _starpu_opencl_get_device_count(void)
+{
+        if (!init_done) {
+                _starpu_opencl_init();
+        }
+	return nb_devices;
+}
+
+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args)
+{
+	int ret;
+//	uint32_t mask = (1<<0);
+	uint32_t mask = 0;
+
+	STARPU_ASSERT(j);
+	struct starpu_task *task = j->task;
+
+	starpu_tick_t codelet_start, codelet_end;
+	starpu_tick_t codelet_start_comm, codelet_end_comm;
+
+	unsigned calibrate_model = 0;
+
+	STARPU_ASSERT(task);
+	struct starpu_codelet_t *cl = task->cl;
+	STARPU_ASSERT(cl);
+
+	if (cl->model && cl->model->benchmarking)
+		calibrate_model = 1;
+
+	/* we do not take communication into account when modeling the performance */
+	if (STARPU_BENCHMARK_COMM)
+	{
+                //barrier(CLK_GLOBAL_MEM_FENCE);
+		STARPU_GET_TICK(codelet_start_comm);
+	}
+
+	ret = _starpu_fetch_task_input(task, mask);
+	if (ret != 0) {
+		/* there was not enough memory, so the input of
+		 * the codelet cannot be fetched ... put the
+		 * codelet back, and try it later */
+		return -EAGAIN;
+	}
+
+	if (calibrate_model || STARPU_BENCHMARK_COMM)
+	{
+                //barrier(CLK_GLOBAL_MEM_FENCE);
+		STARPU_GET_TICK(codelet_end_comm);
+	}
+
+	STARPU_TRACE_START_CODELET_BODY(j);
+
+	args->status = STATUS_EXECUTING;
+	cl_func func = cl->opencl_func;
+	STARPU_ASSERT(func);
+	STARPU_GET_TICK(codelet_start);
+	func(task->interface, task->cl_arg);
+
+	cl->per_worker_stats[args->workerid]++;
+
+	STARPU_GET_TICK(codelet_end);
+
+	args->status = STATUS_UNKNOWN;
+
+	STARPU_TRACE_END_CODELET_BODY(j);
+
+	if (calibrate_model || STARPU_BENCHMARK_COMM)
+	{
+		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
+		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
+
+		args->jobq->total_computation_time += measured;
+		args->jobq->total_communication_time += measured_comm;
+
+		double error;
+		error = fabs(STARPU_MAX(measured, 0.0) - STARPU_MAX(j->predicted, 0.0));
+		args->jobq->total_computation_time_error += error;
+
+		if (calibrate_model)
+			_starpu_update_perfmodel_history(j, args->perf_arch, (unsigned)args->devid, measured);
+	}
+
+	args->jobq->total_job_performed++;
+
+	_starpu_push_task_output(task, mask);
+
+	return EXIT_SUCCESS;
+}

+ 57 - 0
src/drivers/opencl/driver_opencl.h

@@ -0,0 +1,57 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_OPENCL_H__
+#define __DRIVER_OPENCL_H__
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <CL/cl.h>
+
+extern
+int _starpu_opencl_init_context(int devid);
+
+extern
+int _starpu_opencl_deinit_context(int devid);
+
+extern
+unsigned _starpu_opencl_get_device_count(void);
+
+extern
+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
+
+extern
+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
+
+extern
+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
+
+extern
+void _starpu_opencl_init(void);
+
+extern
+void *_starpu_opencl_worker(void *);
+
+extern
+int _starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue,
+                               char *program_name, char *kernel_name, int dev);
+
+extern
+int _starpu_opencl_compile_source_to_opencl(char *source_file_name);
+
+#endif //  __DRIVER_OPENCL_H__

+ 430 - 0
src/drivers/opencl/driver_opencl_utils.c

@@ -0,0 +1,430 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <starpu_opencl.h>
+#include <common/list.h>
+#include <common/htable32.h>
+#include <core/workers.h>
+#include "driver_opencl_utils.h"
+#include "driver_opencl.h"
+
+#define CRC32C_POLY_BE 0x1EDC6F41
+
+static
+inline uint32_t __attribute__ ((pure)) crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
+{
+	unsigned i;
+	uint32_t crc;
+
+	crc = inputcrc ^ (inputbyte << 24);
+	for (i = 0; i < 8; i++)
+		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE : 0);
+
+	return crc;
+}
+
+static
+uint32_t crc32_string(char *str)
+{
+	uint32_t hash = 0;
+
+	size_t len = strlen(str);
+
+	unsigned i;
+	for (i = 0; i < len; i++)
+	{
+		hash = crc32_be_8((uint8_t)str[i], hash);
+	}
+
+	return hash;
+}
+
+static
+cl_uint _starpu_opencl_device_uniqueid(cl_device_id id)
+{
+	char name[1024];
+	cl_int  err;
+
+	err = clGetDeviceInfo(id, CL_DEVICE_NAME, 1024, name, NULL);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	//  fprintf(stderr, "name %s\n", name);
+
+	return crc32_string(name);
+}
+
+char *_starpu_opencl_codelet_dir;
+
+static
+int _starpu_opencl_locate_file(char *source_file_name, char *located_file_name) {
+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", source_file_name);
+        if (access(source_file_name, R_OK) == 0) {
+                strcpy(located_file_name, source_file_name);
+                return EXIT_SUCCESS;
+        }
+        if (_starpu_opencl_codelet_dir) {
+                sprintf(located_file_name, "%s/%s", _starpu_opencl_codelet_dir, source_file_name);
+                _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
+                if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
+        }
+        sprintf(located_file_name, "%s/%s", STARPU_OPENCL_DATADIR, source_file_name);
+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
+        sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
+
+        strcpy(located_file_name, "");
+        OPENCL_ERROR("Cannot locate file <%s>\n", source_file_name);
+        return EXIT_FAILURE;
+}
+
+static
+unsigned char *_starpu_opencl_load_program_binary(char *filename, size_t *len)
+{
+	struct stat statbuf;
+	FILE        *fh;
+	unsigned char        *binary;
+
+	fh = fopen(filename, "r");
+	if (fh == 0)
+		return EXIT_SUCCESS;
+
+	stat(filename, &statbuf);
+
+	binary = (unsigned char *) malloc(statbuf.st_size);
+	if(!binary)
+		return binary;
+
+	fread(binary, statbuf.st_size, 1, fh);
+
+	*len = statbuf.st_size;
+	return binary;
+}
+
+static
+cl_int _starpu_opencl_load_program(cl_context context, char *program_name, cl_device_id device, cl_program *program)
+{
+        //	cl_program     program;
+        const unsigned char *binary;
+	size_t         len;
+	cl_int         err;
+	cl_int         status;
+        cl_device_type type;
+
+	cl_uint uniqueid;
+	char     located_program_name[1024];
+	char     binary_file_name[1024];
+	char    *p;
+
+        // locate file
+        _starpu_opencl_locate_file(program_name, located_program_name);
+
+        // Get type of device
+        err = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
+	if (err != CL_SUCCESS) {
+                STARPU_OPENCL_REPORT_ERROR(err);
+                return err;
+        }
+
+        // Get the name of the binary file
+	uniqueid = _starpu_opencl_device_uniqueid(device);
+	strcpy(binary_file_name, located_program_name);
+	p = strstr(binary_file_name, ".cl");
+	if(p == NULL) OPENCL_ERROR("Program file name doesn't have the '.cl' extension!\n");
+        strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
+	sprintf(p + strlen(p), "%u", uniqueid);
+
+        // Load the binary file
+	binary = _starpu_opencl_load_program_binary(binary_file_name, &len);
+	if(binary == NULL)
+		OPENCL_ERROR("Cannot load binary file %s\n", binary_file_name);
+
+	//_STARPU_OPENCL_DEBUG("[%s] binary file loaded.\n", binary_file_name);
+	*program = clCreateProgramWithBinary(context, 1, &device, &len, &binary, &status, &err);
+	if (err != CL_SUCCESS) {
+                STARPU_OPENCL_REPORT_ERROR(err);
+                return err;
+        }
+
+	// Build the program executable
+	err = clBuildProgram(*program, 0, NULL, NULL, NULL, NULL);
+	if (err != CL_SUCCESS) {
+		size_t len;
+		char buffer[2048];
+
+		fprintf(stderr, "Error: Failed to build program executable!\n");
+		clGetProgramBuildInfo(*program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+		fprintf(stderr, "%s\n", buffer);
+                return err;
+	}
+
+	return CL_SUCCESS;
+}
+
+static struct starpu_htbl32_node_s *history_program_hash[STARPU_MAXOPENCLDEVS] = {NULL};
+LIST_TYPE(program,
+          char *program_name;
+          cl_program program;
+          );
+program_list_t history_program_list[STARPU_MAXOPENCLDEVS];
+
+int _starpu_opencl_init_programs(int dev)
+{
+        history_program_list[dev] = program_list_new();
+        return CL_SUCCESS;
+}
+
+int _starpu_opencl_release_programs(int dev)
+{
+        while (!program_list_empty(history_program_list[dev])) {
+                program_t pp = program_list_pop_front(history_program_list[dev]);
+                _STARPU_OPENCL_DEBUG("Releasing program=<%s> on dev=<%d>\n", pp->program_name, dev);
+                clReleaseProgram(pp->program);
+        }
+        program_list_delete(history_program_list[dev]);
+        return CL_SUCCESS;
+}
+
+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid)
+{
+        int err;
+	cl_device_id device;
+        cl_context context;
+        uint32_t key;
+        cl_program program;
+
+        starpu_opencl_get_device(devid, &device);
+        starpu_opencl_get_context(devid, &context);
+        starpu_opencl_get_queue(devid, queue);
+
+        key = crc32_string(program_name);
+        program = _starpu_htbl_search_32(history_program_hash[devid], key);
+        if (!program) {
+                err = _starpu_opencl_load_program(context, program_name, device, &program);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                _starpu_htbl_insert_32(&(history_program_hash[devid]), key, program);
+                program_t pp = program_new();
+                pp->program_name = program_name;
+                pp->program = program;
+                program_list_push_front(history_program_list[devid], pp);
+        }
+
+        // Create the compute kernel in the program we wish to run
+        *kernel = clCreateKernel(program, kernel_name, &err);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	return CL_SUCCESS;
+}
+
+int starpu_opencl_release(cl_kernel kernel) {
+	cl_int err;
+
+	err = clReleaseKernel(kernel);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        return CL_SUCCESS;
+}
+
+static
+char *_starpu_opencl_load_program_source(const char *filename)
+{
+        struct stat statbuf;
+        FILE        *fh;
+        char        *source;
+
+        fh = fopen(filename, "r");
+        if (fh == 0)
+                return EXIT_SUCCESS;
+
+        stat(filename, &statbuf);
+        source = (char *) malloc(statbuf.st_size + 1);
+        fread(source, statbuf.st_size, 1, fh);
+        source[statbuf.st_size] = '\0';
+
+        fclose(fh);
+
+        return source;
+}
+
+static
+int _starpu_opencl_store_program_binary(const char *filename, const char *binary, size_t len)
+{
+        FILE *fh;
+
+        fh = fopen(filename, "w");
+        if(fh == NULL) {
+                perror("fopen"); return EXIT_FAILURE;
+        }
+
+        fwrite(binary, len, 1, fh);
+        fclose(fh);
+
+        return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_compile_source_to_opencl(char *source_file_name)
+{
+        int              err;
+        int              device_type = CL_DEVICE_TYPE_ALL;
+        cl_device_id     devices[STARPU_MAXOPENCLDEVS];
+        unsigned         max = STARPU_MAXOPENCLDEVS;
+        unsigned         nb_devices = 0;
+        cl_uint          history[STARPU_MAXOPENCLDEVS]; // To track similar devices
+        char             preproc_file_name[1024];
+        char             located_file_name[1024];
+        cl_platform_id   platform_ids[STARPU_OPENCL_PLATFORM_MAX];
+        cl_uint          platform, nb_platforms;
+
+        // Locate source file
+        _starpu_opencl_locate_file(source_file_name, located_file_name);
+        _STARPU_OPENCL_DEBUG("Source file name : <%s>\n", located_file_name);
+
+        // Prepare preprocessor temporary filename
+        {
+                char *p;
+                strcpy(preproc_file_name, located_file_name);
+                p = strstr(preproc_file_name, ".cl");
+                if(p == NULL)
+                        OPENCL_ERROR("Kernel file name doesn't have the '.cl' extension!\n");
+                strcpy(p, ".pre");
+        }
+
+        // Get Platforms
+        err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_ids, &nb_platforms);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        // Iterate over each platform
+        for(platform=0; platform<nb_platforms; platform++) {
+                // Get devices
+                err = clGetDeviceIDs(platform_ids[platform], device_type, max, devices, &nb_devices);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                if(nb_devices > max)
+                        nb_devices = max;
+
+                // Iterate over each device
+                unsigned int dev;
+                for(dev = 0; dev < nb_devices; dev ++) {
+                        cl_context       context;
+                        cl_program       program;
+                        cl_device_type   type;
+                        cl_int           err;
+                        cl_uint          uniqueid;
+
+                        uniqueid =_starpu_opencl_device_uniqueid(devices[dev]);
+                        // Look up and update history (to avoid unuseful compilations in the case of identical devices)
+                        {
+                                unsigned int d;
+                                for(d = 0; d < dev; d++)
+                                        if(history[d] == uniqueid)
+                                                break; // Just skip compiling for this device
+                                if(d != dev)
+                                        continue;
+                                history[dev] = uniqueid;
+                        }
+
+                        err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
+                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                        // Create a compute context
+                        context = clCreateContext(0, 1, devices + dev, NULL, NULL, &err);
+                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                        // Run C preprocessor
+                        {
+                                pid_t pid;
+                                pid = fork();
+                                if(pid == 0) {
+                                        execlp("cpp", "cpp", located_file_name, "-o", preproc_file_name, NULL);
+                                        perror("execlp");
+                                        exit(EXIT_FAILURE);
+                                }
+                                else {
+                                        int status;
+                                        waitpid(pid, &status, 0);
+                                        if (WEXITSTATUS(status) != EXIT_SUCCESS)
+                                                OPENCL_ERROR("Cannot preprocess file [%s]\n", located_file_name);
+                                }
+                        }
+
+                        // Load the compute program from disk into a cstring buffer
+                        char *source = _starpu_opencl_load_program_source(preproc_file_name);
+                        if(!source)
+                                OPENCL_ERROR("Failed to load compute program from file <%s>!\n", preproc_file_name);
+
+                        // Create the compute program from the source buffer
+                        program = clCreateProgramWithSource(context, 1, (const char **) & source, NULL, &err);
+                        if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                        // Build the program executable
+                        err = clBuildProgram(program, 1, devices + dev, "-Werror -cl-mad-enable", NULL, NULL);
+                        if (err != CL_SUCCESS) {
+                                size_t len;
+                                static char buffer[4096];
+
+                                fprintf(stderr, "Error: Failed to build program executable!\n");
+                                clGetProgramBuildInfo(program, devices[dev], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+
+                                fprintf(stderr, "%s\n", buffer);
+                                return EXIT_FAILURE;
+                        }
+
+                        // Store program binary
+                        {
+                                char     binary_file_name[1024];
+                                char    *binary;
+                                size_t   binary_len;
+                                char    *p;
+
+                                strcpy(binary_file_name, located_file_name);
+                                p = strstr(binary_file_name, ".cl");
+                                if(p == NULL)
+                                        OPENCL_ERROR("Input file name doesn't have the '.cl' extension!\n");
+
+                                strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
+                                sprintf(p + strlen(p), "%u", uniqueid);
+
+                                err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
+                                if(err != CL_SUCCESS)
+                                        OPENCL_ERROR("Cannot get program binary size (err = %d)!\n", err);
+
+                                binary = malloc(binary_len);
+
+                                err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
+                                if(err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                                _starpu_opencl_store_program_binary(binary_file_name, binary, binary_len);
+
+                                free(binary);
+
+                                _STARPU_OPENCL_DEBUG("Binary file [%s] successfully built (%ld bytes).\n", binary_file_name, binary_len);
+                        }
+
+                        clReleaseProgram(program);
+                        clReleaseContext(context);
+                }
+        }
+
+        return EXIT_SUCCESS;
+}

+ 41 - 0
src/drivers/opencl/driver_opencl_utils.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_OPENCL_UTILS_H__
+#define __STARPU_OPENCL_UTILS_H__
+
+#include <config.h>
+
+#ifdef STARPU_VERBOSE
+#  define _STARPU_OPENCL_DEBUG(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
+#else
+#  define _STARPU_OPENCL_DEBUG(fmt, args ...)
+#endif
+
+#define _STARPU_OPENCL_DISP(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
+
+#define OPENCL_ERROR(fmt, args ...)                                                   \
+	do {                                                                          \
+                fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args); \
+		assert(0);                                                            \
+	} while (0)
+
+#define STARPU_OPENCL_PLATFORM_MAX 4
+
+int _starpu_opencl_init_programs(int dev);
+int _starpu_opencl_release_programs(int dev);
+
+#endif /* __STARPU_OPENCL_UTILS_H__ */

+ 1 - 0
src/util/execute_on_all.c

@@ -39,6 +39,7 @@ void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t whe
 		.where = where,
 		.where = where,
 		.cuda_func = wrapper_func,
 		.cuda_func = wrapper_func,
 		.cpu_func = wrapper_func,
 		.cpu_func = wrapper_func,
+		.opencl_func = wrapper_func,
 		/* XXX we do not handle Cell .. */
 		/* XXX we do not handle Cell .. */
 		.nbuffers = 0,
 		.nbuffers = 0,
 		.model = NULL
 		.model = NULL

+ 93 - 16
src/util/malloc.c

@@ -24,13 +24,28 @@
 #include <cuda.h>
 #include <cuda.h>
 #endif
 #endif
 
 
-#ifdef STARPU_USE_CUDA
+#ifdef STARPU_USE_OPENCL
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 struct malloc_pinned_codelet_struct {
 struct malloc_pinned_codelet_struct {
 	void **ptr;
 	void **ptr;
 	size_t dim;
 	size_t dim;
 };
 };
+#endif
+
+//#ifdef STARPU_USE_OPENCL
+//static void malloc_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
+//{
+//	struct malloc_pinned_codelet_struct *s = arg;
+//        //        *(s->ptr) = malloc(s->dim);
+//        _starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
+//}
+//#endif
 
 
-static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
+#ifdef STARPU_USE_CUDA
+static void malloc_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
 {
 {
 	struct malloc_pinned_codelet_struct *s = arg;
 	struct malloc_pinned_codelet_struct *s = arg;
 
 
@@ -39,10 +54,16 @@ static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 }
+#endif
 
 
+#if defined(STARPU_USE_CUDA)// || defined(STARPU_USE_OPENCL)
 static starpu_codelet malloc_pinned_cl = {
 static starpu_codelet malloc_pinned_cl = {
-	.where = STARPU_CUDA,
-	.cuda_func = malloc_pinned_codelet,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = malloc_pinned_cuda_codelet,
+#endif
+//#ifdef STARPU_USE_OPENCL
+//	.opencl_func = malloc_pinned_opencl_codelet,
+//#endif
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };
@@ -59,24 +80,47 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 	{
 	{
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		int push_res;
 		int push_res;
-	
+
 		struct malloc_pinned_codelet_struct s = {
 		struct malloc_pinned_codelet_struct s = {
 			.ptr = A,
 			.ptr = A,
 			.dim = dim
 			.dim = dim
-		};	
-	
+		};
+
+                malloc_pinned_cl.where = STARPU_CUDA;
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
-			task->callback_func = NULL; 
+			task->callback_func = NULL;
 			task->cl = &malloc_pinned_cl;
 			task->cl = &malloc_pinned_cl;
 			task->cl_arg = &s;
 			task->cl_arg = &s;
 
 
 		task->synchronous = 1;
 		task->synchronous = 1;
-	
+
 		push_res = starpu_submit_task(task);
 		push_res = starpu_submit_task(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 		STARPU_ASSERT(push_res != -ENODEV);
 #endif
 #endif
 	}
 	}
-	else {
+//	else if (_starpu_may_submit_opencl_task())
+//	{
+//#ifdef STARPU_USE_OPENCL
+//		int push_res;
+//
+//		struct malloc_pinned_codelet_struct s = {
+//			.ptr = A,
+//			.dim = dim
+//		};
+//
+//                malloc_pinned_cl.where = STARPU_OPENCL;
+//		struct starpu_task *task = starpu_task_create();
+//			task->callback_func = NULL;
+//			task->cl = &malloc_pinned_cl;
+//			task->cl_arg = &s;
+//
+//		task->synchronous = 1;
+//
+//		push_res = starpu_submit_task(task);
+//		STARPU_ASSERT(push_res != -ENODEV);
+//#endif
+//        }
+        else {
 		*A = malloc(dim);
 		*A = malloc(dim);
 	}
 	}
 
 
@@ -86,17 +130,32 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void free_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
+static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
 {
 {
 	cudaError_t cures;
 	cudaError_t cures;
 	cures = cudaFreeHost(arg);
 	cures = cudaFreeHost(arg);
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 }
+#endif
+
+//#ifdef STARPU_USE_OPENCL
+//static void free_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
+//{
+//        //        free(arg);
+//        int err = clReleaseMemObject(arg);
+//        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+//}
+//#endif
 
 
+#if defined(STARPU_USE_CUDA) // || defined(STARPU_USE_OPENCL)
 static starpu_codelet free_pinned_cl = {
 static starpu_codelet free_pinned_cl = {
-	.where = STARPU_CUDA,
-	.cuda_func = free_pinned_codelet,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = free_pinned_cuda_codelet,
+#endif
+//#ifdef STARPU_USE_OPENCL
+//	.opencl_func = free_pinned_opencl_codelet,
+//#endif
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };
@@ -111,18 +170,36 @@ int starpu_free_pinned_if_possible(void *A)
 	{
 	{
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		int push_res;
 		int push_res;
-	
+
+                free_pinned_cl.where = STARPU_CUDA;
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
-			task->callback_func = NULL; 
+			task->callback_func = NULL;
 			task->cl = &free_pinned_cl;
 			task->cl = &free_pinned_cl;
 			task->cl_arg = A;
 			task->cl_arg = A;
 
 
 		task->synchronous = 1;
 		task->synchronous = 1;
-	
+
 		push_res = starpu_submit_task(task);
 		push_res = starpu_submit_task(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 		STARPU_ASSERT(push_res != -ENODEV);
 #endif
 #endif
 	}
 	}
+//	else if (_starpu_may_submit_opencl_task())
+//	{
+//#ifdef STARPU_USE_OPENCL
+//		int push_res;
+//
+//                free_pinned_cl.where = STARPU_OPENCL;
+//		struct starpu_task *task = starpu_task_create();
+//			task->callback_func = NULL;
+//			task->cl = &free_pinned_cl;
+//			task->cl_arg = A;
+//
+//		task->synchronous = 1;
+//
+//		push_res = starpu_submit_task(task);
+//		STARPU_ASSERT(push_res != -ENODEV);
+//#endif
+//	}
 	else {
 	else {
 		free(A);
 		free(A);
 	}
 	}

+ 6 - 1
tests/Makefile.am

@@ -33,7 +33,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 NVCC ?= nvcc
 
 
-NVCCFLAGS += -I$(top_srcdir)/include/
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
 
 
 .cu.cubin:
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
 	$(MKDIR_P) `dirname $@`
@@ -246,6 +246,11 @@ datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_kernels.cu
 	datawizard/sync_and_notify_data_kernels.cu
 endif
 endif
 
 
+if STARPU_USE_OPENCL
+datawizard_sync_and_notify_data_SOURCES +=	\
+	datawizard/sync_and_notify_data_opencl.c
+endif
+
 if STARPU_USE_GORDON
 if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c
 	datawizard/sync_and_notify_data_gordon_kernels.c

+ 2 - 1
tests/core/empty_task_sync_point.c

@@ -34,9 +34,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/core/execute_on_a_specific_worker.c

@@ -71,9 +71,10 @@ static starpu_access_mode select_random_mode(void)
 
 
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = codelet_null,
 	.cpu_func = codelet_null,
 	.cuda_func = codelet_null,
 	.cuda_func = codelet_null,
+        .opencl_func = codelet_null,
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 2 - 1
tests/core/multithreaded.c

@@ -33,9 +33,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/core/regenerate.c

@@ -51,9 +51,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 5 - 2
tests/core/starpu_wait_all_tasks.c

@@ -28,9 +28,10 @@ static void dummy_func(void *descr[], void *arg)
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 	.gordon_func = 0, /* this will be defined later */
 #endif
 #endif
@@ -70,9 +71,11 @@ static struct starpu_conf conf = {
 	.sched_policy_name = NULL,
 	.sched_policy_name = NULL,
 	.ncpus = -1,
 	.ncpus = -1,
 	.ncuda = -1,
 	.ncuda = -1,
+        .nopencl = -1,
 	.nspus = -1,
 	.nspus = -1,
 	.use_explicit_workers_bindid = 0,
 	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_gpuid = 0,
+	.use_explicit_workers_cuda_gpuid = 0,
+	.use_explicit_workers_opencl_gpuid = 0,
 	.calibrate = 0
 	.calibrate = 0
 };
 };
 
 

+ 4 - 2
tests/core/starpu_wait_task.c

@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };
@@ -86,7 +87,8 @@ int main(int argc, char **argv)
 		int ret = starpu_submit_task(task);
 		int ret = starpu_submit_task(task);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 
 
-		starpu_wait_task(task);
+		ret = starpu_wait_task(task);
+		STARPU_ASSERT(!ret);
 
 
 		starpu_task_destroy(task);
 		starpu_task_destroy(task);
 	}
 	}

+ 3 - 2
tests/core/static_restartable.c

@@ -28,10 +28,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
-	.model = NULL,
+	.opencl_func = dummy_func,
+        .model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };
 
 

+ 2 - 1
tests/core/static_restartable_tag.c

@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/core/static_restartable_using_initializer.c

@@ -31,9 +31,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/core/subgraph_repeat.c

@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/core/subgraph_repeat_regenerate.c

@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/core/tag_wait_api.c

@@ -26,9 +26,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 3 - 2
tests/core/task_wait_api.c

@@ -26,10 +26,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
-	.model = NULL,
+	.opencl_func = dummy_func,
+        .model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };
 
 

+ 2 - 1
tests/datawizard/dining_philosophers.c

@@ -27,9 +27,10 @@ static void eat_kernel(void *descr[], void *arg)
 }
 }
 
 
 static starpu_codelet eating_cl = {
 static starpu_codelet eating_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cuda_func = eat_kernel,
 	.cuda_func = eat_kernel,
 	.cpu_func = eat_kernel,
 	.cpu_func = eat_kernel,
+        .opencl_func = eat_kernel,
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 

+ 6 - 1
tests/datawizard/dsm_stress.c

@@ -54,6 +54,10 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg
 {
 {
 }
 }
 
 
+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
+{
+}
+
 static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
 }
 }
@@ -74,9 +78,10 @@ static starpu_access_mode select_random_mode(void)
 
 
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = cpu_codelet_null,
 	.cpu_func = cpu_codelet_null,
 	.cuda_func = cuda_codelet_null,
 	.cuda_func = cuda_codelet_null,
+        .opencl_func = opencl_codelet_null,
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 

+ 2 - 1
tests/datawizard/readers_and_writers.c

@@ -24,9 +24,10 @@ static void dummy_kernel(void *descr[], void *arg)
 }
 }
 
 
 static starpu_codelet rw_cl = {
 static starpu_codelet rw_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cuda_func = dummy_kernel,
 	.cuda_func = dummy_kernel,
 	.cpu_func = dummy_kernel,
 	.cpu_func = dummy_kernel,
+	.opencl_func = dummy_kernel,
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 19 - 4
tests/datawizard/sync_and_notify_data.c

@@ -46,6 +46,11 @@ void cuda_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
 void cuda_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
 void cuda_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 #endif
 
 
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
 #define VECTORSIZE	16
 #define VECTORSIZE	16
 
 
 starpu_data_handle v_handle;
 starpu_data_handle v_handle;
@@ -79,8 +84,12 @@ int main(int argc, char **argv)
 
 
 	fprintf(stderr, "kernel incA %d incC %d elf %d\n", kernel_incA_id, kernel_incC_id, elf_id);
 	fprintf(stderr, "kernel incA %d incC %d elf %d\n", kernel_incA_id, kernel_incC_id, elf_id);
 #endif
 #endif
-	
-	starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
+
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("tests/datawizard/sync_and_notify_data_opencl_codelet.cl");
+#endif
+
+        starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
 
 
 	unsigned iter;
 	unsigned iter;
 	for (iter = 0; iter < K; iter++)
 	for (iter = 0; iter < K; iter++)
@@ -91,11 +100,14 @@ int main(int argc, char **argv)
 		{
 		{
 			/* increment a = v[0] */
 			/* increment a = v[0] */
 			starpu_codelet cl_inc_a = {
 			starpu_codelet cl_inc_a = {
-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 				.cpu_func = cpu_codelet_incA,
 				.cpu_func = cpu_codelet_incA,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 				.cuda_func = cuda_codelet_incA,
 				.cuda_func = cuda_codelet_incA,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+				.opencl_func = opencl_codelet_incA,
+#endif
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 				.gordon_func = kernel_incA_id,
 				.gordon_func = kernel_incA_id,
 #endif
 #endif
@@ -127,11 +139,14 @@ int main(int argc, char **argv)
 		{
 		{
 			/* increment c = v[2] */
 			/* increment c = v[2] */
 			starpu_codelet cl_inc_c = {
 			starpu_codelet cl_inc_c = {
-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 				.cpu_func = cpu_codelet_incC,
 				.cpu_func = cpu_codelet_incC,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 				.cuda_func = cuda_codelet_incC,
 				.cuda_func = cuda_codelet_incC,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+				.opencl_func = opencl_codelet_incC,
+#endif
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 				.gordon_func = kernel_incC_id,
 				.gordon_func = kernel_incC_id,
 #endif
 #endif

+ 77 - 0
tests/datawizard/sync_and_notify_data_opencl.c

@@ -0,0 +1,77 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <CL/cl.h>
+
+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        unsigned *val = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+	id = starpu_get_worker_id();
+	devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incA", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=100;
+		size_t local=100;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}
+
+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *val = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+	id = starpu_get_worker_id();
+	devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incC", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=100;
+		size_t local=100;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 30 - 0
tests/datawizard/sync_and_notify_data_opencl_codelet.cl

@@ -0,0 +1,30 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void incA(__global unsigned* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 0)
+		input[i] ++;
+}
+
+__kernel void incC(__global unsigned* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 2)
+		input[i] ++;
+}
+

+ 4 - 1
tests/datawizard/sync_with_data_with_mem.c

@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 }
 }
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_codelet,
 	.cpu_func = dummy_codelet,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 	.cuda_func = dummy_codelet,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+        .opencl_func = dummy_codelet,
+#endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 4 - 1
tests/datawizard/sync_with_data_with_mem_non_blocking.c

@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 }
 }
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_codelet,
 	.cpu_func = dummy_codelet,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 	.cuda_func = dummy_codelet,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = dummy_codelet,
+#endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 4 - 1
tests/datawizard/unpartition.c

@@ -32,11 +32,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 }
 }
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_codelet,
 	.cpu_func = dummy_codelet,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 	.cuda_func = dummy_codelet,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = dummy_codelet,
+#endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 19 - 1
tests/datawizard/write_only_tmp_buffer.c

@@ -24,6 +24,21 @@
 
 
 starpu_data_handle v_handle;
 starpu_data_handle v_handle;
 
 
+#ifdef STARPU_USE_OPENCL
+#include <CL/cl.h>
+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	cl_mem buf = (cl_mem)STARPU_GET_VECTOR_PTR(descr[0]);
+        char ptr = 42;
+        cl_command_queue queue;
+        int id = starpu_get_worker_id();
+        int devid = starpu_get_worker_devid(id);
+
+        starpu_opencl_get_queue(devid, &queue);
+        clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, sizeof(char), &ptr, 0, NULL, NULL);
+}
+#endif
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
@@ -51,11 +66,14 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 }
 }
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = cpu_codelet_null,
 	.cpu_func = cpu_codelet_null,
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_func = cuda_codelet_null,
 	.cuda_func = cuda_codelet_null,
 #endif
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = opencl_codelet_null,
+#endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 2 - 1
tests/errorcheck/invalid_blocking_calls.c

@@ -38,9 +38,10 @@ static void wrong_func(void *descr[], void *arg)
 
 
 static starpu_codelet wrong_codelet = 
 static starpu_codelet wrong_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = wrong_func,
 	.cpu_func = wrong_func,
 	.cuda_func = wrong_func,
 	.cuda_func = wrong_func,
+        .opencl_func = wrong_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 3 - 1
tests/errorcheck/starpu_init_noworker.c

@@ -29,9 +29,11 @@ int main(int argc, char **argv)
 		.sched_policy_name = NULL, /* default */
 		.sched_policy_name = NULL, /* default */
 		.ncpus = 0,
 		.ncpus = 0,
 		.ncuda = 0,
 		.ncuda = 0,
+                .nopencl = 0,
 		.nspus = 0,
 		.nspus = 0,
 		.use_explicit_workers_bindid = 0,
 		.use_explicit_workers_bindid = 0,
-		.use_explicit_workers_gpuid = 0,
+		.use_explicit_workers_cuda_gpuid = 0,
+		.use_explicit_workers_opencl_gpuid = 0,
 		.calibrate = 0
 		.calibrate = 0
 	};
 	};
 
 

+ 3 - 1
tests/helper/execute_on_all.c

@@ -32,12 +32,14 @@ int main(int argc, char **argv)
 
 
 	int arg = 0x42;
 	int arg = 0x42;
 
 
-	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA);
+	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA|STARPU_OPENCL);
 
 
 	starpu_execute_on_each_worker(func, &arg, STARPU_CPU);
 	starpu_execute_on_each_worker(func, &arg, STARPU_CPU);
 	
 	
 	starpu_execute_on_each_worker(func, &arg, STARPU_CUDA);
 	starpu_execute_on_each_worker(func, &arg, STARPU_CUDA);
 
 
+        starpu_execute_on_each_worker(func, &arg, STARPU_OPENCL);
+
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 2 - 1
tests/helper/starpu_create_sync_task.c

@@ -25,9 +25,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet =
 static starpu_codelet dummy_codelet =
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+        .opencl_func = dummy_func,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };
 
 

+ 5 - 2
tests/microbenchs/async_tasks_overhead.c

@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+        .opencl_func = dummy_func,
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 	.gordon_func = 0, /* this will be defined later */
 #endif
 #endif
@@ -91,9 +92,11 @@ static struct starpu_conf conf = {
 	.sched_policy_name = NULL,
 	.sched_policy_name = NULL,
 	.ncpus = -1,
 	.ncpus = -1,
 	.ncuda = -1,
 	.ncuda = -1,
+        .nopencl = -1,
 	.nspus = -1,
 	.nspus = -1,
 	.use_explicit_workers_bindid = 0,
 	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_gpuid = 0,
+	.use_explicit_workers_cuda_gpuid = 0,
+	.use_explicit_workers_opencl_gpuid = 0,
 	.calibrate = 0
 	.calibrate = 0
 };
 };
 
 

+ 2 - 1
tests/microbenchs/prefetch_data_on_node.c

@@ -75,9 +75,10 @@ static starpu_access_mode select_random_mode(void)
 
 
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = codelet_null,
 	.cpu_func = codelet_null,
 	.cuda_func = codelet_null,
 	.cuda_func = codelet_null,
+	.opencl_func = codelet_null,
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 7 - 2
tests/microbenchs/redundant_buffer.c

@@ -27,6 +27,10 @@
 starpu_data_handle v_handle;
 starpu_data_handle v_handle;
 static unsigned *v;
 static unsigned *v;
 
 
+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
+{
+}
+
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
 }
 }
@@ -36,10 +40,11 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 }
 }
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = cpu_codelet_null,
 	.cpu_func = cpu_codelet_null,
 	.cuda_func = cuda_codelet_null,
 	.cuda_func = cuda_codelet_null,
-	.nbuffers = 2
+	.opencl_func = opencl_codelet_null,
+        .nbuffers = 2
 };
 };
 
 
 
 

+ 2 - 1
tests/microbenchs/sync_tasks_overhead.c

@@ -28,9 +28,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+        .opencl_func = dummy_func,
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 	.gordon_func = 0, /* this will be defined later */
 #endif
 #endif

+ 2 - 1
tests/microbenchs/tasks_overhead.c

@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 
 static starpu_codelet dummy_codelet = 
 static starpu_codelet dummy_codelet = 
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.model = NULL,
 	.nbuffers = 0
 	.nbuffers = 0
 };
 };

+ 2 - 1
tests/overlap/overlap.c

@@ -59,9 +59,10 @@ static struct starpu_perfmodel_t model = {
 };
 };
 
 
 static starpu_codelet cl = {
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = codelet_sleep,
 	.cpu_func = codelet_sleep,
 	.cuda_func = codelet_sleep,
 	.cuda_func = codelet_sleep,
+        .opencl_func = codelet_sleep,
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model =  &model
 	.model =  &model
 };
 };