소스 검색

merge branches/opencl --> trunk

Nathalie Furmento 15 년 전
부모
커밋
59c29c44ec
90개의 변경된 파일4205개의 추가작업 그리고 192개의 파일을 삭제
  1. 1 0
      Makefile.am
  2. 79 6
      configure.ac
  3. 13 1
      doc/starpu.texi
  4. 91 5
      examples/Makefile.am
  5. 167 0
      examples/block/block.c
  6. 34 0
      examples/block/block_cuda.cu
  7. 23 0
      examples/block/block_kernel.cl
  8. 1 1
      examples/heat/Makefile.in
  9. 23 10
      examples/incrementer/incrementer.c
  10. 49 0
      examples/incrementer/incrementer_kernels_opencl.c
  11. 23 0
      examples/incrementer/incrementer_kernels_opencl_codelet.cl
  12. 193 0
      examples/matvecmult/matvecmult.c
  13. 48 0
      examples/matvecmult/matvecmult_kernel.cl
  14. 2 2
      examples/ppm_downscaler/yuv_downscaler.c
  15. 76 2
      examples/spmv/dw_spmv.c
  16. 0 1
      examples/spmv/dw_spmv.h
  17. 42 0
      examples/spmv/spmv_opencl.cl
  18. 147 0
      examples/stencil/stencil.c
  19. 43 0
      examples/stencil/stencil.h
  20. 66 0
      examples/stencil/stencil_opencl.c
  21. 100 0
      examples/variable/variable.c
  22. 30 0
      examples/variable/variable_kernels.cu
  23. 48 0
      examples/variable/variable_kernels_opencl.c
  24. 23 0
      examples/variable/variable_kernels_opencl_codelet.cl
  25. 14 2
      include/starpu.h
  26. 4 2
      include/starpu_config.h.in
  27. 6 0
      include/starpu_data_interfaces.h
  28. 192 0
      include/starpu_opencl.h
  29. 4 2
      include/starpu_perfmodel.h
  30. 2 0
      include/starpu_task.h
  31. 5 0
      src/Makefile.am
  32. 1 0
      src/core/jobs.h
  33. 3 0
      src/core/perfmodel/perfmodel.c
  34. 6 3
      src/core/perfmodel/perfmodel.h
  35. 335 66
      src/core/perfmodel/perfmodel_bus.c
  36. 173 21
      src/core/topology.c
  37. 28 0
      src/core/workers.c
  38. 16 3
      src/core/workers.h
  39. 6 4
      src/datawizard/coherency.c
  40. 67 0
      src/datawizard/copy_driver.c
  41. 23 0
      src/datawizard/copy_driver.h
  42. 121 0
      src/datawizard/interfaces/bcsr_interface.c
  43. 122 0
      src/datawizard/interfaces/block_interface.c
  44. 114 0
      src/datawizard/interfaces/csr_interface.c
  45. 4 0
      src/datawizard/interfaces/matrix_filters.c
  46. 120 1
      src/datawizard/interfaces/matrix_interface.c
  47. 109 0
      src/datawizard/interfaces/variable_interface.c
  48. 8 0
      src/datawizard/interfaces/vector_filters.c
  49. 118 2
      src/datawizard/interfaces/vector_interface.c
  50. 2 1
      src/datawizard/memory_nodes.h
  51. 422 0
      src/drivers/opencl/driver_opencl.c
  52. 57 0
      src/drivers/opencl/driver_opencl.h
  53. 430 0
      src/drivers/opencl/driver_opencl_utils.c
  54. 41 0
      src/drivers/opencl/driver_opencl_utils.h
  55. 1 0
      src/util/execute_on_all.c
  56. 93 16
      src/util/malloc.c
  57. 6 1
      tests/Makefile.am
  58. 2 1
      tests/core/empty_task_sync_point.c
  59. 2 1
      tests/core/execute_on_a_specific_worker.c
  60. 2 1
      tests/core/multithreaded.c
  61. 2 1
      tests/core/regenerate.c
  62. 5 2
      tests/core/starpu_wait_all_tasks.c
  63. 4 2
      tests/core/starpu_wait_task.c
  64. 3 2
      tests/core/static_restartable.c
  65. 2 1
      tests/core/static_restartable_tag.c
  66. 2 1
      tests/core/static_restartable_using_initializer.c
  67. 2 1
      tests/core/subgraph_repeat.c
  68. 2 1
      tests/core/subgraph_repeat_regenerate.c
  69. 2 1
      tests/core/tag_wait_api.c
  70. 3 2
      tests/core/task_wait_api.c
  71. 2 1
      tests/datawizard/dining_philosophers.c
  72. 6 1
      tests/datawizard/dsm_stress.c
  73. 2 1
      tests/datawizard/readers_and_writers.c
  74. 19 4
      tests/datawizard/sync_and_notify_data.c
  75. 77 0
      tests/datawizard/sync_and_notify_data_opencl.c
  76. 30 0
      tests/datawizard/sync_and_notify_data_opencl_codelet.cl
  77. 4 1
      tests/datawizard/sync_with_data_with_mem.c
  78. 4 1
      tests/datawizard/sync_with_data_with_mem_non_blocking.c
  79. 4 1
      tests/datawizard/unpartition.c
  80. 19 1
      tests/datawizard/write_only_tmp_buffer.c
  81. 2 1
      tests/errorcheck/invalid_blocking_calls.c
  82. 3 1
      tests/errorcheck/starpu_init_noworker.c
  83. 3 1
      tests/helper/execute_on_all.c
  84. 2 1
      tests/helper/starpu_create_sync_task.c
  85. 5 2
      tests/microbenchs/async_tasks_overhead.c
  86. 2 1
      tests/microbenchs/prefetch_data_on_node.c
  87. 7 2
      tests/microbenchs/redundant_buffer.c
  88. 2 1
      tests/microbenchs/sync_tasks_overhead.c
  89. 2 1
      tests/microbenchs/tasks_overhead.c
  90. 2 1
      tests/overlap/overlap.c

+ 1 - 0
Makefile.am

@@ -34,4 +34,5 @@ include_HEADERS = 				\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
+	include/starpu_opencl.h			\
 	include/starpu_expert.h

+ 79 - 6
configure.ac

@@ -52,8 +52,10 @@ esac
 
 # This will be useful for program which use CUDA (and .cubin files) which need
 # some path to the CUDA code at runtime.
-AC_DEFINE_UNQUOTED(STARPU_DIR, "$PWD", [location of StarPU sources])
-AC_SUBST(STARPU_DIR, $PWD)
+AC_DEFINE_UNQUOTED(STARPU_BUILD_DIR, "$PWD", [location of StarPU build directory])
+AC_SUBST(STARPU_BUILD_DIR, $PWD)
+AC_DEFINE_UNQUOTED(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})", [location of StarPU sources])
+AC_SUBST(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})")
 
 AC_CHECK_LIB([pthread], [pthread_create])
 case "$target" in
@@ -300,6 +302,71 @@ fi
 
 ###############################################################################
 #                                                                             #
+#                                 OpenCL settings                             #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of OpenCL devices)
+AC_ARG_ENABLE(maxopencldev, [AS_HELP_STRING([--enable-maxopencldev=<number>],
+			[maximum number of OPENCL devices])],
+			nmaxopencldev=$enableval, nmaxopencldev=8)
+AC_MSG_RESULT($nmaxopencldev)
+AC_DEFINE_UNQUOTED(STARPU_MAXOPENCLDEVS, [$nmaxopencldev], 
+		[maximum number of OPENCL devices])
+AC_ARG_ENABLE(opencl, [AS_HELP_STRING([--disable-opencl],
+		[do not use OpenCL device(s)])],, [enable_opencl=maybe])
+
+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
+	#AC_MSG_CHECKING(whether OpenCL is available)
+	AC_ARG_WITH(opencl-dir, 
+		[AS_HELP_STRING([--with-opencl-dir=<path>],
+		[specify OpenCL installation directory (default is /usr/)])],
+		[
+			opencl_dir=$withval
+			# in case this was not explicit yet
+			enable_opencl=yes
+		], opencl_dir=/usr/)
+	
+	if test -d "$opencl_dir/include/"; then
+		CPPFLAGS="${CPPFLAGS} -I$opencl_dir/include/ "
+	fi
+
+	# do we have a valid OpenCL setup ?
+	have_valid_opencl=yes
+	AC_CHECK_HEADER([CL/cl.h],,[have_valid_opencl=no])
+
+	# we are looking for the proper option in LDFLAGS, so we save the
+	# current value of LDFLAGS so that we can add new things in it and
+	# restore it in case it's not working.
+	SAVED_LDFLAGS="${LDFLAGS}"
+
+	found_opencllib=no
+	if test -d "$opencl_dir/lib/"; then
+		LDFLAGS="${SAVED_LDFLAGS} -L$opencl_dir/lib/ "
+		AC_SEARCH_LIBS([clCreateKernel],[OpenCL],[found_opencllib=yes],[found_opencllib=no])
+	fi
+
+	# in case OpenCL was explicitely required, but is not available, this is an error
+	if test x$enable_opencl = xyes -a x$have_valid_opencl = no; then
+		AC_MSG_ERROR([cannot find OpenCL])
+	fi
+
+	# now we enable OpenCL if and only if a proper setup is available
+	enable_opencl=$have_valid_opencl
+fi
+
+AC_MSG_CHECKING(whether OpenCL should be used)
+AC_MSG_RESULT($enable_opencl)
+AC_SUBST(STARPU_USE_OPENCL, $enable_opencl)
+AM_CONDITIONAL(STARPU_USE_OPENCL, test x$enable_opencl = xyes)
+if test x$enable_opencl = xyes; then
+	AC_DEFINE(STARPU_USE_OPENCL, [1], [OpenCL support is activated])
+        AC_DEFINE_UNQUOTED(STARPU_OPENCL_DATADIR, "$(eval echo ${datarootdir}/starpu/opencl)", [Path to OpenCL codelets])
+        AC_SUBST(STARPU_OPENCL_DATAdir, "$(eval echo ${datarootdir}/starpu/opencl/examples)")
+fi
+
+###############################################################################
+#                                                                             #
 #                                 Cell settings                               #
 #                                                                             #
 ###############################################################################
@@ -506,13 +573,19 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 
 # We have one memory node shared by all CPU workers, one node per GPU, and
 # currently the Cell driver is using the same memory node as the CPU.
-if test x$enable_cuda = xyes; then
+maxnodes=1
+if test x$enable_cuda = xyes ; then
 	# we could have used nmaxcudadev + 1, but this would certainly give an
 	# odd number.
-	maxnodes=`expr 2 \* $nmaxcudadev`
-else
-	maxnodes=1	
+	maxnodes=`expr $maxnodes + $nmaxcudadev`
 fi
+if test x$enable_opencl = xyes ; then
+	# we could have used nmaxcudadev + 1, but this would certainly give an
+	# odd number.
+	maxnodes=`expr $maxnodes + $nmaxopencldev`
+fi
+# todo: set maxnodes to the next power of 2 greater than maxnodes
+
 AC_MSG_CHECKING(maximum number of memory nodes)
 AC_MSG_RESULT($maxnodes)
 AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],

+ 13 - 1
doc/starpu.texi

@@ -476,6 +476,10 @@ specified with the @code{STARPU_NCPUS} environment variable.
 This is the maximum number of CUDA devices that StarPU can use. This can also be
 specified with the @code{STARPU_NCUDA} environment variable.
 
+@item @code{nopencl} (default = -1):
+This is the maximum number of OpenCL devices that StarPU can use. This can also be
+specified with the @code{STARPU_NOPENCL} environment variable.
+
 @item @code{nspus} (default = -1):
 This is the maximum number of Cell SPUs that StarPU can use. This can also be
 specified with the @code{STARPU_NGORDON} environment variable.
@@ -600,7 +604,8 @@ an integer between 0 and @code{starpu_get_worker_count() - 1}.
 This function returns the type of worker associated to an identifier (as
 returned by the @code{starpu_get_worker_id} function). The returned value
 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
-core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
+core, @code{STARPU_CUDA_WORKER} for a CUDA device,
+@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
 @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
 identifier is unspecified.
 
@@ -716,6 +721,13 @@ be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
 field is ignored if @code{STARPU_CUDA} does not appear in the @code{.where}
 field, it must be non-null otherwise.
 
+@item @code{opencl_func} (optionnal):
+Is a function pointer to the OpenCL implementation of the codelet. Its
+prototype must be:
+@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
+This pointer is ignored if @code{OPENCL} does not appear in the
+@code{.where} field, it must be non-null otherwise.
+
 @item @code{gordon_func} (optionnal):
 This is the index of the Cell SPU implementation within the Gordon library.
 TODO

+ 91 - 5
examples/Makefile.am

@@ -431,11 +431,97 @@ check_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 	incrementer/incrementer
 
-if STARPU_USE_CUDA
-incrementer_incrementer_SOURCES =	\
-	incrementer/incrementer.c	\
-	incrementer/incrementer_kernels.cu
-else
 incrementer_incrementer_SOURCES =	\
 	incrementer/incrementer.c
+if STARPU_USE_CUDA
+incrementer_incrementer_SOURCES +=	\
+	incrementer/incrementer_kernels.cu
+endif
+if STARPU_USE_OPENCL
+incrementer_incrementer_SOURCES +=	\
+	incrementer/incrementer_kernels_opencl.c
+endif
+
+nobase_STARPU_OPENCL_DATA_DATA = \
+	incrementer/incrementer_kernels_opencl_codelet.cl
+
+###################
+# Stencil example #
+###################
+
+if STARPU_USE_OPENCL
+check_PROGRAMS +=		\
+	stencil/stencil
+examplebin_PROGRAMS +=		\
+	stencil/stencil
+stencil_stencil_SOURCES =	\
+	stencil/stencil.c       \
+	stencil/stencil_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	stencil/stencil_opencl_codelet.cl \
+	stencil/stencil.h
+endif
+
+####################
+# Variable example #
+####################
+
+check_PROGRAMS +=				\
+	variable/variable
+
+examplebin_PROGRAMS +=				\
+	variable/variable
+
+variable_variable_SOURCES =	\
+	variable/variable.c
+if STARPU_USE_CUDA
+variable_variable_SOURCES +=	\
+	variable/variable_kernels.cu
+endif
+if STARPU_USE_OPENCL
+variable_variable_SOURCES +=	\
+	variable/variable_kernels_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	variable/variable_kernels_opencl_codelet.cl
+endif
+
+######################
+# matVecMult example #
+######################
+
+check_PROGRAMS +=				\
+	matvecmult/matvecmult
+
+examplebin_PROGRAMS +=				\
+	matvecmult/matvecmult
+
+matvecmult_matvecmult_SOURCES =	\
+	matvecmult/matvecmult.c
+
+if STARPU_USE_OPENCL
+nobase_STARPU_OPENCL_DATA_DATA += \
+	matvecmult/matvecmult_kernel.cl
+endif
+
+#################
+# block example #
+#################
+
+check_PROGRAMS +=				\
+	block/block
+
+examplebin_PROGRAMS +=				\
+	block/block
+
+block_block_SOURCES =	\
+	block/block.c
+
+if STARPU_USE_CUDA
+block_block_SOURCES +=				\
+	block/block_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+nobase_STARPU_OPENCL_DATA_DATA += \
+	block/block_kernel.cl
 endif

+ 167 - 0
examples/block/block.c

@@ -0,0 +1,167 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <pthread.h>
+#include <math.h>
+
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
+        int i;
+
+        for(i=0 ; i<nx*ny*nz ; i++) block[i] *= *multiplier;
+}
+
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err, n;
+	float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
+	int nx = (int)STARPU_GET_BLOCK_NX(descr[0]);
+	int ny = (int)STARPU_GET_BLOCK_NY(descr[0]);
+	int nz = (int)STARPU_GET_BLOCK_NZ(descr[0]);
+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/block/block_kernel.cl", "block", devid);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+        n=0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
+	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
+	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
+	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
+	err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &multiplier);
+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+                size_t global=1024;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+        starpu_opencl_release(kernel);
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+typedef void (*device_func)(void **, void *);
+
+int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny, int pnz, float multiplier)
+{
+	starpu_codelet cl;
+	starpu_data_handle block_handle;
+        starpu_data_handle multiplier_handle;
+        int i, j, k;
+
+	starpu_register_block_data(&block_handle, 0, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
+	starpu_register_variable_data(&multiplier_handle, 0, (uintptr_t)&multiplier, sizeof(float));
+
+	cl.where = where;
+        cl.cuda_func = func;
+        cl.cpu_func = func;
+        cl.opencl_func = func;
+        cl.nbuffers = 2;
+        cl.model = NULL;
+
+        struct starpu_task *task = starpu_task_create();
+        task->cl = &cl;
+        task->callback_func = NULL;
+        task->buffers[0].handle = block_handle;
+        task->buffers[0].mode = STARPU_RW;
+        task->buffers[1].handle = multiplier_handle;
+        task->buffers[1].mode = STARPU_RW;
+
+        int ret = starpu_submit_task(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                fprintf(stderr, "No worker may execute this task\n");
+                return 1;
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+        starpu_sync_data_with_mem(block_handle, STARPU_R);
+
+        for(i=0 ; i<pnx*pny*pnz; i++) {
+          fprintf(stderr, "%f ", block[i]);
+        }
+        fprintf(stderr, "\n");
+
+        starpu_release_data_from_mem(block_handle);
+
+        return 0;
+}
+
+int main(int argc, char **argv)
+{
+	starpu_codelet cl;
+        float *block;
+        int i, ret;
+        int nx=3;
+        int ny=2;
+        int nz=4;
+        float multiplier=1.0;
+
+        starpu_init(NULL);
+
+        block = (float*)malloc(nx*ny*nz*sizeof(float));
+        assert(block);
+        for(i=0 ; i<nx*ny*nz ; i++) block[i] = i+1;
+
+        ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
+        if (!ret) multiplier *= 1.0;
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("examples/block/block_kernel.cl");
+        ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
+        if (!ret) multiplier *= 2.0;
+#endif
+#ifdef STARPU_USE_CUDA
+        ret = execute_on(STARPU_CUDA, cuda_codelet, block, nx, ny, nz, 3.0);
+        if (!ret) multiplier *= 3.0;
+#endif
+
+        // Check result is correct
+        ret=1;
+        for(i=0 ; i<nx*ny*nz ; i++) {
+          if (block[i] != (i+1) * multiplier) {
+            ret=0;
+            break;
+          }
+        }
+
+        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
+        starpu_shutdown();
+
+	return 0;
+}

+ 34 - 0
examples/block/block_cuda.cu

@@ -0,0 +1,34 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+static __global__ void cuda_block(float *block, int nx, int ny, int nz, float *multiplier)
+{
+        int i;
+        for(i=0 ; i<nx*ny*nz ; i++) block[i] *= *multiplier;
+}
+
+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        float *block = (float *)STARPU_GET_BLOCK_PTR(descr[0]);
+	int nx = STARPU_GET_BLOCK_NX(descr[0]);
+	int ny = STARPU_GET_BLOCK_NY(descr[0]);
+	int nz = STARPU_GET_BLOCK_NZ(descr[0]);
+        float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
+
+        cuda_block<<<1,1>>>(block, nx, ny, nz, multiplier);
+}

+ 23 - 0
examples/block/block_kernel.cl

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void block(__global float *b, int nx, int ny, int nz, __global float *multiplier)
+{
+        const int i = get_global_id(0);
+        if (i < nx*ny*nz) {
+                b[i] *= *multiplier;
+        }
+}

+ 1 - 1
examples/heat/Makefile.in

@@ -14,7 +14,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 
-export PKG_CONFIG_PATH=@STARPU_DIR@
+export PKG_CONFIG_PATH=@STARPU_BUILD_DIR@
 
 LIBS+=$$(pkg-config --libs libstarpu)
 CFLAGS+=$$(pkg-config --cflags libstarpu)

+ 23 - 10
examples/incrementer/incrementer.c

@@ -23,6 +23,10 @@ static unsigned niter = 50000;
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
+#ifdef STARPU_USE_OPENCL
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
 extern void cuda_codelet_host(float *tab);
 
 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
@@ -39,20 +43,27 @@ int main(int argc, char **argv)
 	if (argc == 2)
 		niter = atoi(argv[1]);
 
-	float float_array[3] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
+	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
 
 	starpu_data_handle float_array_handle;
 	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
-			(uintptr_t)&float_array, 3, sizeof(float));
+			(uintptr_t)&float_array, 4, sizeof(float));
+
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("examples/incrementer/incrementer_kernels_opencl_codelet.cl");
+#endif
 
 	starpu_codelet cl =
 	{
 		/* CUBLAS stands for CUDA kernels controlled from the host */
-		.where = STARPU_CPU|STARPU_CUDA,
+		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 		.cpu_func = cpu_codelet,
 #ifdef STARPU_USE_CUDA
 		.cuda_func = cuda_codelet,
 #endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_func = opencl_codelet,
+#endif
 		.nbuffers = 1
 	};
 
@@ -67,7 +78,7 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &cl;
-		
+
 		task->callback_func = NULL;
 
 		task->buffers[0].handle = float_array_handle;
@@ -85,15 +96,17 @@ int main(int argc, char **argv)
 
 	/* update the array in RAM */
 	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
-	
+
 	gettimeofday(&end, NULL);
 
-	fprintf(stderr, "array -> %f, %f, %f\n", float_array[0], 
-			float_array[1], float_array[2]);
-	
-	if (float_array[0] != float_array[1] + float_array[2])
+	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
+                float_array[1], float_array[2], float_array[3]);
+
+	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
+		fprintf(stderr, "Incorrect result\n");
 		return 1;
-	
+	}
+
 	starpu_release_data_from_mem(float_array_handle);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +

+ 49 - 0
examples/incrementer/incrementer_kernels_opencl.c

@@ -0,0 +1,49 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <CL/cl.h>
+
+void opencl_codelet(void *descr[], void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/incrementer/incrementer_kernels_opencl_codelet.cl", "incrementer", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=4;
+		size_t local=4;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 23 - 0
examples/incrementer/incrementer_kernels_opencl_codelet.cl

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void incrementer(__global float* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 0 || i == 3)
+		input[i] = input[i] + 1.0;
+}
+

+ 193 - 0
examples/matvecmult/matvecmult.c

@@ -0,0 +1,193 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <pthread.h>
+#include <math.h>
+
+//static int width=1100;
+//static int height=244021;
+static int width=20;
+static int height=4;
+
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err, n;
+	float *matrix = (float *)STARPU_GET_MATRIX_PTR(descr[0]);
+	float *vector = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
+	float *mult = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/matvecmult/matvecmult_kernel.cl", "matVecMult", devid);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+        n=0;
+        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&width);
+        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&height);
+        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+                size_t global=512;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}
+
+void fillArray(float* pfData, int iSize) {
+    int i;
+    const float fScale = 1.0f / (float)RAND_MAX;
+    for (i = 0; i < iSize; ++i) {
+            pfData[i] = fScale * rand();
+    }
+}
+
+void printArray(float* pfData, int iSize) {
+    int i;
+    for (i = 0; i < iSize; ++i) {
+            fprintf(stderr, "%f ", pfData[i]);
+    }
+    fprintf(stderr, "\n");
+}
+
+void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+        double sum = 0;
+        for (j = 0; j < width; ++j) {
+            double a = matrix[i * width + j];
+            double b = vector[j];
+            sum += a * b;
+        }
+        mult[i] = (float)sum;
+    }
+}
+
+int compareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon) {
+    float error = 0;
+    float ref = 0;
+    unsigned int i;
+
+    for(i = 0; i < len; ++i) {
+        float diff = reference[i] - data[i];
+        error += diff * diff;
+        ref += reference[i] * reference[i];
+    }
+
+    float normRef = sqrtf(ref);
+    if (fabs(ref) < 1e-7) return 1;
+
+    float normError = sqrtf(error);
+    error = normError / normRef;
+
+    return error < epsilon ? 0 : 1;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	starpu_codelet cl;
+        starpu_init(NULL);
+
+#ifdef STARPU_USE_OPENCL
+        float *matrix, *vector, *mult;
+        float *correctResult;
+        unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
+
+	starpu_data_handle matrix_handle, vector_handle, mult_handle;
+
+        mem_size_matrix = width * height * sizeof(float);
+        matrix = (float*)malloc(mem_size_matrix);
+        mem_size_vector = width * sizeof(float);
+        vector = (float*)malloc(mem_size_vector);
+        mem_size_mult = height * sizeof(float);
+        mult = (float*)malloc(mem_size_mult);
+        correctResult = (float*)malloc(mem_size_mult);
+
+        assert(matrix);
+        assert(vector);
+        assert(mult);
+        assert(correctResult);
+
+        fillArray(matrix, width*height);
+        fillArray(vector, width);
+        fillArray(mult, height);
+        matVecMult(matrix, vector, width, height, correctResult);
+
+	starpu_register_matrix_data(&matrix_handle, 0, (uintptr_t)matrix, width, width, height, sizeof(float));
+	starpu_register_vector_data(&vector_handle, 0, (uintptr_t)vector, width, sizeof(float));
+	starpu_register_vector_data(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
+
+        _starpu_opencl_compile_source_to_opencl("examples/matvecmult/matvecmult_kernel.cl");
+
+	cl.where = STARPU_OPENCL;
+        cl.opencl_func = opencl_codelet;
+        cl.nbuffers = 3;
+        cl.model = NULL;
+
+        struct starpu_task *task = starpu_task_create();
+        task->cl = &cl;
+        task->callback_func = NULL;
+        task->buffers[0].handle = matrix_handle;
+        task->buffers[0].mode = STARPU_R;
+        task->buffers[1].handle = vector_handle;
+        task->buffers[1].mode = STARPU_R;
+        task->buffers[2].handle = mult_handle;
+        task->buffers[2].mode = STARPU_RW;
+
+        int ret = starpu_submit_task(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                fprintf(stderr, "No worker may execute this task\n");
+                exit(0);
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+        starpu_sync_data_with_mem(matrix_handle, STARPU_R);
+        starpu_sync_data_with_mem(vector_handle, STARPU_R);
+        starpu_sync_data_with_mem(mult_handle, STARPU_R);
+
+        int res = compareL2fe(correctResult, mult, height, 1e-6f);
+        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
+#if 0
+        printArray(matrix, width*height);
+        printArray(vector, width);
+        printArray(mult, height);
+#endif
+        starpu_release_data_from_mem(matrix_handle);
+        starpu_release_data_from_mem(vector_handle);
+        starpu_release_data_from_mem(mult_handle);
+
+        starpu_shutdown();
+#endif
+
+	return 0;
+}

+ 48 - 0
examples/matvecmult/matvecmult_kernel.cl

@@ -0,0 +1,48 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and
+ * proprietary rights in and to this software and related documentation.
+ * Any use, reproduction, disclosure, or distribution of this software
+ * and related documentation without an express license agreement from
+ * NVIDIA Corporation is strictly prohibited.
+ *
+ * Please refer to the applicable NVIDIA end user license agreement (EULA)
+ * associated with this source code for terms and conditions that govern
+ * your use of this NVIDIA software.
+ *
+ */
+
+/* Matrix-vector multiplication: W = M * V.
+ * Device code.
+ *
+ * This sample implements matrix-vector multiplication.
+ * It has been written for clarity of exposition to illustrate various OpenCL
+ * programming principles and optimizatoins, not with the goal of providing
+ * the most performant generic kernel for matrix-vector multiplication.
+ *
+ * CUBLAS provides high-performance matrix-vector multiplication on GPU.
+ */
+
+__kernel void matVecMult(
+                         __global float* M,
+                         __global float* V,
+                         int width, int height,
+                         __global float* W
+                         )
+{
+        // Row index
+        uint y = get_global_id(0);
+        if (y < height) {
+                // Row pointer
+                const __global float* row = M + y * width;
+
+                // Compute dot product
+                float dotProduct = 0;
+                for (int x = 0; x < width; ++x)
+                        dotProduct += row[x] * V[x];
+
+                // Write result to global memory
+                W[y] = dotProduct;
+        }
+}

+ 2 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -40,8 +40,8 @@ void parse_args(int argc, char **argv)
 		strcpy(filename_out, argv[2]);
 	}
 	else {
-		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_in_default);
-		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_DIR, filename_out_default);
+		sprintf(filename_in, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_in_default);
+		sprintf(filename_out, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_out_default);
 	}
 }
 

+ 76 - 2
examples/spmv/dw_spmv.c

@@ -27,6 +27,60 @@ extern void spmv_kernel_cuda(void *descr[], void *args);
 struct timeval start;
 struct timeval end;
 
+#ifdef STARPU_USE_OPENCL
+#include "starpu_opencl.h"
+void spmv_kernel_opencl(void *descr[], void *args)
+{
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err, n;
+
+	uint32_t nnz = STARPU_GET_CSR_NNZ(descr[0]);
+	uint32_t nrow = STARPU_GET_CSR_NROW(descr[0]);
+	float *nzval = (float *)STARPU_GET_CSR_NZVAL(descr[0]);
+	uint32_t *colind = STARPU_GET_CSR_COLIND(descr[0]);
+	uint32_t *rowptr = STARPU_GET_CSR_ROWPTR(descr[0]);
+	uint32_t firstentry = STARPU_GET_CSR_FIRSTENTRY(descr[0]);
+
+	float *vecin = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
+	uint32_t nx_in = STARPU_GET_VECTOR_NX(descr[1]);
+
+	float *vecout = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
+	uint32_t nx_out = STARPU_GET_VECTOR_NX(descr[2]);
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/spmv/spmv_opencl.cl", "spvm", devid);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+        n=0;
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nnz);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nrow);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &nzval);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &colind);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &rowptr);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &firstentry);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecin);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_in);
+	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecout);
+	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_out);
+        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+                size_t global=1024;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+        starpu_opencl_release(kernel);
+}
+#endif
+
 unsigned nblocks = 2;
 uint32_t size = 4194304;
 
@@ -189,14 +243,28 @@ void call_spmv_codelet_filters(void)
 	starpu_partition_data(sparse_matrix, &csr_f);
 	starpu_partition_data(vector_out, &vector_f);
 
+#ifdef STARPU_USE_OPENCL
+        {
+                int ret = _starpu_opencl_compile_source_to_opencl("examples/spmv/spmv_opencl.cl");
+                if (ret)
+		{
+			fprintf(stderr, "Failed to compile OpenCL codelet\n");
+			exit(ret);
+		}
+        }
+#endif
+
 	starpu_codelet cl;
 	memset(&cl, 0, sizeof(starpu_codelet));
 
-	cl.where = STARPU_CPU|STARPU_CUDA;
+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
 	cl.cpu_func =  cpu_spmv;
 #ifdef STARPU_USE_CUDA
 	cl.cuda_func = spmv_kernel_cuda;
 #endif
+#ifdef STARPU_USE_OPENCL
+        cl.opencl_func = spmv_kernel_opencl;
+#endif
 	cl.nbuffers = 3;
 	cl.model = NULL;
 
@@ -206,6 +274,7 @@ void call_spmv_codelet_filters(void)
 	for (part = 0; part < nblocks; part++)
 	{
 		struct starpu_task *task = starpu_task_create();
+                int ret;
 
 		task->callback_func = NULL;
 
@@ -219,7 +288,12 @@ void call_spmv_codelet_filters(void)
 		task->buffers[2].handle = starpu_get_sub_data(vector_out, 1, part);
 		task->buffers[2].mode = STARPU_W;
 	
-		starpu_submit_task(task);
+		ret = starpu_submit_task(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			fprintf(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
 	}
 
 	starpu_wait_all_tasks();

+ 0 - 1
examples/spmv/dw_spmv.h

@@ -24,7 +24,6 @@
 #include <sys/types.h>
 #include <pthread.h>
 #include <signal.h>
-#include <cblas.h>
 
 #include <starpu.h>
 

+ 42 - 0
examples/spmv/spmv_opencl.cl

@@ -0,0 +1,42 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void spvm(unsigned nnz, unsigned nrow,
+                   __global float* nzval, __global unsigned* colind,
+                   __global unsigned* rowptr, unsigned firstentry,
+                   __global float *vecin, unsigned nx_in,
+                   __global float *vecout, unsigned nx_out)
+{
+	unsigned row;
+	for (row = 0; row < nrow; row++)
+	{
+		float tmp = 0.0f;
+		unsigned index;
+
+		unsigned firstindex = rowptr[row] - firstentry;
+		unsigned lastindex = rowptr[row+1] - firstentry;
+
+		for (index = firstindex; index < lastindex; index++)
+		{
+			unsigned col;
+
+			col = colind[index];
+			tmp += nzval[index]*vecin[col];
+		}
+
+		vecout[row] = tmp;
+	}
+}

+ 147 - 0
examples/stencil/stencil.c

@@ -0,0 +1,147 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "stencil.h"
+
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+
+static int verbose = 0;
+
+static
+void display_non_zero_values(TYPE *ptr, char *msg)
+{
+        if(verbose) {
+                int x, y, z;
+
+                for(z = 0; z < DIM; z++)
+                        for(y = 0; y < DIM; y++)
+                                for(x = 0; x < DIM; x++) {
+                                        TYPE r = ptr[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD];
+                                        if(r != 0.0)
+                                                printf("%s[%d, %d, %d] == %f\n", msg, z, y, x, r);
+                                }
+        }
+}
+
+int main(int argc, char **argv)
+{
+        TYPE *data;                         // original data set given to device
+        TYPE *results;                      // results returned from device
+        TYPE C0 = 0.25;
+        TYPE C1 = 0.75;
+        starpu_data_handle data_handle;
+        starpu_data_handle results_handle;
+        starpu_data_handle C0_handle;
+        starpu_data_handle C1_handle;
+
+	starpu_init(NULL);
+
+        // Filter args
+        argv++;
+        while (argc > 1) {
+                if(!strcmp(*argv, "--verbose")) {
+                        verbose = 1;
+                } else
+                        break;
+                argc--; argv++;
+        }
+
+        // Fill our data set with random float values
+        {
+                long i, x, y, z;
+
+                data = (TYPE *)malloc(SIZE * sizeof(TYPE));
+                results = (TYPE *)malloc(SIZE * sizeof(TYPE));
+                if (!data || !results) {
+                        fprintf(stderr, "Malloc failed!\n");
+                        return;
+                }
+
+                for(i = 0; i < SIZE; i++) {
+                        data[i] = 0.0;
+                        results[i] = 0.0;
+                }
+
+                z = ZBLOCK-1;
+                y = YBLOCK-1;
+                x = XBLOCK-1;
+
+                data[(z + 1) * SURFACE + (y + 1) * REALDIM + x + 1 + FIRST_PAD] = 2.0;
+        }
+
+        display_non_zero_values(data, "data");
+
+        starpu_register_vector_data(&data_handle, 0 /* home node */,
+                                    (uintptr_t)data, SIZE, sizeof(TYPE));
+        starpu_register_vector_data(&results_handle, 0 /* home node */,
+                                    (uintptr_t)results, SIZE, sizeof(TYPE));
+        starpu_register_vector_data(&C0_handle, 0 /* home node */,
+                                    (uintptr_t)&C0, 1, sizeof(TYPE));
+        starpu_register_vector_data(&C1_handle, 0 /* home node */,
+                                    (uintptr_t)&C1, 1, sizeof(TYPE));
+
+        _starpu_opencl_compile_source_to_opencl("examples/stencil/stencil_opencl_codelet.cl");
+
+	starpu_codelet cl =
+	{
+		.where = STARPU_OPENCL,
+		.opencl_func = opencl_codelet,
+		.nbuffers = 4
+	};
+
+	{
+		struct starpu_task *task = starpu_task_create();
+
+                task->cl = &cl;
+                task->callback_func = NULL;
+
+		task->buffers[0].handle = data_handle;
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = results_handle;
+		task->buffers[1].mode = STARPU_W;
+		task->buffers[2].handle = C0_handle;
+		task->buffers[2].mode = STARPU_R;
+		task->buffers[3].handle = C1_handle;
+		task->buffers[3].mode = STARPU_R;
+
+		int ret = starpu_submit_task(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			fprintf(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+	starpu_sync_data_with_mem(data_handle, STARPU_R);
+	starpu_sync_data_with_mem(results_handle, STARPU_R);
+	starpu_sync_data_with_mem(C0_handle, STARPU_R);
+	starpu_sync_data_with_mem(C1_handle, STARPU_R);
+
+	display_non_zero_values(results, "results");
+
+	starpu_release_data_from_mem(data_handle);
+	starpu_release_data_from_mem(results_handle);
+	starpu_release_data_from_mem(C0_handle);
+	starpu_release_data_from_mem(C1_handle);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 43 - 0
examples/stencil/stencil.h

@@ -0,0 +1,43 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STENCIL_H__
+#define __STENCIL_H__
+
+#define TYPE    float
+
+#define DIM     (128)
+#define BORDER  (1)
+
+#define PADDING (64 / sizeof(TYPE) - 2*BORDER)
+#define FIRST_PAD (PADDING+1)
+
+#define REALDIM (DIM + 2 * BORDER + PADDING)
+
+#define SURFACE   ((DIM + 2 * BORDER) * REALDIM)
+
+#define SIZE    ((DIM + 2 * BORDER) * SURFACE + 1)
+
+#define XBLOCK  (16)
+#define YBLOCK  (16)
+#define ZBLOCK  (64)
+
+#define X_PER_THREAD (1)
+#define Y_PER_THREAD (4)
+#define Z_PER_THREAD (ZBLOCK)
+
+#endif
+

+ 66 - 0
examples/stencil/stencil_opencl.c

@@ -0,0 +1,66 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <starpu_util.h>
+#include "stencil.h"
+
+void opencl_codelet(void *descr[], void *_args)
+{
+	float *data = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+	float *results = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
+	float *C0 = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
+	float *C1 = (float *)STARPU_GET_VECTOR_PTR(descr[3]);
+
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+        err = starpu_opencl_load_kernel(&kernel, &queue, "examples/stencil/stencil_opencl_codelet.cl", "stencil", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
+	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &results);
+	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &C0);
+	err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &C1);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global[3];
+		size_t local[3];
+
+                // Execute the kernel over the entire range of our 3d input data set
+                local[0] = XBLOCK / X_PER_THREAD;   // threads along the X axis
+                local[1] = YBLOCK / Y_PER_THREAD;   // threads along the Y axis
+                local[2] = ZBLOCK / Z_PER_THREAD;   // threads along the Z axis
+
+                global[0] = DIM / X_PER_THREAD;  // virtual size of global X axis
+                global[1] = DIM / Y_PER_THREAD;  // virtual size of global Y axis
+                global[2] = DIM / Z_PER_THREAD;  // virtual size of global Z axis
+
+                err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 100 - 0
examples/variable/variable.c

@@ -0,0 +1,100 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <pthread.h>
+
+static unsigned niter = 50000;
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+extern void cuda_codelet_host(float *tab);
+
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+
+	*val += 1.0f;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned i;
+        float foo;
+	starpu_data_handle float_array_handle;
+	starpu_codelet cl;
+
+	starpu_init(NULL);
+        if (argc == 2) niter = atoi(argv[1]);
+        foo = 0.0f;
+
+	starpu_register_variable_data(&float_array_handle, 0 /* home node */,
+                                      (uintptr_t)&foo, sizeof(float));
+
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("examples/variable/variable_kernels_opencl_codelet.cl");
+#endif
+
+	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
+        cl.cpu_func = cpu_codelet;
+#ifdef STARPU_USE_CUDA
+        cl.cuda_func = cuda_codelet;
+#endif
+#ifdef STARPU_USE_OPENCL
+        cl.opencl_func = opencl_codelet;
+#endif
+        cl.nbuffers = 1;
+        cl.model = NULL;
+
+	for (i = 0; i < niter; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+                int ret;
+
+		task->cl = &cl;
+
+		task->callback_func = NULL;
+
+		task->buffers[0].handle = float_array_handle;
+		task->buffers[0].mode = STARPU_RW;
+
+		ret = starpu_submit_task(task);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			fprintf(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+	}
+
+	starpu_wait_all_tasks();
+
+	/* update the array in RAM */
+	starpu_sync_data_with_mem(float_array_handle, STARPU_R);
+
+	fprintf(stderr, "variable -> %f\n", foo);
+
+	starpu_release_data_from_mem(float_array_handle);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 30 - 0
examples/variable/variable_kernels.cu

@@ -0,0 +1,30 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+static __global__ void cuda_variable(float * tab)
+{
+	*tab += 2;
+	return;
+}
+
+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+
+	cuda_variable<<<1,1>>>(val);
+}

+ 48 - 0
examples/variable/variable_kernels_opencl.c

@@ -0,0 +1,48 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+void opencl_codelet(void *descr[], void *_args)
+{
+	float *val = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+        id = starpu_get_worker_id();
+        devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue,
+                                        "examples/variable/variable_kernels_opencl_codelet.cl", "variable", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local=1;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 23 - 0
examples/variable/variable_kernels_opencl_codelet.cl

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void variable(__global float* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 0)
+		input[i] = input[i] + 4.0;
+}
+

+ 14 - 2
include/starpu.h

@@ -46,14 +46,19 @@ struct starpu_conf {
 	int ncpus;
 	/* maximum number of CUDA GPUs (-1 for default) */
 	int ncuda;
+	/* maximum number of OpenCL GPUs (-1 for default) */
+	int nopencl;
 	/* maximum number of Cell's SPUs (-1 for default) */
 	int nspus;
 
 	unsigned use_explicit_workers_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 
-	unsigned use_explicit_workers_gpuid;
-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
+	unsigned use_explicit_workers_cuda_gpuid;
+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
+
+	unsigned use_explicit_workers_opencl_gpuid;
+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 
 	/* calibrate performance models, if any */
 	unsigned calibrate;
@@ -75,6 +80,7 @@ unsigned starpu_get_worker_count(void);
 unsigned starpu_get_cpu_worker_count(void);
 unsigned starpu_get_cuda_worker_count(void);
 unsigned starpu_get_spu_worker_count(void);
+unsigned starpu_get_opencl_worker_count(void);
 
 /* Return the identifier of the thread in case this is associated to a worker.
  * This will return -1 if this function is called directly from the application
@@ -84,6 +90,7 @@ int starpu_get_worker_id(void);
 enum starpu_archtype {
 	STARPU_CPU_WORKER, /* CPU core */
 	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
+	STARPU_OPENCL_WORKER, /* OpenCL CUDA device */
 	STARPU_GORDON_WORKER /* Cell SPU */
 };
 
@@ -103,6 +110,11 @@ enum starpu_archtype starpu_get_worker_type(int id);
  * behaviour. */
 void starpu_get_worker_name(int id, char *dst, size_t maxlen);
 
+/* This functions returns the device id of the worker associated to an
+ *  identifier (as returned by the starpu_get_worker_id() function)
+ */
+int starpu_get_worker_devid(int id);
+
 #ifdef __cplusplus
 }
 #endif

+ 4 - 2
include/starpu_config.h.in

@@ -3,16 +3,17 @@
 
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CUDA
+#undef STARPU_USE_OPENCL
 #undef STARPU_USE_GORDON
 
 #undef STARPU_ATLAS
 #undef STARPU_GOTO
 #undef STARPU_SYSTEM_BLAS
 
+#undef STARPU_BUILD_DIR
+#undef STARPU_OPENCL_DATADIR
 #undef STARPU_HAVE_MAGMA
 
-#undef STARPU_DIR
-
 #undef STARPU_OPENGL_RENDER
 
 #undef STARPU_USE_GTK
@@ -37,6 +38,7 @@
 
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCUDADEVS
+#undef STARPU_MAXOPENCLDEVS
 
 #undef STARPU_HAVE_LIBNUMA
 

+ 6 - 0
include/starpu_data_interfaces.h

@@ -36,6 +36,8 @@ void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memo
 /* Matrix interface for dense matrices */
 typedef struct starpu_matrix_interface_s {
 	uintptr_t ptr;
+        uintptr_t dev_handle;
+        size_t offset;
 	uint32_t nx;
 	uint32_t ny;
 	uint32_t ld;
@@ -62,6 +64,8 @@ size_t starpu_get_matrix_elemsize(starpu_data_handle handle);
 /* BLOCK interface for 3D dense blocks */
 typedef struct starpu_block_interface_s {
 	uintptr_t ptr;
+        uintptr_t dev_handle;
+        size_t offset;
 	uint32_t nx;
 	uint32_t ny;
 	uint32_t nz;
@@ -93,6 +97,8 @@ size_t starpu_get_block_elemsize(starpu_data_handle handle);
 /* vector interface for contiguous (non-strided) buffers */
 typedef struct starpu_vector_interface_s {
 	uintptr_t ptr;
+        uintptr_t dev_handle;
+        size_t offset;
 	uint32_t nx;
 	size_t elemsize;
 } starpu_vector_interface_t;

+ 192 - 0
include/starpu_opencl.h

@@ -0,0 +1,192 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_OPENCL_H__
+#define __STARPU_OPENCL_H__
+
+#ifdef STARPU_USE_OPENCL
+#include <CL/cl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define STARPU_OPENCL_REPORT_ERROR(status)                                     \
+	do {                                                            \
+		char *errormsg;                                         \
+		switch (status) {                                       \
+		case CL_SUCCESS:                                        \
+			errormsg = "success";                           \
+			break;                                          \
+		case CL_DEVICE_NOT_FOUND:				\
+			errormsg = "Device not found";                  \
+			break;                                          \
+		case CL_DEVICE_NOT_AVAILABLE:				\
+			errormsg = "Device not available";              \
+			break;                                          \
+		case CL_COMPILER_NOT_AVAILABLE:				\
+			errormsg = "Compiler not available";            \
+			break;                                          \
+		case CL_MEM_OBJECT_ALLOCATION_FAILURE:			\
+			errormsg = "Memory object allocation failure";  \
+			break;                                          \
+		case CL_OUT_OF_RESOURCES:				\
+			errormsg = "Out of resources";                  \
+			break;                                          \
+		case CL_OUT_OF_HOST_MEMORY:				\
+			errormsg = "Out of host memory";                \
+			break;                                          \
+		case CL_PROFILING_INFO_NOT_AVAILABLE:			\
+			errormsg = "Profiling info not available";      \
+			break;                                          \
+		case CL_MEM_COPY_OVERLAP:				\
+			errormsg = "Memory copy overlap";               \
+			break;                                          \
+		case CL_IMAGE_FORMAT_MISMATCH:				\
+			errormsg = "Image format mismatch";             \
+			break;                                          \
+		case CL_IMAGE_FORMAT_NOT_SUPPORTED:			\
+			errormsg = "Image format not supported";        \
+			break;                                          \
+		case CL_BUILD_PROGRAM_FAILURE:				\
+			errormsg = "Build program failure";             \
+			break;                                          \
+		case CL_MAP_FAILURE:				        \
+			errormsg = "Map failure";                       \
+			break;                                          \
+		case CL_INVALID_VALUE:				        \
+			errormsg = "Invalid value";                     \
+			break;                                          \
+		case CL_INVALID_DEVICE_TYPE:				\
+			errormsg = "Invalid device type";               \
+			break;                                          \
+		case CL_INVALID_PLATFORM:				\
+			errormsg = "Invalid platform";                  \
+			break;                                          \
+		case CL_INVALID_DEVICE:				        \
+			errormsg = "Invalid device";                    \
+			break;                                          \
+		case CL_INVALID_CONTEXT:				\
+			errormsg = "Invalid context";                   \
+			break;                                          \
+		case CL_INVALID_QUEUE_PROPERTIES:			\
+			errormsg = "Invalid queue properties";          \
+			break;                                          \
+		case CL_INVALID_COMMAND_QUEUE:				\
+			errormsg = "Invalid command queue";             \
+			break;                                          \
+		case CL_INVALID_HOST_PTR:				\
+			errormsg = "Invalid host pointer";              \
+			break;                                          \
+		case CL_INVALID_MEM_OBJECT:				\
+			errormsg = "Invalid memory object";             \
+			break;                                          \
+		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:		\
+			errormsg = "Invalid image format descriptor";   \
+			break;                                          \
+		case CL_INVALID_IMAGE_SIZE:				\
+			errormsg = "Invalid image size";                \
+			break;                                          \
+		case CL_INVALID_SAMPLER:				\
+			errormsg = "Invalid sampler";                   \
+			break;                                          \
+		case CL_INVALID_BINARY:				        \
+			errormsg = "Invalid binary";                    \
+			break;                                          \
+		case CL_INVALID_BUILD_OPTIONS:				\
+			errormsg = "Invalid build options";             \
+			break;                                          \
+		case CL_INVALID_PROGRAM:				\
+			errormsg = "Invalid program";                   \
+			break;                                          \
+		case CL_INVALID_PROGRAM_EXECUTABLE:			\
+			errormsg = "Invalid program executable";        \
+			break;                                          \
+		case CL_INVALID_KERNEL_NAME:				\
+			errormsg = "Invalid kernel name";               \
+			break;                                          \
+		case CL_INVALID_KERNEL_DEFINITION:			\
+			errormsg = "Invalid kernel definition";         \
+			break;                                          \
+		case CL_INVALID_KERNEL:				        \
+			errormsg = "Invalid kernel";                    \
+			break;                                          \
+		case CL_INVALID_ARG_INDEX:				\
+			errormsg = "Invalid argument index";            \
+			break;                                          \
+		case CL_INVALID_ARG_VALUE:				\
+			errormsg = "Invalid argument value";            \
+			break;                                          \
+		case CL_INVALID_ARG_SIZE:				\
+			errormsg = "Invalid argument size";             \
+			break;                                          \
+		case CL_INVALID_KERNEL_ARGS:				\
+			errormsg = "Invalid kernel arguments";          \
+			break;                                          \
+		case CL_INVALID_WORK_DIMENSION:				\
+			errormsg = "Invalid work dimension";            \
+			break;                                          \
+		case CL_INVALID_WORK_GROUP_SIZE:			\
+			errormsg = "Invalid work group size";           \
+			break;                                          \
+		case CL_INVALID_WORK_ITEM_SIZE:				\
+			errormsg = "Invalid work item size";            \
+			break;                                          \
+		case CL_INVALID_GLOBAL_OFFSET:				\
+			errormsg = "Invalid global offset";             \
+			break;                                          \
+		case CL_INVALID_EVENT_WAIT_LIST:			\
+			errormsg = "Invalid event wait list";           \
+			break;                                          \
+		case CL_INVALID_EVENT:				        \
+			errormsg = "Invalid event";                     \
+			break;                                          \
+		case CL_INVALID_OPERATION:				\
+			errormsg = "Invalid operation";                 \
+			break;                                          \
+		case CL_INVALID_GL_OBJECT:				\
+			errormsg = "Invalid GL object";                 \
+			break;                                          \
+		case CL_INVALID_BUFFER_SIZE:				\
+			errormsg = "Invalid buffer size";               \
+			break;                                          \
+		case CL_INVALID_MIP_LEVEL:				\
+			errormsg = "Invalid MIP level";                 \
+			break;                                          \
+		case CL_INVALID_GLOBAL_WORK_SIZE:			\
+			errormsg = "Invalid global work size";          \
+			break;                                          \
+		default:						\
+			errormsg = "unknown error";			\
+			break;			                        \
+		}                                                       \
+		printf("oops in %s ... <%s> \n", __func__, errormsg);	\
+		assert(0);	                                        \
+	} while (0)
+
+void starpu_opencl_get_context(int devid, cl_context *context);
+void starpu_opencl_get_device(int devid, cl_device_id *device);
+void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid);
+int starpu_opencl_release(cl_kernel kernel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STARPU_USE_OPENCL
+#endif // __STARPU_OPENCL_H__
+

+ 4 - 2
include/starpu_perfmodel.h

@@ -21,6 +21,7 @@
 #include <pthread.h>
 #include <starpu.h>
 #include <starpu_config.h>
+#include <starpu_task.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -39,8 +40,9 @@ struct starpu_buffer_descr_t;
 enum starpu_perf_archtype {
 	STARPU_CPU_DEFAULT = 0,
 	STARPU_CUDA_DEFAULT = 1,
-	/* STARPU_CUDA_DEFAULT + devid */
-	STARPU_GORDON_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS
+	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
+	/* STARPU_OPENCL_DEFAULT + devid */
+	STARPU_GORDON_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS
 };
 
 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)

+ 2 - 0
include/starpu_task.h

@@ -31,6 +31,7 @@
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_SPU	((1ULL)<<4)
 #define STARPU_GORDON	((1ULL)<<5)
+#define STARPU_OPENCL	((1ULL)<<6)
 
 #define STARPU_MIN_PRIO        (-4)
 #define STARPU_MAX_PRIO        5
@@ -53,6 +54,7 @@ typedef struct starpu_codelet_t {
 	/* the different implementations of the codelet */
 	void (*cuda_func)(void **, void *);
 	void (*cpu_func)(void **, void *);
+	void (*opencl_func)(void **, void *);
 	uint8_t gordon_func;
 
 	/* how many buffers do the codelet takes as argument ? */

+ 5 - 0
src/Makefile.am

@@ -157,3 +157,8 @@ endif
 if STARPU_USE_GORDON
 libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
 endif
+
+if STARPU_USE_OPENCL
+libstarpu_la_SOURCES += drivers/opencl/driver_opencl.c
+libstarpu_la_SOURCES += drivers/opencl/driver_opencl_utils.c
+endif

+ 1 - 0
src/core/jobs.h

@@ -49,6 +49,7 @@ typedef void (*callback)(void *);
 #define STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
 #define STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
+#define STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
 
 /* a job is the internal representation of a task */
 LIST_TYPE(starpu_job,

+ 3 - 0
src/core/perfmodel/perfmodel.c

@@ -82,6 +82,9 @@ static double common_task_expected_length(struct starpu_perfmodel_t *model, uint
 			case STARPU_CUDA:
 				alpha = STARPU_CUDA_ALPHA;
 				break;
+  		        case STARPU_OPENCL:
+	                        alpha = STARPU_OPENCL_ALPHA;
+                                break;
 			default:
 				/* perhaps there are various worker types on that queue */
 				alpha = 1.0; // this value is not significant ...

+ 6 - 3
src/core/perfmodel/perfmodel.h

@@ -106,8 +106,11 @@ double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_
 void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);
 
-#ifdef STARPU_USE_CUDA
-int *_starpu_get_gpu_affinity_vector(unsigned gpuid);
+#if defined(STARPU_USE_CUDA)
+int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 #endif
- 
+#if defined(STARPU_USE_OPENCL)
+int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
+#endif
+
 #endif // __PERFMODEL_H__

+ 335 - 66
src/core/perfmodel/perfmodel_bus.c

@@ -23,8 +23,10 @@
 #include <unistd.h>
 #include <sys/time.h>
 #include <stdlib.h>
+#include <math.h>
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include <common/config.h>
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
@@ -34,7 +36,7 @@
 
 #define MAXCPUS	32
 
-struct cudadev_timing {
+struct dev_timing {
 	int cpu_id;
 	double timing_htod;
 	double timing_dtoh;
@@ -43,19 +45,29 @@ struct cudadev_timing {
 static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
 static unsigned was_benchmarked = 0;
+static unsigned ncpus = 0;
 static int ncuda = 0;
-
-static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
+static int nopencl = 0;
 
 /* Benchmarking the performance of the bus */
 
 #ifdef STARPU_USE_CUDA
+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+#endif
+#ifdef STARPU_USE_OPENCL
+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
+static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
+static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+#endif
 
-static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 
-static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
+#ifdef STARPU_USE_CUDA
+static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -84,7 +96,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
-	cudaHostAlloc((void **)&h_buffer, SIZE, 0); 
+	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
 	assert(h_buffer);
 
 	/* hack to avoid third party libs to rebind threads */
@@ -104,7 +116,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	struct timeval start;
 	struct timeval end;
 
-	cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
 
 	/* Measure upload bandwidth */
 	gettimeofday(&start, NULL);
@@ -116,7 +128,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
 
 	/* Measure download bandwidth */
 	gettimeofday(&start, NULL);
@@ -128,7 +140,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
 
 	/* Free buffers */
 	cudaFreeHost(h_buffer);
@@ -137,18 +149,101 @@ static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
 	cudaThreadExit();
 
 }
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
+{
+        cl_context context;
+        cl_command_queue queue;
+
+        struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+	/* Initialize OpenCL context on the device */
+        _starpu_opencl_init_context(dev);
+        starpu_opencl_get_context(dev, &context);
+        starpu_opencl_get_queue(dev, &queue);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+	/* Allocate a buffer on the device */
+        int err;
+	cl_mem d_buffer;
+	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, SIZE, NULL, &err);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+        /* Allocate a buffer on the host */
+	unsigned char *h_buffer;
+        h_buffer = malloc(SIZE);
+	assert(h_buffer);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+        /* Fill them */
+	memset(h_buffer, 0, SIZE);
+        err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* hack to avoid third party libs to rebind threads */
+	_starpu_bind_thread_on_cpu(config, cpu);
+
+        unsigned iter;
+	double timing;
+	struct timeval start;
+	struct timeval end;
+
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
+
+	/* Measure upload bandwidth */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+                err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
+
+	/* Measure download bandwidth */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+                err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, SIZE, h_buffer, 0, NULL, NULL);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
+
+	/* Free buffers */
+	clReleaseMemObject(d_buffer);
+	free(h_buffer);
+
+	/* Uninitiliaze OpenCL context on the device */
+        _starpu_opencl_deinit_context(dev);
+}
+#endif
 
 /* NB: we want to sort the bandwidth by DECREASING order */
-static int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
+static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
 {
-	const struct cudadev_timing *left = left_cudadev_timing;
-	const struct cudadev_timing *right = right_cudadev_timing;
-	
+	const struct dev_timing *left = left_dev_timing;
+	const struct dev_timing *right = right_dev_timing;
+
 	double left_dtoh = left->timing_dtoh;
 	double left_htod = left->timing_htod;
 	double right_dtoh = right->timing_dtoh;
 	double right_htod = right->timing_htod;
-	
+
 	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
 	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
 
@@ -156,47 +251,55 @@ static int compar_cudadev_timing(const void *left_cudadev_timing, const void *ri
 	return (bandwidth_sum2_left < bandwidth_sum2_right);
 }
 
-static void measure_bandwidth_between_host_and_dev(int dev, unsigned ncpus)
+static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
+                                                   struct dev_timing *dev_timing_per_cpu, char type)
 {
 	unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
-		measure_bandwidth_between_host_and_dev_on_cpu(dev, cpu);
-	}
+#ifdef STARPU_USE_CUDA
+                if (type == 'C')
+                        measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
+#endif
+#ifdef STARPU_USE_OPENCL
+                if (type == 'O')
+                        measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
+#endif
+        }
 
 	/* sort the results */
-	qsort(cudadev_timing_per_cpu[dev+1], ncpus,
-			sizeof(struct cudadev_timing),
-			compar_cudadev_timing);
-	
+	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
+              sizeof(struct dev_timing),
+			compar_dev_timing);
+
 #ifdef STARPU_VERBOSE
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
-		unsigned current_cpu = cudadev_timing_per_cpu[dev+1][cpu].cpu_id;
-		double bandwidth_dtoh = cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh;
-		double bandwidth_htod = cudadev_timing_per_cpu[dev+1][cpu].timing_htod;
+		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
+		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
+		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
 		fprintf(stderr, "BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
-	unsigned best_cpu = cudadev_timing_per_cpu[dev+1][0].cpu_id;
+	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
 
 	fprintf(stderr, "BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
 #endif
 
 	/* The results are sorted in a decreasing order, so that the best
 	 * measurement is currently the first entry. */
-	cudadev_timing_dtoh[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_dtoh;
-	cudadev_timing_htod[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_htod;
+	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
+	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
 }
-#endif
+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
-static void benchmark_all_cuda_devices(void)
+static void benchmark_all_gpu_devices(void)
 {
-#ifdef STARPU_USE_CUDA
-	int ret;
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+	int i, ret;
 
 #ifdef STARPU_VERBOSE
 	fprintf(stderr, "Benchmarking the speed of the bus\n");
@@ -213,15 +316,24 @@ static void benchmark_all_cuda_devices(void)
 	}
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = _starpu_topology_get_nhwcpu(config);
 
+#ifdef STARPU_USE_CUDA
         cudaGetDeviceCount(&ncuda);
-	int i;
 	for (i = 0; i < ncuda; i++)
 	{
 		/* measure bandwidth between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, ncpus);
+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+        nopencl = _starpu_opencl_get_device_count();
+	for (i = 0; i < nopencl; i++)
+	{
+		/* measure bandwith between Host and Device i */
+		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
 	}
+#endif
 
 	/* FIXME: use hwloc */
 	/* Restore the former affinity */
@@ -235,7 +347,7 @@ static void benchmark_all_cuda_devices(void)
 #ifdef STARPU_VERBOSE
 	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
 #endif
-#endif
+#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
 	was_benchmarked = 1;
 }
@@ -244,7 +356,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 {
 	_starpu_get_perf_model_dir_bus(path, maxlen);
 	strncat(path, type, maxlen);
-	
+
 	char hostname[32];
 	gethostname(hostname, 32);
 	strncat(path, ".", maxlen);
@@ -270,13 +382,13 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
-#ifdef STARPU_USE_CUDA
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = _starpu_topology_get_nhwcpu(config);
+        int gpu;
 
+#ifdef STARPU_USE_CUDA
         cudaGetDeviceCount(&ncuda);
-
-	int gpu;
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 		int ret;
@@ -292,7 +404,7 @@ static void load_bus_affinity_file_content(void)
 		unsigned cpu;
 		for (cpu = 0; cpu < ncpus; cpu++)
 		{
-			ret = fscanf(f, "%d\t", &affinity_matrix[gpu][cpu]);
+			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
 			STARPU_ASSERT(ret == 1);
 		}
 
@@ -300,6 +412,32 @@ static void load_bus_affinity_file_content(void)
 		STARPU_ASSERT(ret == 0);
 	}
 #endif
+#ifdef STARPU_USE_OPENCL
+        nopencl = _starpu_opencl_get_device_count();
+	for (gpu = 0; gpu < nopencl; gpu++)
+	{
+		int ret;
+
+		int dummy;
+
+		starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &dummy);
+		STARPU_ASSERT(ret == 1);
+
+		STARPU_ASSERT(dummy == gpu);
+
+		unsigned cpu;
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
+			STARPU_ASSERT(ret == 1);
+		}
+
+		ret = fscanf(f, "\n");
+		STARPU_ASSERT(ret == 0);
+	}
+#endif
+#endif
 
 	fclose(f);
 }
@@ -320,24 +458,36 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 	}
 
-#ifdef STARPU_USE_CUDA
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	unsigned cpu;
+        int gpu;
 
-	fprintf(f, "# GPU\t");
+        fprintf(f, "# GPU\t");
 	for (cpu = 0; cpu < ncpus; cpu++)
 		fprintf(f, "CPU%d\t", cpu);
 	fprintf(f, "\n");
 
-	int gpu;
+#ifdef STARPU_USE_CUDA
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 		fprintf(f, "%d\t", gpu);
 
 		for (cpu = 0; cpu < ncpus; cpu++)
 		{
-			fprintf(f, "%d\t", cudadev_timing_per_cpu[gpu+1][cpu].cpu_id);
+			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
+		}
+
+		fprintf(f, "\n");
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	for (gpu = 0; gpu < nopencl; gpu++)
+	{
+		fprintf(f, "%d\t", gpu);
+
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
 		}
 
 		fprintf(f, "\n");
@@ -345,12 +495,13 @@ static void write_bus_affinity_file_content(void)
 #endif
 
 	fclose(f);
+#endif
 }
 
 static void generate_bus_affinity_file(void)
 {
 	if (!was_benchmarked)
-		benchmark_all_cuda_devices();
+		benchmark_all_gpu_devices();
 
 	write_bus_affinity_file_content();
 }
@@ -372,10 +523,19 @@ static void load_bus_affinity_file(void)
 	load_bus_affinity_file_content();
 }
 
-int *_starpu_get_gpu_affinity_vector(unsigned gpuid)
+#ifdef STARPU_USE_CUDA
+int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
+{
+        return cuda_affinity_matrix[gpuid];
+}
+#endif /* STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
 {
-	return affinity_matrix[gpuid];
+        return opencl_affinity_matrix[gpuid];
 }
+#endif /* STARPU_USE_OPENCL */
 
 /*
  *	Latency
@@ -420,7 +580,7 @@ static void load_bus_latency_file_content(void)
 
 static void write_bus_latency_file_content(void)
 {
-	int src, dst;
+        int src, dst, maxnode;
 	FILE *f;
 
 	STARPU_ASSERT(was_benchmarked);
@@ -440,13 +600,17 @@ static void write_bus_latency_file_content(void)
 		fprintf(f, "to %d\t\t", dst);
 	fprintf(f, "\n");
 
-	for (src = 0; src < STARPU_MAXNODES; src++)
+        maxnode = ncuda;
+#ifdef STARPU_USE_OPENCL
+        maxnode += nopencl;
+#endif
+        for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		{
 			double latency;
 
-			if ((src > ncuda) || (dst > ncuda))
+			if ((src > maxnode) || (dst > maxnode))
 			{
 				/* convention */
 				latency = -1.0;
@@ -471,7 +635,7 @@ static void write_bus_latency_file_content(void)
 static void generate_bus_latency_file(void)
 {
 	if (!was_benchmarked)
-		benchmark_all_cuda_devices();
+		benchmark_all_gpu_devices();
 
 	write_bus_latency_file_content();
 }
@@ -494,7 +658,7 @@ static void load_bus_latency_file(void)
 }
 
 
-/* 
+/*
  *	Bandwidth
  */
 static void get_bandwidth_path(char *path, size_t maxlen)
@@ -540,7 +704,7 @@ static void load_bus_bandwidth_file_content(void)
 
 static void write_bus_bandwidth_file_content(void)
 {
-	int src, dst;
+	int src, dst, maxnode;
 	FILE *f;
 
 	STARPU_ASSERT(was_benchmarked);
@@ -556,25 +720,38 @@ static void write_bus_bandwidth_file_content(void)
 		fprintf(f, "to %d\t\t", dst);
 	fprintf(f, "\n");
 
+        maxnode = ncuda;
+#ifdef STARPU_USE_OPENCL
+        maxnode += nopencl;
+#endif
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		{
 			double bandwidth;
-			
-			if ((src > ncuda) || (dst > ncuda))
+
+			if ((src > maxnode) || (dst > maxnode))
 			{
 				bandwidth = -1.0;
 			}
-#ifdef STARPU_USE_CUDA
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 			else if (src != dst)
 			{
-			/* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
-				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
-				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
-				
+                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
+                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
+#ifdef STARPU_USE_CUDA
+				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
+                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
+#endif
+#ifdef STARPU_USE_OPENCL
+                                if (src > ncuda)
+                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
+                                if (dst > ncuda)
+                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
+#endif
+
 				double timing =time_src_to_ram + time_ram_to_dst;
-				
+
 				bandwidth = 1.0*SIZE/timing;
 			}
 #endif
@@ -582,7 +759,7 @@ static void write_bus_bandwidth_file_content(void)
 			        /* convention */
 			        bandwidth = 0.0;
 			}
-			
+
 			fprintf(f, "%lf\t", bandwidth);
 		}
 
@@ -595,7 +772,7 @@ static void write_bus_bandwidth_file_content(void)
 static void generate_bus_bandwidth_file(void)
 {
 	if (!was_benchmarked)
-		benchmark_all_cuda_devices();
+		benchmark_all_gpu_devices();
 
 	write_bus_bandwidth_file_content();
 }
@@ -618,6 +795,96 @@ static void load_bus_bandwidth_file(void)
 }
 
 /*
+ *	Config
+ */
+static void get_config_path(char *path, size_t maxlen)
+{
+	get_bus_path("config", path, maxlen);
+}
+
+static void check_bus_config_file()
+{
+        int res;
+        char path[256];
+
+        get_config_path(path, 256);
+        res = access(path, F_OK);
+        if (res) {
+                starpu_force_bus_sampling();
+        }
+        else {
+                FILE *f;
+                int ret, read_cuda, read_opencl;
+                unsigned read_cpus;
+                struct starpu_machine_config_s *config = _starpu_get_machine_config();
+
+                // Loading configuration from file
+                f = fopen(path, "r");
+                STARPU_ASSERT(f);
+                starpu_drop_comments(f);
+                ret = fscanf(f, "%d\t", &read_cpus);
+		STARPU_ASSERT(ret == 1);
+                starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &read_cuda);
+		STARPU_ASSERT(ret == 1);
+                starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &read_opencl);
+		STARPU_ASSERT(ret == 1);
+                starpu_drop_comments(f);
+                fclose(f);
+
+                // Loading current configuration
+                ncpus = _starpu_topology_get_nhwcpu(config);
+#ifdef STARPU_USE_CUDA
+                cudaGetDeviceCount(&ncuda);
+#endif
+#ifdef STARPU_USE_OPENCL
+                nopencl = _starpu_opencl_get_device_count();
+#endif
+
+                // Checking if both configurations match
+                if (read_cpus != ncpus) {
+                        fprintf(stderr, "Current configuration does not match the performance model (CPUS: %d != %d)\n", read_cpus, ncpus);
+                        starpu_force_bus_sampling();
+                }
+                else if (read_cuda != ncuda) {
+                        fprintf(stderr, "Current configuration does not match the performance model (CUDA: %d != %d)\n", read_cuda, ncuda);
+                        starpu_force_bus_sampling();
+                }
+                else if (read_opencl != nopencl) {
+                        fprintf(stderr, "Current configuration does not match the performance model (OpenCL: %d != %d)\n", read_opencl, nopencl);
+                        starpu_force_bus_sampling();
+                }
+        }
+}
+
+static void write_bus_config_file_content(void)
+{
+	FILE *f;
+	char path[256];
+
+	STARPU_ASSERT(was_benchmarked);
+        get_config_path(path, 256);
+        f = fopen(path, "w+");
+	STARPU_ASSERT(f);
+
+        fprintf(f, "# Current configuration\n");
+        fprintf(f, "%d # Number of CPUs\n", ncpus);
+        fprintf(f, "%d # Number of CUDA devices\n", ncuda);
+        fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
+
+        fclose(f);
+}
+
+static void generate_bus_config_file()
+{
+	if (!was_benchmarked)
+		benchmark_all_gpu_devices();
+
+	write_bus_config_file_content();
+}
+
+/*
  *	Generic
  */
 
@@ -628,12 +895,14 @@ void starpu_force_bus_sampling(void)
 	generate_bus_affinity_file();
 	generate_bus_latency_file();
 	generate_bus_bandwidth_file();
+        generate_bus_config_file();
 }
 
 void _starpu_load_bus_performance_files(void)
 {
 	_starpu_create_sampling_directory_if_needed();
 
+        check_bus_config_file();
 	load_bus_affinity_file();
 	load_bus_latency_file();
 	load_bus_bandwidth_file();

+ 173 - 21
src/core/topology.c

@@ -21,6 +21,7 @@
 #include <core/debug.h>
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
+#include <common/hash.h>
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -37,8 +38,16 @@ static unsigned topology_is_initialized = 0;
 
 static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config);
 
-#ifdef STARPU_USE_CUDA
-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config);
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+#  ifdef STARPU_USE_CUDA
+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config);
+static struct starpu_htbl32_node_s *devices_using_cuda = NULL;
+#  endif
+#  ifdef STARPU_USE_OPENCL
+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config);
+#  endif
+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus);
 static unsigned may_bind_automatically = 0;
 #endif
 
@@ -47,12 +56,70 @@ static unsigned may_bind_automatically = 0;
  */
 
 #ifdef STARPU_USE_CUDA
-static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *config)
+static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config)
+{
+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_cuda_gpuid,
+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_cuda_gpuid,
+                                         &(config->current_cuda_gpuid), (int *)config->workers_cuda_gpuid, "STARPU_WORKERS_CUDAID",
+                                         config->nhwcudagpus);
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config)
+{
+        _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_opencl_gpuid,
+                                         config->user_conf==NULL?NULL:(int *)config->user_conf->workers_opencl_gpuid,
+                                         &(config->current_opencl_gpuid), (int *)config->workers_opencl_gpuid, "STARPU_WORKERS_OPENCLID",
+                                         config->nhwopenclgpus);
+
+#ifdef STARPU_USE_CUDA
+        // Detect devices which are already used with CUDA
+        {
+                unsigned tmp[STARPU_NMAXWORKERS];
+                unsigned nb=0;
+                int i;
+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
+                        if (_starpu_htbl_search_32(devices_using_cuda, key) == NULL) {
+                                tmp[nb] = config->workers_opencl_gpuid[i];
+                                nb++;
+                        }
+                }
+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
+        }
+#endif /* STARPU_USE_CUDA */
+        {
+                // Detect identical devices
+                struct starpu_htbl32_node_s *devices_already_used = NULL;
+                unsigned tmp[STARPU_NMAXWORKERS];
+                unsigned nb=0;
+                int i;
+
+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
+                        uint32_t key = _starpu_crc32_be(config->workers_opencl_gpuid[i], 0);
+                        if (_starpu_htbl_search_32(devices_already_used, key) == NULL) {
+                                _starpu_htbl_insert_32(&devices_already_used, key, config);
+                                tmp[nb] = config->workers_opencl_gpuid[i];
+                                nb ++;
+                        }
+                }
+                for(i=nb ; i<STARPU_NMAXWORKERS ; i++) tmp[i] = -1;
+                memcpy(config->workers_opencl_gpuid, tmp, sizeof(unsigned)*STARPU_NMAXWORKERS);
+        }
+}
+#endif
+
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
+                                             int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus)
 {
 	char *strval;
 	unsigned i;
 
-	config->current_gpuid = 0;
+	*current = 0;
 
 	/* conf->workers_bindid indicates the successive cpu identifier that
 	 * should be used to bind the workers. It should be either filled
@@ -62,14 +129,14 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 	 * cpus. */
 
 	/* what do we use, explicit value, env. variable, or round-robin ? */
-	if (config->user_conf && config->user_conf->use_explicit_workers_gpuid)
+	if (use_explicit_workers_gpuid)
 	{
 		/* we use the explicit value from the user */
-		memcpy(config->workers_gpuid,
-			config->user_conf->workers_gpuid,
-			STARPU_NMAXWORKERS*sizeof(unsigned));
+		memcpy(workers_gpuid,
+                       explicit_workers_gpuid,
+                       STARPU_NMAXWORKERS*sizeof(unsigned));
 	}
-	else if ((strval = getenv("STARPU_WORKERS_CUDAID")))
+	else if ((strval = getenv(varname)))
 	{
 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
@@ -86,7 +153,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 				val = strtol(strval, &endptr, 10);
 				if (endptr != strval)
 				{
-					config->workers_gpuid[i] = (unsigned)val;
+					workers_gpuid[i] = (unsigned)val;
 					strval = endptr;
 				}
 				else {
@@ -97,11 +164,11 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 					/* there is no more values in the string */
 					wrap = 1;
 
-					config->workers_gpuid[i] = config->workers_gpuid[0];
+					workers_gpuid[i] = workers_gpuid[0];
 				}
 			}
 			else {
-				config->workers_gpuid[i] = config->workers_gpuid[i % number_of_entries];
+				workers_gpuid[i] = workers_gpuid[i % number_of_entries];
 			}
 		}
 	}
@@ -109,7 +176,7 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 	{
 		/* by default, we take a round robin policy */
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
-			config->workers_gpuid[i] = (unsigned)i;
+			workers_gpuid[i] = (unsigned)(i % nhwgpus);
 
 		/* StarPU can use sampling techniques to bind threads correctly */
 		may_bind_automatically = 1;
@@ -117,11 +184,18 @@ static void _starpu_initialize_workers_gpuid(struct starpu_machine_config_s *con
 }
 #endif
 
-static inline int _starpu_get_next_gpuid(struct starpu_machine_config_s *config)
+static inline int _starpu_get_next_cuda_gpuid(struct starpu_machine_config_s *config)
 {
-	unsigned i = ((config->current_gpuid++) % config->ncudagpus);
+	unsigned i = ((config->current_cuda_gpuid++) % config->ncudagpus);
 
-	return (int)config->workers_gpuid[i];
+	return (int)config->workers_cuda_gpuid[i];
+}
+
+static inline int _starpu_get_next_opencl_gpuid(struct starpu_machine_config_s *config)
+{
+	unsigned i = ((config->current_opencl_gpuid++) % config->nopenclgpus);
+
+	return (int)config->workers_opencl_gpuid[i];
 }
 
 static void _starpu_init_topology(struct starpu_machine_config_s *config)
@@ -152,7 +226,14 @@ static void _starpu_init_topology(struct starpu_machine_config_s *config)
 #warning no way to know number of cores, assuming 1
 		config->nhwcpus = 1;
 #endif
-	
+
+#ifdef STARPU_USE_CUDA
+                config->nhwcudagpus = _starpu_get_cuda_device_count();
+#endif
+#ifdef STARPU_USE_OPENCL
+                config->nhwopenclgpus = _starpu_opencl_get_device_count();
+#endif
+
 		topology_is_initialized = 1;
 	}
 }
@@ -208,22 +289,79 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 	if (config->ncudagpus > 0)
 		use_accelerator = 1;
 
-	_starpu_initialize_workers_gpuid(config);
+	_starpu_initialize_workers_cuda_gpuid(config);
 
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < config->ncudagpus; cudagpu++)
 	{
 		config->workers[config->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
-		int devid = _starpu_get_next_gpuid(config);
+		int devid = _starpu_get_next_cuda_gpuid(config);
 		enum starpu_perf_archtype arch = STARPU_CUDA_DEFAULT + devid;
 		config->workers[config->nworkers + cudagpu].devid = devid;
 		config->workers[config->nworkers + cudagpu].perf_arch = arch; 
 		config->workers[config->nworkers + cudagpu].worker_mask = STARPU_CUDA;
 		config->worker_mask |= STARPU_CUDA;
-	}
+
+                uint32_t key = _starpu_crc32_be(devid, 0);
+                _starpu_htbl_insert_32(&devices_using_cuda, key, config);
+        }
 
 	config->nworkers += config->ncudagpus;
 #endif
+
+#ifdef STARPU_USE_OPENCL
+	if (user_conf && (user_conf->nopencl == 0))
+	{
+		/* the user explicitely disabled OpenCL */
+		config->nopenclgpus = 0;
+	}
+	else {
+		/* we need to initialize OpenCL early to count the number of devices */
+		_starpu_opencl_init();
+
+		if (user_conf && (user_conf->nopencl != -1))
+		{
+			explicitval = user_conf->nopencl;
+		}
+		else {
+			explicitval = starpu_get_env_number("STARPU_NOPENCL");
+		}
+
+		if (explicitval < 0) {
+			config->nopenclgpus =
+				STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
+		} else {
+			/* use the specified value */
+			config->nopenclgpus = (unsigned)explicitval;
+			STARPU_ASSERT(config->nopenclgpus <= STARPU_MAXOPENCLDEVS);
+		}
+		STARPU_ASSERT(config->nopenclgpus + config->nworkers <= STARPU_NMAXWORKERS);
+	}
+
+	if (config->nopenclgpus > 0)
+		use_accelerator = 1;
+	// TODO: use_accelerator pour les OpenCL?
+
+	_starpu_initialize_workers_opencl_gpuid(config);
+
+	unsigned openclgpu;
+	for (openclgpu = 0; openclgpu < config->nopenclgpus; openclgpu++)
+	{
+		int devid = _starpu_get_next_opencl_gpuid(config);
+                if (devid == -1) { // There is no more devices left
+                  config->nopenclgpus = openclgpu;
+                  break;
+                }
+		config->workers[config->nworkers + openclgpu].arch = STARPU_OPENCL_WORKER;
+		enum starpu_perf_archtype arch = STARPU_OPENCL_DEFAULT + devid;
+		config->workers[config->nworkers + openclgpu].devid = devid;
+		config->workers[config->nworkers + openclgpu].perf_arch = arch; 
+		config->workers[config->nworkers + openclgpu].worker_mask = STARPU_OPENCL;
+		config->worker_mask |= STARPU_OPENCL;
+	}
+
+	config->nworkers += config->nopenclgpus;
+#endif
 	
 #ifdef STARPU_USE_GORDON
 	if (user_conf && (user_conf->ncuda != -1)) {
@@ -500,13 +638,27 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				if (may_bind_automatically)
 				{
 					/* StarPU is allowed to bind threads automatically */
-					preferred_binding = _starpu_get_gpu_affinity_vector(workerarg->devid);
+					preferred_binding = _starpu_get_cuda_affinity_vector(workerarg->devid);
 					npreferred = config->nhwcpus;
 				}
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
 				break;
 #endif
+
+#ifdef STARPU_USE_OPENCL
+		        case STARPU_OPENCL_WORKER:
+				if (may_bind_automatically)
+				{
+					/* StarPU is allowed to bind threads automatically */
+					preferred_binding = _starpu_get_opencl_affinity_vector(workerarg->devid);
+					npreferred = config->nhwcpus;
+				}
+				is_a_set_of_accelerators = 0;
+				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
+				break;
+#endif
+
 			default:
 				STARPU_ABORT();
 		}

+ 28 - 0
src/core/workers.c

@@ -59,6 +59,11 @@ inline uint32_t _starpu_may_submit_cpu_task(void)
 	return (STARPU_CPU & config.worker_mask);
 }
 
+inline uint32_t _starpu_may_submit_opencl_task(void)
+{
+	return (STARPU_OPENCL & config.worker_mask);
+}
+
 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where)
 {
 	return (where & config.workers[workerid].worker_mask);
@@ -90,6 +95,9 @@ static void _starpu_init_worker_queue(struct starpu_worker_s *workerarg)
 		case STARPU_CUDA_WORKER:
 			jobq->alpha = STARPU_CUDA_ALPHA;
 			break;
+		case STARPU_OPENCL_WORKER:
+			jobq->alpha = STARPU_OPENCL_ALPHA;
+			break;
 		case STARPU_GORDON_WORKER:
 			jobq->alpha = STARPU_GORDON_ALPHA;
 			break;
@@ -151,6 +159,15 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 
 				break;
 #endif
+#ifdef STARPU_USE_OPENCL
+			case STARPU_OPENCL_WORKER:
+				workerarg->set = NULL;
+				workerarg->worker_is_initialized = 0;
+				pthread_create(&workerarg->worker_thread, 
+						NULL, _starpu_opencl_worker, workerarg);
+
+				break;
+#endif
 #ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 				/* we will only launch gordon once, but it will handle 
@@ -192,6 +209,7 @@ static void _starpu_init_workers(struct starpu_machine_config_s *config)
 		switch (workerarg->arch) {
 			case STARPU_CPU_WORKER:
 			case STARPU_CUDA_WORKER:
+			case STARPU_OPENCL_WORKER:			  
 				PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 				while (!workerarg->worker_is_initialized)
 					PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
@@ -523,6 +541,11 @@ unsigned starpu_get_cuda_worker_count(void)
 	return config.ncudagpus;
 }
 
+unsigned starpu_get_opencl_worker_count(void)
+{
+	return config.nopenclgpus;
+}
+
 unsigned starpu_get_spu_worker_count(void)
 {
 	return config.ngordon_spus;
@@ -549,6 +572,11 @@ int starpu_get_worker_id(void)
 	}
 }
 
+int starpu_get_worker_devid(int id)
+{
+	return config.workers[id].devid;
+}
+
 struct starpu_worker_s *_starpu_get_worker_struct(unsigned id)
 {
 	return &config.workers[id];

+ 16 - 3
src/core/workers.h

@@ -42,6 +42,10 @@
 #include <drivers/cuda/driver_cuda.h>
 #endif
 
+#ifdef STARPU_USE_OPENCL
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
 #ifdef STARPU_USE_GORDON
 #include <drivers/gordon/driver_gordon.h>
 #endif
@@ -52,6 +56,7 @@
 
 #define STARPU_CPU_ALPHA	1.0f
 #define STARPU_CUDA_ALPHA	13.33f
+#define STARPU_OPENCL_ALPHA	12.22f
 #define STARPU_GORDON_ALPHA	6.0f /* XXX this is a random value ... */
 
 #ifdef STARPU_DATA_STATS
@@ -105,18 +110,25 @@ struct starpu_machine_config_s {
 #endif
 
 	unsigned nhwcpus;
+        unsigned nhwcudagpus;
+        unsigned nhwopenclgpus;
 
 	unsigned ncpus;
 	unsigned ncudagpus;
+	unsigned nopenclgpus;
 	unsigned ngordon_spus;
 
 	/* Where to bind workers ? */
 	int current_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	
-	/* Which GPU(s) do we use ? */
-	int current_gpuid;
-	unsigned workers_gpuid[STARPU_NMAXWORKERS];
+	/* Which GPU(s) do we use for CUDA ? */
+	int current_cuda_gpuid;
+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
+
+	/* Which GPU(s) do we use for OpenCL ? */
+	int current_opencl_gpuid;
+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 	
 	struct starpu_worker_s workers[STARPU_NMAXWORKERS];
 	uint32_t worker_mask;
@@ -138,6 +150,7 @@ unsigned _starpu_machine_is_running(void);
 inline uint32_t _starpu_worker_exists(uint32_t task_mask);
 inline uint32_t _starpu_may_submit_cuda_task(void);
 inline uint32_t _starpu_may_submit_cpu_task(void);
+inline uint32_t _starpu_may_submit_opencl_task(void);
 inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t where);
 unsigned _starpu_worker_can_block(unsigned memnode);
 

+ 6 - 4
src/datawizard/coherency.c

@@ -25,8 +25,8 @@ uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_n
 	/* in case one of the node is a GPU, it needs to perform the transfer,
 	 * if both of them are GPU, it's a bit more complicated (TODO !) */
 
-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM);
+	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
+	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
 
 	/* we do not handle GPU->GPU transfers yet ! */
 	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
@@ -77,6 +77,8 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 			 * 	other should be ok */
 			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
 				break;
+			if (_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
+				break;
 
 			/* XXX do a better algorithm to distribute the memory copies */
 			/* TODO : use the "requesting_node" as an argument to do so */
@@ -181,8 +183,8 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, uint32_t requesting_no
 			STARPU_ASSERT(src_node != requesting_node);
 		}
 	
-		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM);
-		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM);
+		unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
+		unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(requesting_node) == STARPU_OPENCL_RAM);
 
 		/* we have to perform 2 successive requests for GPU->GPU transfers */
 		if (read && (src_is_a_gpu && dst_is_a_gpu)) {

+ 67 - 0
src/datawizard/copy_driver.c

@@ -135,6 +135,28 @@ cudaStream_t *stream;
 				}
 				break;
 #endif
+#ifdef STARPU_USE_OPENCL
+    		        case STARPU_OPENCL_RAM:
+				/* OpenCL -> RAM */
+				if (_starpu_get_local_memory_node() == src_node)
+				{
+					STARPU_ASSERT(copy_methods->opencl_to_ram);
+					if (!req || !copy_methods->opencl_to_ram_async)
+					{
+						/* this is not associated to a request so it's synchronous */
+                                                copy_methods->opencl_to_ram(handle, src_node, dst_node);
+                                        }
+                                        else {
+                                                ret = copy_methods->opencl_to_ram_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
+                                        }
+				}
+				else
+				{
+					/* we should not have a blocking call ! */
+					STARPU_ABORT();
+				}
+				break;
+#endif
 			case STARPU_SPU_LS:
 				STARPU_ABORT(); // TODO
 				break;
@@ -180,6 +202,34 @@ cudaStream_t *stream;
 		}
 		break;
 #endif
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_RAM:
+		switch (src_kind) {
+		        case STARPU_RAM:
+				/* STARPU_RAM -> STARPU_OPENCL_RAM */
+				STARPU_ASSERT(_starpu_get_local_memory_node() == dst_node);
+				STARPU_ASSERT(copy_methods->ram_to_opencl);
+				if (!req || !copy_methods->ram_to_opencl_async)
+				{
+					/* this is not associated to a request so it's synchronous */
+					copy_methods->ram_to_opencl(handle, src_node, dst_node);
+				}
+				else {
+                                        ret = copy_methods->ram_to_opencl_async(handle, src_node, dst_node, &(req->async_channel.opencl_event));
+				}
+				break;
+			case STARPU_CUDA_RAM:
+			case STARPU_OPENCL_RAM:
+			case STARPU_SPU_LS:
+				STARPU_ABORT(); // TODO 
+				break;
+			case STARPU_UNUSED:
+			default:
+				STARPU_ABORT();
+				break;
+		}
+		break;
+#endif
 	case STARPU_SPU_LS:
 		STARPU_ABORT(); // TODO
 		break;
@@ -275,6 +325,12 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        fprintf(stderr, "not implemented yet\n");
+			STARPU_ABORT();
+                        break;
+#endif
 		case STARPU_RAM:
 		default:
 			STARPU_ABORT();
@@ -301,6 +357,17 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        {
+                                cl_int event_status;
+                                cl_event opencl_event = (*async_channel).opencl_event;
+                                if (opencl_event == NULL) STARPU_ABORT();
+                                clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+                                success = (event_status == CL_COMPLETE);
+                                break;
+                        }
+#endif
 		case STARPU_RAM:
 		default:
 			STARPU_ABORT();

+ 23 - 0
src/datawizard/copy_driver.h

@@ -28,6 +28,10 @@
 #include <cublas.h>
 #endif
 
+#ifdef STARPU_USE_OPENCL
+#include <CL/cl.h>
+#endif
+
 struct starpu_data_request_s;
 
 /* this is a structure that can be queried to see whether an asynchronous
@@ -37,24 +41,36 @@ typedef union {
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t cuda_event;
 #endif
+#ifdef STARPU_USE_OPENCL
+        cl_event opencl_event;
+#endif
 } starpu_async_channel;
 
 struct starpu_copy_data_methods_s {
 	/* src type is ram */
 	int (*ram_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*ram_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*ram_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
 	/* src type is cuda */
 	int (*cuda_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*cuda_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*cuda_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
 	/* src type is spu */
 	int (*spu_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*spu_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
+	/* src type is opencl */
+	int (*opencl_to_ram)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*opencl_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*opencl_to_opencl)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+	int (*opencl_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
+
 #ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
 	int (*ram_to_cuda_async)(starpu_data_handle handle, uint32_t src,
@@ -64,6 +80,13 @@ struct starpu_copy_data_methods_s {
 	int (*cuda_to_cuda_async)(starpu_data_handle handle, uint32_t src,
 					uint32_t dst, cudaStream_t *stream);
 #endif
+
+#ifdef STARPU_USE_OPENCL
+	/* for asynchronous OpenCL transfers */
+        int (*ram_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
+	int (*opencl_to_ram_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
+	int (*opencl_to_opencl_async)(starpu_data_handle handle, uint32_t src, uint32_t dst, cl_event *event);
+#endif
 };
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);

+ 121 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -22,6 +22,11 @@
 #include <datawizard/filters.h>
 #include <common/hash.h>
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
@@ -31,6 +36,10 @@ static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, u
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+#endif
 
 static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -39,6 +48,10 @@ static const struct starpu_copy_data_methods_s bcsr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
@@ -274,6 +287,27 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+                        {
+                                int ret;
+                                void *ptr;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*r*c*elemsize, CL_MEM_READ_WRITE);
+                                addr_nzval = (uintptr_t)ptr;
+                                if (ret) goto fail_nzval;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_colind = ptr;
+				if (ret) goto fail_colind;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_rowptr = ptr;
+				if (ret) goto fail_rowptr;
+
+                                break;
+                        }
+#endif
 		default:
 			assert(0);
 	}
@@ -298,6 +332,11 @@ fail_rowptr:
 			cudaFree((void*)addr_colind);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_colind);
+			break;
+#endif
 		default:
 			assert(0);
 	}
@@ -311,6 +350,11 @@ fail_colind:
 			cudaFree((void*)addr_nzval);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_nzval);
+			break;
+#endif
 		default:
 			assert(0);
 	}
@@ -341,6 +385,13 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)bcsr_interface->rowptr);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)bcsr_interface->nzval);
+			clReleaseMemObject((void*)bcsr_interface->colind);
+			clReleaseMemObject((void*)bcsr_interface->rowptr);
+			break;
+#endif
 		default:
 			assert(0);
 	}
@@ -420,6 +471,76 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 }
 #endif // STARPU_USE_CUDA
 
+#ifdef STARPU_USE_OPENCL
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+        int err;
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_bcsr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+        int err;
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+#endif // STARPU_USE_OPENCL
+
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {

+ 122 - 0
src/datawizard/interfaces/block_interface.c

@@ -22,6 +22,11 @@
 
 #include <common/hash.h>
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
@@ -29,6 +34,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -39,6 +50,12 @@ static const struct starpu_copy_data_methods_s block_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
@@ -94,11 +111,15 @@ static void register_block_handle(starpu_data_handle handle, uint32_t home_node,
 
 		if (node == home_node) {
 			local_interface->ptr = block_interface->ptr;
+                        local_interface->dev_handle = block_interface->dev_handle;
+                        local_interface->offset = block_interface->offset;
 			local_interface->ldy  = block_interface->ldy;
 			local_interface->ldz  = block_interface->ldz;
 		}
 		else {
 			local_interface->ptr = 0;
+                        local_interface->dev_handle = 0;
+                        local_interface->offset = 0;
 			local_interface->ldy  = 0;
 			local_interface->ldz  = 0;
 		}
@@ -117,6 +138,8 @@ void starpu_register_block_data(starpu_data_handle *handleptr, uint32_t home_nod
 {
 	starpu_block_interface_t interface = {
 		.ptr = ptr,
+                .dev_handle = ptr,
+                .offset = 0,
 		.ldy = ldy,
 		.ldz = ldz,
 		.nx = nx,
@@ -278,6 +301,19 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 			assert(0);
 	}
@@ -288,6 +324,8 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 
 		/* update the data properly in consequence */
 		dst_block->ptr = addr;
+                dst_block->dev_handle = addr;
+                dst_block->offset = 0;
 		dst_block->ldy = nx;
 		dst_block->ldz = nx*ny;
 	} else {
@@ -319,6 +357,11 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void *)block_interface->ptr);
+                        break;
+#endif
 		default:
 			assert(0);
 	}
@@ -660,6 +703,85 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 }
 #endif // STARPU_USE_CUDA
 
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                dst_block->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                  src_block->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                                                src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                dst_block->offset, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = starpu_data_get_interface_on_node(handle, src_node);
+	dst_block = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                                                  src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+                                                  src_block->offset, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return 0;
+}
+
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {

+ 114 - 0
src/datawizard/interfaces/csr_interface.c

@@ -22,12 +22,20 @@
 
 #include <common/hash.h>
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+#endif
 
 static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -36,6 +44,10 @@ static const struct starpu_copy_data_methods_s csr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
@@ -246,6 +258,27 @@ static size_t allocate_csr_buffer_on_node(starpu_data_handle handle, uint32_t ds
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
+                                addr_nzval = (uintptr_t)ptr;
+				if (ret) goto fail_nzval;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_colind = ptr;
+				if (ret) goto fail_colind;
+
+                                ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
+                                addr_rowptr = ptr;
+				if (ret) goto fail_rowptr;
+
+				break;
+			}
+#endif
 		default:
 			assert(0);
 	}
@@ -270,6 +303,11 @@ fail_rowptr:
 			cudaFree((void*)addr_colind);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_colind);
+			break;
+#endif
 		default:
 			assert(0);
 	}
@@ -283,6 +321,11 @@ fail_colind:
 			cudaFree((void*)addr_nzval);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)addr_nzval);
+			break;
+#endif
 		default:
 			assert(0);
 	}
@@ -313,6 +356,13 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)csr_interface->rowptr);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			clReleaseMemObject((void*)csr_interface->nzval);
+			clReleaseMemObject((void*)csr_interface->colind);
+			clReleaseMemObject((void*)csr_interface->rowptr);
+			break;
+#endif
 		default:
 			assert(0);
 	}
@@ -386,6 +436,70 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 }
 #endif // STARPU_USE_CUDA
 
+#ifdef STARPU_USE_OPENCL
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_csr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+        int err;
+
+        err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+        err = _starpu_opencl_copy_from_opencl((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = starpu_data_get_interface_on_node(handle, src_node);
+	dst_csr = starpu_data_get_interface_on_node(handle, dst_node);
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+        int err;
+
+        err = _starpu_opencl_copy_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = _starpu_opencl_copy_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+        err = _starpu_opencl_copy_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+}
+#endif // STARPU_USE_OPENCL
+
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {

+ 4 - 0
src/datawizard/interfaces/matrix_filters.c

@@ -69,6 +69,8 @@ void starpu_block_filter_func(starpu_filter *f, starpu_data_handle root_handle)
 
 				local->ptr = local_root->ptr + offset;
 				local->ld = local_root->ld;
+                                local->dev_handle = local_root->dev_handle;
+                                local->offset = local_root->offset + offset;
 			}
 		}
 	}
@@ -122,6 +124,8 @@ void starpu_vertical_block_filter_func(starpu_filter *f, starpu_data_handle root
 					(size_t)chunk*chunk_size*local_root->ld*elemsize;
 				local->ptr = local_root->ptr + offset;
 				local->ld = local_root->ld;
+                                local->dev_handle = local_root->dev_handle;
+                                local->offset = local_root->offset + offset;
 			}
 		}
 	}

+ 120 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -26,6 +26,10 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
@@ -106,10 +122,14 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 
 		if (node == home_node) {
 			local_interface->ptr = matrix_interface->ptr;
+                        local_interface->dev_handle = matrix_interface->dev_handle;
+                        local_interface->offset = matrix_interface->offset;
 			local_interface->ld  = matrix_interface->ld;
 		}
 		else {
 			local_interface->ptr = 0;
+			local_interface->dev_handle = 0;
+			local_interface->offset = 0;
 			local_interface->ld  = 0;
 		}
 
@@ -129,7 +149,9 @@ void starpu_register_matrix_data(starpu_data_handle *handleptr, uint32_t home_no
 		.ld = ld,
 		.nx = nx,
 		.ny = ny,
-		.elemsize = elemsize
+		.elemsize = elemsize,
+                .dev_handle = ptr,
+                .offset = 0
 	};
 
 	_starpu_register_data_handle(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
@@ -256,6 +278,19 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 			assert(0);
 	}
@@ -266,6 +301,8 @@ static size_t allocate_matrix_buffer_on_node(starpu_data_handle handle, uint32_t
 
 		/* update the data properly in consequence */
 		interface->ptr = addr;
+                interface->dev_handle = addr;
+                interface->offset = 0;
 		interface->ld = ld;
 	} else {
 		/* allocation failed */
@@ -296,6 +333,11 @@ static void liberate_matrix_buffer_on_node(void *interface, uint32_t node)
 
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void *)matrix_interface->ptr);
+                        break;
+#endif
 		default:
 			assert(0);
 	}
@@ -423,6 +465,83 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
 #endif // STARPU_USE_CUDA
 
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle, src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                dst_matrix->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+                                                  src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                  src_matrix->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle, src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                dst_matrix->offset, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_matrix_interface_t *src_matrix;
+	starpu_matrix_interface_t *dst_matrix;
+
+	src_matrix = starpu_data_get_interface_on_node(handle, src_node);
+	dst_matrix = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+                                                  src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+                                                  src_matrix->offset, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+
+	return 0;
+}
+
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {

+ 109 - 0
src/datawizard/interfaces/variable_interface.c

@@ -26,6 +26,10 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
@@ -34,6 +38,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -44,6 +54,12 @@ static const struct starpu_copy_data_methods_s variable_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
@@ -192,6 +208,19 @@ static size_t allocate_variable_buffer_on_node(starpu_data_handle handle, uint32
 			}
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 			assert(0);
 	}
@@ -220,6 +249,11 @@ static void liberate_variable_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)STARPU_GET_VARIABLE_PTR(interface));
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void*)STARPU_GET_VARIABLE_PTR(interface));
+                        break;
+#endif
 		default:
 			assert(0);
 	}
@@ -324,6 +358,81 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
 #endif // STARPU_USE_CUDA
 
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+                                                0, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+                                                  0, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+                                                0, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_variable_interface_t *src_variable;
+	starpu_variable_interface_t *dst_variable;
+
+	src_variable = starpu_data_get_interface_on_node(handle, src_node);
+	dst_variable = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+                                                  0, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+	return 0;
+}
+
+#endif
+
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_variable_interface_t *src_variable;

+ 8 - 0
src/datawizard/interfaces/vector_filters.c

@@ -62,6 +62,8 @@ void starpu_block_filter_func_vector(starpu_filter *f, starpu_data_handle root_h
 					starpu_data_get_interface_on_node(root_handle, node);
 
 				local->ptr = local_root->ptr + offset;
+                                local->dev_handle = local_root->dev_handle;
+                                local->offset = local_root->offset + offset;
 			}
 		}
 	}
@@ -100,6 +102,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 				starpu_data_get_interface_on_node(root_handle, node);
 
 			local->ptr = local_root->ptr;
+                        local->offset = local_root->offset;
+                        local->dev_handle = local_root->dev_handle;
 		}
 	}
 
@@ -119,6 +123,8 @@ void starpu_divide_in_2_filter_func_vector(starpu_filter *f, starpu_data_handle
 				starpu_data_get_interface_on_node(root_handle, node);
 
 			local->ptr = local_root->ptr + length_first*elemsize;
+                        local->offset = local_root->offset + length_first*elemsize;
+                        local->dev_handle = local_root->dev_handle;
 		}
 	}
 }
@@ -161,6 +167,8 @@ void starpu_list_filter_func_vector(starpu_filter *f, starpu_data_handle root_ha
 					starpu_data_get_interface_on_node(root_handle, node);
 
 				local->ptr = local_root->ptr + current_pos*elemsize;
+                                local->offset = local_root->offset + current_pos*elemsize;
+                                local->dev_handle = local_root->dev_handle;
 			}
 		}
 

+ 118 - 2
src/datawizard/interfaces/vector_interface.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,10 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#include <drivers/opencl/driver_opencl.h>
+#endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef STARPU_USE_CUDA
@@ -33,6 +37,12 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event);
+#endif
 
 static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
@@ -43,6 +53,12 @@ static const struct starpu_copy_data_methods_s vector_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl = copy_ram_to_opencl,
+	.opencl_to_ram = copy_opencl_to_ram,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
@@ -87,9 +103,13 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 
 		if (node == home_node) {
 			local_interface->ptr = vector_interface->ptr;
+                        local_interface->dev_handle = vector_interface->dev_handle;
+                        local_interface->offset = vector_interface->offset;
 		}
 		else {
 			local_interface->ptr = 0;
+                        local_interface->dev_handle = 0;
+                        local_interface->offset = 0;
 		}
 
 		local_interface->nx = vector_interface->nx;
@@ -116,7 +136,9 @@ void starpu_register_vector_data(starpu_data_handle *handleptr, uint32_t home_no
 	starpu_vector_interface_t vector = {
 		.ptr = ptr,
 		.nx = nx,
-		.elemsize = elemsize
+		.elemsize = elemsize,
+                .dev_handle = ptr,
+                .offset = 0
 	};	
 
 	_starpu_register_data_handle(handleptr, home_node, &vector, &interface_vector_ops); 
@@ -216,6 +238,19 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 			}
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+	        case STARPU_OPENCL_RAM:
+			{
+                                int ret;
+                                void *ptr;
+                                ret = _starpu_opencl_allocate_memory(&ptr, nx*elemsize, CL_MEM_READ_WRITE);
+                                addr = (uintptr_t)ptr;
+				if (ret) {
+					fail = 1;
+				}
+				break;
+			}
+#endif
 		default:
 			assert(0);
 	}
@@ -228,6 +263,8 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
 	/* update the data properly in consequence */
 	interface->ptr = addr;
+        interface->dev_handle = addr;
+        interface->offset = 0;
 	
 	return allocated_memory;
 }
@@ -246,6 +283,11 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 			cudaFree((void*)vector_interface->ptr);
 			break;
 #endif
+#ifdef STARPU_USE_OPENCL
+                case STARPU_OPENCL_RAM:
+                        clReleaseMemObject((void *)vector_interface->ptr);
+                        break;
+#endif
 		default:
 			assert(0);
 	}
@@ -349,6 +391,80 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 
 
 #endif // STARPU_USE_CUDA
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
+                                                dst_vector->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_opencl_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cl_event *event) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+                                                  src_vector->offset, event);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return EAGAIN;
+}
+
+static int copy_ram_to_opencl(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_to_opencl((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle, src_vector->nx*src_vector->elemsize,
+                                                dst_vector->offset, NULL);
+
+	if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return 0;
+}
+
+static int copy_opencl_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node) {
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = starpu_data_get_interface_on_node(handle, src_node);
+	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
+
+	int err = _starpu_opencl_copy_from_opencl((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+                                                  src_vector->offset, NULL);
+
+        if (STARPU_UNLIKELY(err))
+                STARPU_OPENCL_REPORT_ERROR(err);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return 0;
+}
+
+#endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {

+ 2 - 1
src/datawizard/memory_nodes.h

@@ -26,7 +26,8 @@ typedef enum {
 	STARPU_UNUSED,
 	STARPU_SPU_LS,
 	STARPU_RAM,
-	STARPU_CUDA_RAM
+	STARPU_CUDA_RAM,
+        STARPU_OPENCL_RAM,
 } starpu_node_kind;
 
 typedef struct {

+ 422 - 0
src/drivers/opencl/driver_opencl.c

@@ -0,0 +1,422 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <common/config.h>
+#include <common/utils.h>
+#include <core/debug.h>
+#include <starpu_opencl.h>
+#include "driver_opencl.h"
+#include "driver_opencl_utils.h"
+#include <common/utils.h>
+
+static cl_context contexts[STARPU_MAXOPENCLDEVS];
+static cl_device_id devices[STARPU_MAXOPENCLDEVS];
+static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
+static cl_uint nb_devices = -1;
+static int init_done = 0;
+extern char *_starpu_opencl_codelet_dir;
+
+void starpu_opencl_get_context(int devid, cl_context *context)
+{
+        *context = contexts[devid];
+}
+
+void starpu_opencl_get_device(int devid, cl_device_id *device)
+{
+        *device = devices[devid];
+}
+
+void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
+{
+        *queue = queues[devid];
+}
+
+int _starpu_opencl_init_context(int devid)
+{
+	cl_int err;
+        cl_device_id device;
+
+        _STARPU_OPENCL_DEBUG("Initialising context for dev %d\n", devid);
+
+        // Create a compute context
+        device = devices[devid];
+        contexts[devid] = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        // Create queue for the given device
+        queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        _starpu_opencl_init_programs(devid);
+
+	return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_deinit_context(int devid)
+{
+        int err;
+
+        _STARPU_OPENCL_DEBUG("De-initialising context for dev %d\n", devid);
+
+        err = clReleaseContext(contexts[devid]);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        err = clReleaseCommandQueue(queues[devid]);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        _starpu_opencl_release_programs(devid);
+
+        return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags)
+{
+	cl_int err;
+        cl_mem address;
+        struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+
+	address = clCreateBuffer(contexts[worker->devid], flags, size, NULL, &err);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        *addr = address;
+        return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
+{
+      int err;
+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+
+      if (event == NULL) {
+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
+      }
+      else {
+              err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
+      }
+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+      return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
+{
+      int err;
+      struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+
+      if (event == NULL) {
+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
+      }
+      else {
+              err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
+      }
+      if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+      return EXIT_SUCCESS;
+}
+
+void _starpu_opencl_init()
+{
+   if (!init_done) {
+           cl_platform_id platform_id[STARPU_OPENCL_PLATFORM_MAX];
+           cl_uint nb_platforms;
+           cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+           cl_int err;
+
+           _STARPU_OPENCL_DEBUG("Initialising OpenCL\n");
+
+           // Get Platforms
+           err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
+           if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+           _STARPU_OPENCL_DEBUG("Platforms detected: %d\n", nb_platforms);
+
+           // Get devices
+           nb_devices = 0;
+           {
+                   unsigned int i;
+                   for (i=0; i<nb_platforms; i++) {
+                           cl_uint num;
+
+#ifdef STARPU_VERBOSE
+                           {
+                                   char name[1024], vendor[1024];
+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
+                                   clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
+                                   _STARPU_OPENCL_DEBUG("Platform: %s - %s\n", name, vendor);
+                           }
+#endif
+                           err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
+                           if (err == CL_DEVICE_NOT_FOUND) {
+                                   _STARPU_OPENCL_DEBUG("  No devices detected on this platform\n");
+                           }
+                           else {
+                                   if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                                   _STARPU_OPENCL_DEBUG("  %d devices detected\n", num);
+                                   nb_devices += num;
+                           }
+         }
+      }
+
+      // Get location of OpenCl codelet source files
+      _starpu_opencl_codelet_dir = getenv("STARPU_OPENCL_CODELET_DIR");
+
+      init_done=1;
+   }
+}
+
+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args);
+
+void *_starpu_opencl_worker(void *arg)
+{
+	struct starpu_worker_s* args = arg;
+
+	int devid = args->devid;
+	unsigned memory_node = args->memory_node;
+
+#ifdef USE_FXT
+	fxt_register_thread(args->bindid);
+#endif
+
+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+
+	_starpu_set_local_memory_node_key(&(args->memory_node));
+
+	_starpu_set_local_queue(args->jobq);
+
+	_starpu_set_local_worker_key(args);
+
+	/* this is only useful (and meaningful) is there is a single
+	   memory node "related" to that queue */
+	args->jobq->memory_node = memory_node;
+
+	args->jobq->total_computation_time = 0.0;
+	args->jobq->total_communication_time = 0.0;
+	args->jobq->total_computation_time_error = 0.0;
+	args->jobq->total_job_performed = 0;
+
+	_starpu_opencl_init_context(devid);
+
+	/* one more time to avoid hacks from third party lib :) */
+	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+
+	args->status = STATUS_UNKNOWN;
+
+	/* get the device's name */
+	char devname[128];
+	_starpu_opencl_get_device_name(devid, devname, 128);
+	snprintf(args->name, 32, "OpenCL %d (%s)", args->devid, devname);
+
+	_STARPU_OPENCL_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+
+	STARPU_TRACE_WORKER_INIT_END
+
+	/* tell the main thread that this one is ready */
+	PTHREAD_MUTEX_LOCK(&args->mutex);
+	args->worker_is_initialized = 1;
+	PTHREAD_COND_SIGNAL(&args->ready_cond);
+	PTHREAD_MUTEX_UNLOCK(&args->mutex);
+
+	struct starpu_job_s * j;
+	int res;
+
+	struct starpu_sched_policy_s *policy = _starpu_get_sched_policy();
+	struct starpu_jobq_s *queue = policy->starpu_get_local_queue(policy);
+	unsigned memnode = args->memory_node;
+
+	while (_starpu_machine_is_running())
+	{
+		STARPU_TRACE_START_PROGRESS(memnode);
+		_starpu_datawizard_progress(memnode, 1);
+		STARPU_TRACE_END_PROGRESS(memnode);
+
+		_starpu_execute_registered_progression_hooks();
+
+		_starpu_jobq_lock(queue);
+
+		/* perhaps there is some local task to be executed first */
+		j = _starpu_pop_local_task(args);
+
+		/* otherwise ask a task to the scheduler */
+		if (!j)
+			j = _starpu_pop_task();
+
+		if (j == NULL) {
+			if (_starpu_worker_can_block(memnode))
+				PTHREAD_COND_WAIT(&queue->activity_cond, &queue->activity_mutex);
+			_starpu_jobq_unlock(queue);
+			continue;
+		}
+
+		_starpu_jobq_unlock(queue);
+
+		/* can OpenCL do that task ? */
+		if (!STARPU_OPENCL_MAY_PERFORM(j))
+		{
+			/* this is not a OpenCL task */
+			_starpu_push_task(j);
+			continue;
+		}
+
+		_starpu_set_current_task(j->task);
+
+		res = _starpu_opencl_execute_job(j, args);
+
+		_starpu_set_current_task(NULL);
+
+                if (res) {
+			switch (res) {
+				case -EAGAIN:
+					fprintf(stderr, "ouch, put the codelet %p back ... \n", j);
+					_starpu_push_task(j);
+					STARPU_ABORT();
+					continue;
+				default:
+					assert(0);
+			}
+		}
+
+		_starpu_handle_job_termination(j);
+	}
+
+	STARPU_TRACE_WORKER_DEINIT_START
+
+          _starpu_opencl_deinit_context(devid);
+
+#ifdef DATA_STATS
+	fprintf(stderr, "OpenCL #%d computation %le comm %le (%lf \%%)\n", args->id, args->jobq->total_computation_time, args->jobq->total_communication_time, args->jobq->total_communication_time*100.0/args->jobq->total_computation_time);
+#endif
+
+#ifdef STARPU_VERBOSE
+	double ratio = 0;
+	if (args->jobq->total_job_performed != 0)
+	{
+		ratio = args->jobq->total_computation_time_error/args->jobq->total_computation_time;
+	}
+
+
+	_starpu_print_to_logfile("MODEL ERROR: OpenCL %d ERROR %lf EXEC %lf RATIO %lf NTASKS %d\n", args->devid, args->jobq->total_computation_time_error, args->jobq->total_computation_time, ratio, args->jobq->total_job_performed);
+#endif
+
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
+{
+	int err;
+
+        if (!init_done) {
+                _starpu_opencl_init();
+        }
+
+	// Get device name
+	err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	_STARPU_OPENCL_DEBUG("Device %d : [%s]\n", dev, name);
+	return EXIT_SUCCESS;
+}
+
+unsigned _starpu_opencl_get_device_count(void)
+{
+        if (!init_done) {
+                _starpu_opencl_init();
+        }
+	return nb_devices;
+}
+
+static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *args)
+{
+	int ret;
+//	uint32_t mask = (1<<0);
+	uint32_t mask = 0;
+
+	STARPU_ASSERT(j);
+	struct starpu_task *task = j->task;
+
+	starpu_tick_t codelet_start, codelet_end;
+	starpu_tick_t codelet_start_comm, codelet_end_comm;
+
+	unsigned calibrate_model = 0;
+
+	STARPU_ASSERT(task);
+	struct starpu_codelet_t *cl = task->cl;
+	STARPU_ASSERT(cl);
+
+	if (cl->model && cl->model->benchmarking)
+		calibrate_model = 1;
+
+	/* we do not take communication into account when modeling the performance */
+	if (STARPU_BENCHMARK_COMM)
+	{
+                //barrier(CLK_GLOBAL_MEM_FENCE);
+		STARPU_GET_TICK(codelet_start_comm);
+	}
+
+	ret = _starpu_fetch_task_input(task, mask);
+	if (ret != 0) {
+		/* there was not enough memory, so the input of
+		 * the codelet cannot be fetched ... put the
+		 * codelet back, and try it later */
+		return -EAGAIN;
+	}
+
+	if (calibrate_model || STARPU_BENCHMARK_COMM)
+	{
+                //barrier(CLK_GLOBAL_MEM_FENCE);
+		STARPU_GET_TICK(codelet_end_comm);
+	}
+
+	STARPU_TRACE_START_CODELET_BODY(j);
+
+	args->status = STATUS_EXECUTING;
+	cl_func func = cl->opencl_func;
+	STARPU_ASSERT(func);
+	STARPU_GET_TICK(codelet_start);
+	func(task->interface, task->cl_arg);
+
+	cl->per_worker_stats[args->workerid]++;
+
+	STARPU_GET_TICK(codelet_end);
+
+	args->status = STATUS_UNKNOWN;
+
+	STARPU_TRACE_END_CODELET_BODY(j);
+
+	if (calibrate_model || STARPU_BENCHMARK_COMM)
+	{
+		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
+		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
+
+		args->jobq->total_computation_time += measured;
+		args->jobq->total_communication_time += measured_comm;
+
+		double error;
+		error = fabs(STARPU_MAX(measured, 0.0) - STARPU_MAX(j->predicted, 0.0));
+		args->jobq->total_computation_time_error += error;
+
+		if (calibrate_model)
+			_starpu_update_perfmodel_history(j, args->perf_arch, (unsigned)args->devid, measured);
+	}
+
+	args->jobq->total_job_performed++;
+
+	_starpu_push_task_output(task, mask);
+
+	return EXIT_SUCCESS;
+}

+ 57 - 0
src/drivers/opencl/driver_opencl.h

@@ -0,0 +1,57 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_OPENCL_H__
+#define __DRIVER_OPENCL_H__
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <CL/cl.h>
+
+extern
+int _starpu_opencl_init_context(int devid);
+
+extern
+int _starpu_opencl_deinit_context(int devid);
+
+extern
+unsigned _starpu_opencl_get_device_count(void);
+
+extern
+int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
+
+extern
+int _starpu_opencl_copy_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
+
+extern
+int _starpu_opencl_copy_from_opencl(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
+
+extern
+void _starpu_opencl_init(void);
+
+extern
+void *_starpu_opencl_worker(void *);
+
+extern
+int _starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue,
+                               char *program_name, char *kernel_name, int dev);
+
+extern
+int _starpu_opencl_compile_source_to_opencl(char *source_file_name);
+
+#endif //  __DRIVER_OPENCL_H__

+ 430 - 0
src/drivers/opencl/driver_opencl_utils.c

@@ -0,0 +1,430 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <starpu_opencl.h>
+#include <common/list.h>
+#include <common/htable32.h>
+#include <core/workers.h>
+#include "driver_opencl_utils.h"
+#include "driver_opencl.h"
+
+#define CRC32C_POLY_BE 0x1EDC6F41
+
+static
+inline uint32_t __attribute__ ((pure)) crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
+{
+	unsigned i;
+	uint32_t crc;
+
+	crc = inputcrc ^ (inputbyte << 24);
+	for (i = 0; i < 8; i++)
+		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE : 0);
+
+	return crc;
+}
+
+static
+uint32_t crc32_string(char *str)
+{
+	uint32_t hash = 0;
+
+	size_t len = strlen(str);
+
+	unsigned i;
+	for (i = 0; i < len; i++)
+	{
+		hash = crc32_be_8((uint8_t)str[i], hash);
+	}
+
+	return hash;
+}
+
+static
+cl_uint _starpu_opencl_device_uniqueid(cl_device_id id)
+{
+	char name[1024];
+	cl_int  err;
+
+	err = clGetDeviceInfo(id, CL_DEVICE_NAME, 1024, name, NULL);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	//  fprintf(stderr, "name %s\n", name);
+
+	return crc32_string(name);
+}
+
+char *_starpu_opencl_codelet_dir;
+
+static
+int _starpu_opencl_locate_file(char *source_file_name, char *located_file_name) {
+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", source_file_name);
+        if (access(source_file_name, R_OK) == 0) {
+                strcpy(located_file_name, source_file_name);
+                return EXIT_SUCCESS;
+        }
+        if (_starpu_opencl_codelet_dir) {
+                sprintf(located_file_name, "%s/%s", _starpu_opencl_codelet_dir, source_file_name);
+                _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
+                if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
+        }
+        sprintf(located_file_name, "%s/%s", STARPU_OPENCL_DATADIR, source_file_name);
+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
+        sprintf(located_file_name, "%s/%s", STARPU_SRC_DIR, source_file_name);
+        _STARPU_OPENCL_DEBUG("Trying to locate <%s>\n", located_file_name);
+        if (access(located_file_name, R_OK) == 0) return EXIT_SUCCESS;
+
+        strcpy(located_file_name, "");
+        OPENCL_ERROR("Cannot locate file <%s>\n", source_file_name);
+        return EXIT_FAILURE;
+}
+
+static
+unsigned char *_starpu_opencl_load_program_binary(char *filename, size_t *len)
+{
+	struct stat statbuf;
+	FILE        *fh;
+	unsigned char        *binary;
+
+	fh = fopen(filename, "r");
+	if (fh == 0)
+		return EXIT_SUCCESS;
+
+	stat(filename, &statbuf);
+
+	binary = (unsigned char *) malloc(statbuf.st_size);
+	if(!binary)
+		return binary;
+
+	fread(binary, statbuf.st_size, 1, fh);
+
+	*len = statbuf.st_size;
+	return binary;
+}
+
+static
+cl_int _starpu_opencl_load_program(cl_context context, char *program_name, cl_device_id device, cl_program *program)
+{
+        //	cl_program     program;
+        const unsigned char *binary;
+	size_t         len;
+	cl_int         err;
+	cl_int         status;
+        cl_device_type type;
+
+	cl_uint uniqueid;
+	char     located_program_name[1024];
+	char     binary_file_name[1024];
+	char    *p;
+
+        // locate file
+        _starpu_opencl_locate_file(program_name, located_program_name);
+
+        // Get type of device
+        err = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
+	if (err != CL_SUCCESS) {
+                STARPU_OPENCL_REPORT_ERROR(err);
+                return err;
+        }
+
+        // Get the name of the binary file
+	uniqueid = _starpu_opencl_device_uniqueid(device);
+	strcpy(binary_file_name, located_program_name);
+	p = strstr(binary_file_name, ".cl");
+	if(p == NULL) OPENCL_ERROR("Program file name doesn't have the '.cl' extension!\n");
+        strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
+	sprintf(p + strlen(p), "%u", uniqueid);
+
+        // Load the binary file
+	binary = _starpu_opencl_load_program_binary(binary_file_name, &len);
+	if(binary == NULL)
+		OPENCL_ERROR("Cannot load binary file %s\n", binary_file_name);
+
+	//_STARPU_OPENCL_DEBUG("[%s] binary file loaded.\n", binary_file_name);
+	*program = clCreateProgramWithBinary(context, 1, &device, &len, &binary, &status, &err);
+	if (err != CL_SUCCESS) {
+                STARPU_OPENCL_REPORT_ERROR(err);
+                return err;
+        }
+
+	// Build the program executable
+	err = clBuildProgram(*program, 0, NULL, NULL, NULL, NULL);
+	if (err != CL_SUCCESS) {
+		size_t len;
+		char buffer[2048];
+
+		fprintf(stderr, "Error: Failed to build program executable!\n");
+		clGetProgramBuildInfo(*program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+		fprintf(stderr, "%s\n", buffer);
+                return err;
+	}
+
+	return CL_SUCCESS;
+}
+
+static struct starpu_htbl32_node_s *history_program_hash[STARPU_MAXOPENCLDEVS] = {NULL};
+LIST_TYPE(program,
+          char *program_name;
+          cl_program program;
+          );
+program_list_t history_program_list[STARPU_MAXOPENCLDEVS];
+
+int _starpu_opencl_init_programs(int dev)
+{
+        history_program_list[dev] = program_list_new();
+        return CL_SUCCESS;
+}
+
+int _starpu_opencl_release_programs(int dev)
+{
+        while (!program_list_empty(history_program_list[dev])) {
+                program_t pp = program_list_pop_front(history_program_list[dev]);
+                _STARPU_OPENCL_DEBUG("Releasing program=<%s> on dev=<%d>\n", pp->program_name, dev);
+                clReleaseProgram(pp->program);
+        }
+        program_list_delete(history_program_list[dev]);
+        return CL_SUCCESS;
+}
+
+int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, char *program_name, char *kernel_name, int devid)
+{
+        int err;
+	cl_device_id device;
+        cl_context context;
+        uint32_t key;
+        cl_program program;
+
+        starpu_opencl_get_device(devid, &device);
+        starpu_opencl_get_context(devid, &context);
+        starpu_opencl_get_queue(devid, queue);
+
+        key = crc32_string(program_name);
+        program = _starpu_htbl_search_32(history_program_hash[devid], key);
+        if (!program) {
+                err = _starpu_opencl_load_program(context, program_name, device, &program);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                _starpu_htbl_insert_32(&(history_program_hash[devid]), key, program);
+                program_t pp = program_new();
+                pp->program_name = program_name;
+                pp->program = program;
+                program_list_push_front(history_program_list[devid], pp);
+        }
+
+        // Create the compute kernel in the program we wish to run
+        *kernel = clCreateKernel(program, kernel_name, &err);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	return CL_SUCCESS;
+}
+
+int starpu_opencl_release(cl_kernel kernel) {
+	cl_int err;
+
+	err = clReleaseKernel(kernel);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        return CL_SUCCESS;
+}
+
+static
+char *_starpu_opencl_load_program_source(const char *filename)
+{
+        struct stat statbuf;
+        FILE        *fh;
+        char        *source;
+
+        fh = fopen(filename, "r");
+        if (fh == 0)
+                return EXIT_SUCCESS;
+
+        stat(filename, &statbuf);
+        source = (char *) malloc(statbuf.st_size + 1);
+        fread(source, statbuf.st_size, 1, fh);
+        source[statbuf.st_size] = '\0';
+
+        fclose(fh);
+
+        return source;
+}
+
+static
+int _starpu_opencl_store_program_binary(const char *filename, const char *binary, size_t len)
+{
+        FILE *fh;
+
+        fh = fopen(filename, "w");
+        if(fh == NULL) {
+                perror("fopen"); return EXIT_FAILURE;
+        }
+
+        fwrite(binary, len, 1, fh);
+        fclose(fh);
+
+        return EXIT_SUCCESS;
+}
+
+int _starpu_opencl_compile_source_to_opencl(char *source_file_name)
+{
+        int              err;
+        int              device_type = CL_DEVICE_TYPE_ALL;
+        cl_device_id     devices[STARPU_MAXOPENCLDEVS];
+        unsigned         max = STARPU_MAXOPENCLDEVS;
+        unsigned         nb_devices = 0;
+        cl_uint          history[STARPU_MAXOPENCLDEVS]; // To track similar devices
+        char             preproc_file_name[1024];
+        char             located_file_name[1024];
+        cl_platform_id   platform_ids[STARPU_OPENCL_PLATFORM_MAX];
+        cl_uint          platform, nb_platforms;
+
+        // Locate source file
+        _starpu_opencl_locate_file(source_file_name, located_file_name);
+        _STARPU_OPENCL_DEBUG("Source file name : <%s>\n", located_file_name);
+
+        // Prepare preprocessor temporary filename
+        {
+                char *p;
+                strcpy(preproc_file_name, located_file_name);
+                p = strstr(preproc_file_name, ".cl");
+                if(p == NULL)
+                        OPENCL_ERROR("Kernel file name doesn't have the '.cl' extension!\n");
+                strcpy(p, ".pre");
+        }
+
+        // Get Platforms
+        err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_ids, &nb_platforms);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        // Iterate over each platform
+        for(platform=0; platform<nb_platforms; platform++) {
+                // Get devices
+                err = clGetDeviceIDs(platform_ids[platform], device_type, max, devices, &nb_devices);
+                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                if(nb_devices > max)
+                        nb_devices = max;
+
+                // Iterate over each device
+                unsigned int dev;
+                for(dev = 0; dev < nb_devices; dev ++) {
+                        cl_context       context;
+                        cl_program       program;
+                        cl_device_type   type;
+                        cl_int           err;
+                        cl_uint          uniqueid;
+
+                        uniqueid =_starpu_opencl_device_uniqueid(devices[dev]);
+                        // Look up and update history (to avoid unuseful compilations in the case of identical devices)
+                        {
+                                unsigned int d;
+                                for(d = 0; d < dev; d++)
+                                        if(history[d] == uniqueid)
+                                                break; // Just skip compiling for this device
+                                if(d != dev)
+                                        continue;
+                                history[dev] = uniqueid;
+                        }
+
+                        err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
+                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                        // Create a compute context
+                        context = clCreateContext(0, 1, devices + dev, NULL, NULL, &err);
+                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                        // Run C preprocessor
+                        {
+                                pid_t pid;
+                                pid = fork();
+                                if(pid == 0) {
+                                        execlp("cpp", "cpp", located_file_name, "-o", preproc_file_name, NULL);
+                                        perror("execlp");
+                                        exit(EXIT_FAILURE);
+                                }
+                                else {
+                                        int status;
+                                        waitpid(pid, &status, 0);
+                                        if (WEXITSTATUS(status) != EXIT_SUCCESS)
+                                                OPENCL_ERROR("Cannot preprocess file [%s]\n", located_file_name);
+                                }
+                        }
+
+                        // Load the compute program from disk into a cstring buffer
+                        char *source = _starpu_opencl_load_program_source(preproc_file_name);
+                        if(!source)
+                                OPENCL_ERROR("Failed to load compute program from file <%s>!\n", preproc_file_name);
+
+                        // Create the compute program from the source buffer
+                        program = clCreateProgramWithSource(context, 1, (const char **) & source, NULL, &err);
+                        if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                        // Build the program executable
+                        err = clBuildProgram(program, 1, devices + dev, "-Werror -cl-mad-enable", NULL, NULL);
+                        if (err != CL_SUCCESS) {
+                                size_t len;
+                                static char buffer[4096];
+
+                                fprintf(stderr, "Error: Failed to build program executable!\n");
+                                clGetProgramBuildInfo(program, devices[dev], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
+
+                                fprintf(stderr, "%s\n", buffer);
+                                return EXIT_FAILURE;
+                        }
+
+                        // Store program binary
+                        {
+                                char     binary_file_name[1024];
+                                char    *binary;
+                                size_t   binary_len;
+                                char    *p;
+
+                                strcpy(binary_file_name, located_file_name);
+                                p = strstr(binary_file_name, ".cl");
+                                if(p == NULL)
+                                        OPENCL_ERROR("Input file name doesn't have the '.cl' extension!\n");
+
+                                strcpy(p, (type == CL_DEVICE_TYPE_GPU) ? ".gpu." : ".cpu.");
+                                sprintf(p + strlen(p), "%u", uniqueid);
+
+                                err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
+                                if(err != CL_SUCCESS)
+                                        OPENCL_ERROR("Cannot get program binary size (err = %d)!\n", err);
+
+                                binary = malloc(binary_len);
+
+                                err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
+                                if(err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+                                _starpu_opencl_store_program_binary(binary_file_name, binary, binary_len);
+
+                                free(binary);
+
+                                _STARPU_OPENCL_DEBUG("Binary file [%s] successfully built (%ld bytes).\n", binary_file_name, binary_len);
+                        }
+
+                        clReleaseProgram(program);
+                        clReleaseContext(context);
+                }
+        }
+
+        return EXIT_SUCCESS;
+}

+ 41 - 0
src/drivers/opencl/driver_opencl_utils.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_OPENCL_UTILS_H__
+#define __STARPU_OPENCL_UTILS_H__
+
+#include <config.h>
+
+#ifdef STARPU_VERBOSE
+#  define _STARPU_OPENCL_DEBUG(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
+#else
+#  define _STARPU_OPENCL_DEBUG(fmt, args ...)
+#endif
+
+#define _STARPU_OPENCL_DISP(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
+
+#define OPENCL_ERROR(fmt, args ...)                                                   \
+	do {                                                                          \
+                fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args); \
+		assert(0);                                                            \
+	} while (0)
+
+#define STARPU_OPENCL_PLATFORM_MAX 4
+
+int _starpu_opencl_init_programs(int dev);
+int _starpu_opencl_release_programs(int dev);
+
+#endif /* __STARPU_OPENCL_UTILS_H__ */

+ 1 - 0
src/util/execute_on_all.c

@@ -39,6 +39,7 @@ void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t whe
 		.where = where,
 		.cuda_func = wrapper_func,
 		.cpu_func = wrapper_func,
+		.opencl_func = wrapper_func,
 		/* XXX we do not handle Cell .. */
 		.nbuffers = 0,
 		.model = NULL

+ 93 - 16
src/util/malloc.c

@@ -24,13 +24,28 @@
 #include <cuda.h>
 #endif
 
-#ifdef STARPU_USE_CUDA
+#ifdef STARPU_USE_OPENCL
+#include <drivers/opencl/driver_opencl.h>
+#endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 struct malloc_pinned_codelet_struct {
 	void **ptr;
 	size_t dim;
 };
+#endif
+
+//#ifdef STARPU_USE_OPENCL
+//static void malloc_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
+//{
+//	struct malloc_pinned_codelet_struct *s = arg;
+//        //        *(s->ptr) = malloc(s->dim);
+//        _starpu_opencl_allocate_memory((void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
+//}
+//#endif
 
-static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
+#ifdef STARPU_USE_CUDA
+static void malloc_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
 {
 	struct malloc_pinned_codelet_struct *s = arg;
 
@@ -39,10 +54,16 @@ static void malloc_pinned_codelet(void *buffers[] __attribute__((unused)), void
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
+#endif
 
+#if defined(STARPU_USE_CUDA)// || defined(STARPU_USE_OPENCL)
 static starpu_codelet malloc_pinned_cl = {
-	.where = STARPU_CUDA,
-	.cuda_func = malloc_pinned_codelet,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = malloc_pinned_cuda_codelet,
+#endif
+//#ifdef STARPU_USE_OPENCL
+//	.opencl_func = malloc_pinned_opencl_codelet,
+//#endif
 	.model = NULL,
 	.nbuffers = 0
 };
@@ -59,24 +80,47 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 	{
 #ifdef STARPU_USE_CUDA
 		int push_res;
-	
+
 		struct malloc_pinned_codelet_struct s = {
 			.ptr = A,
 			.dim = dim
-		};	
-	
+		};
+
+                malloc_pinned_cl.where = STARPU_CUDA;
 		struct starpu_task *task = starpu_task_create();
-			task->callback_func = NULL; 
+			task->callback_func = NULL;
 			task->cl = &malloc_pinned_cl;
 			task->cl_arg = &s;
 
 		task->synchronous = 1;
-	
+
 		push_res = starpu_submit_task(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 #endif
 	}
-	else {
+//	else if (_starpu_may_submit_opencl_task())
+//	{
+//#ifdef STARPU_USE_OPENCL
+//		int push_res;
+//
+//		struct malloc_pinned_codelet_struct s = {
+//			.ptr = A,
+//			.dim = dim
+//		};
+//
+//                malloc_pinned_cl.where = STARPU_OPENCL;
+//		struct starpu_task *task = starpu_task_create();
+//			task->callback_func = NULL;
+//			task->cl = &malloc_pinned_cl;
+//			task->cl_arg = &s;
+//
+//		task->synchronous = 1;
+//
+//		push_res = starpu_submit_task(task);
+//		STARPU_ASSERT(push_res != -ENODEV);
+//#endif
+//        }
+        else {
 		*A = malloc(dim);
 	}
 
@@ -86,17 +130,32 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 }
 
 #ifdef STARPU_USE_CUDA
-static void free_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
+static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
 {
 	cudaError_t cures;
 	cures = cudaFreeHost(arg);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
+#endif
+
+//#ifdef STARPU_USE_OPENCL
+//static void free_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
+//{
+//        //        free(arg);
+//        int err = clReleaseMemObject(arg);
+//        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+//}
+//#endif
 
+#if defined(STARPU_USE_CUDA) // || defined(STARPU_USE_OPENCL)
 static starpu_codelet free_pinned_cl = {
-	.where = STARPU_CUDA,
-	.cuda_func = free_pinned_codelet,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = free_pinned_cuda_codelet,
+#endif
+//#ifdef STARPU_USE_OPENCL
+//	.opencl_func = free_pinned_opencl_codelet,
+//#endif
 	.model = NULL,
 	.nbuffers = 0
 };
@@ -111,18 +170,36 @@ int starpu_free_pinned_if_possible(void *A)
 	{
 #ifdef STARPU_USE_CUDA
 		int push_res;
-	
+
+                free_pinned_cl.where = STARPU_CUDA;
 		struct starpu_task *task = starpu_task_create();
-			task->callback_func = NULL; 
+			task->callback_func = NULL;
 			task->cl = &free_pinned_cl;
 			task->cl_arg = A;
 
 		task->synchronous = 1;
-	
+
 		push_res = starpu_submit_task(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 #endif
 	}
+//	else if (_starpu_may_submit_opencl_task())
+//	{
+//#ifdef STARPU_USE_OPENCL
+//		int push_res;
+//
+//                free_pinned_cl.where = STARPU_OPENCL;
+//		struct starpu_task *task = starpu_task_create();
+//			task->callback_func = NULL;
+//			task->cl = &free_pinned_cl;
+//			task->cl_arg = A;
+//
+//		task->synchronous = 1;
+//
+//		push_res = starpu_submit_task(task);
+//		STARPU_ASSERT(push_res != -ENODEV);
+//#endif
+//	}
 	else {
 		free(A);
 	}

+ 6 - 1
tests/Makefile.am

@@ -33,7 +33,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
-NVCCFLAGS += -I$(top_srcdir)/include/
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
@@ -246,6 +246,11 @@ datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_kernels.cu
 endif
 
+if STARPU_USE_OPENCL
+datawizard_sync_and_notify_data_SOURCES +=	\
+	datawizard/sync_and_notify_data_opencl.c
+endif
+
 if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c

+ 2 - 1
tests/core/empty_task_sync_point.c

@@ -34,9 +34,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/core/execute_on_a_specific_worker.c

@@ -71,9 +71,10 @@ static starpu_access_mode select_random_mode(void)
 
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = codelet_null,
 	.cuda_func = codelet_null,
+        .opencl_func = codelet_null,
 	.nbuffers = 1
 };
 

+ 2 - 1
tests/core/multithreaded.c

@@ -33,9 +33,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/core/regenerate.c

@@ -51,9 +51,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 5 - 2
tests/core/starpu_wait_all_tasks.c

@@ -28,9 +28,10 @@ static void dummy_func(void *descr[], void *arg)
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 #ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 #endif
@@ -70,9 +71,11 @@ static struct starpu_conf conf = {
 	.sched_policy_name = NULL,
 	.ncpus = -1,
 	.ncuda = -1,
+        .nopencl = -1,
 	.nspus = -1,
 	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_gpuid = 0,
+	.use_explicit_workers_cuda_gpuid = 0,
+	.use_explicit_workers_opencl_gpuid = 0,
 	.calibrate = 0
 };
 

+ 4 - 2
tests/core/starpu_wait_task.c

@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };
@@ -86,7 +87,8 @@ int main(int argc, char **argv)
 		int ret = starpu_submit_task(task);
 		STARPU_ASSERT(!ret);
 
-		starpu_wait_task(task);
+		ret = starpu_wait_task(task);
+		STARPU_ASSERT(!ret);
 
 		starpu_task_destroy(task);
 	}

+ 3 - 2
tests/core/static_restartable.c

@@ -28,10 +28,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
-	.model = NULL,
+	.opencl_func = dummy_func,
+        .model = NULL,
 	.nbuffers = 0
 };
 

+ 2 - 1
tests/core/static_restartable_tag.c

@@ -29,9 +29,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/core/static_restartable_using_initializer.c

@@ -31,9 +31,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/core/subgraph_repeat.c

@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/core/subgraph_repeat_regenerate.c

@@ -47,9 +47,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/core/tag_wait_api.c

@@ -26,9 +26,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 3 - 2
tests/core/task_wait_api.c

@@ -26,10 +26,11 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
-	.model = NULL,
+	.opencl_func = dummy_func,
+        .model = NULL,
 	.nbuffers = 0
 };
 

+ 2 - 1
tests/datawizard/dining_philosophers.c

@@ -27,9 +27,10 @@ static void eat_kernel(void *descr[], void *arg)
 }
 
 static starpu_codelet eating_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cuda_func = eat_kernel,
 	.cpu_func = eat_kernel,
+        .opencl_func = eat_kernel,
 	.nbuffers = 2
 };
 

+ 6 - 1
tests/datawizard/dsm_stress.c

@@ -54,6 +54,10 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg
 {
 }
 
+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
+{
+}
+
 static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 }
@@ -74,9 +78,10 @@ static starpu_access_mode select_random_mode(void)
 
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = cpu_codelet_null,
 	.cuda_func = cuda_codelet_null,
+        .opencl_func = opencl_codelet_null,
 	.nbuffers = 2
 };
 

+ 2 - 1
tests/datawizard/readers_and_writers.c

@@ -24,9 +24,10 @@ static void dummy_kernel(void *descr[], void *arg)
 }
 
 static starpu_codelet rw_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cuda_func = dummy_kernel,
 	.cpu_func = dummy_kernel,
+	.opencl_func = dummy_kernel,
 	.nbuffers = 1
 };
 

+ 19 - 4
tests/datawizard/sync_and_notify_data.c

@@ -46,6 +46,11 @@ void cuda_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
 void cuda_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
 #define VECTORSIZE	16
 
 starpu_data_handle v_handle;
@@ -79,8 +84,12 @@ int main(int argc, char **argv)
 
 	fprintf(stderr, "kernel incA %d incC %d elf %d\n", kernel_incA_id, kernel_incC_id, elf_id);
 #endif
-	
-	starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
+
+#ifdef STARPU_USE_OPENCL
+        _starpu_opencl_compile_source_to_opencl("tests/datawizard/sync_and_notify_data_opencl_codelet.cl");
+#endif
+
+        starpu_register_vector_data(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
 
 	unsigned iter;
 	for (iter = 0; iter < K; iter++)
@@ -91,11 +100,14 @@ int main(int argc, char **argv)
 		{
 			/* increment a = v[0] */
 			starpu_codelet cl_inc_a = {
-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 				.cpu_func = cpu_codelet_incA,
 #ifdef STARPU_USE_CUDA
 				.cuda_func = cuda_codelet_incA,
 #endif
+#ifdef STARPU_USE_OPENCL
+				.opencl_func = opencl_codelet_incA,
+#endif
 #ifdef STARPU_USE_GORDON
 				.gordon_func = kernel_incA_id,
 #endif
@@ -127,11 +139,14 @@ int main(int argc, char **argv)
 		{
 			/* increment c = v[2] */
 			starpu_codelet cl_inc_c = {
-				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+				.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 				.cpu_func = cpu_codelet_incC,
 #ifdef STARPU_USE_CUDA
 				.cuda_func = cuda_codelet_incC,
 #endif
+#ifdef STARPU_USE_OPENCL
+				.opencl_func = opencl_codelet_incC,
+#endif
 #ifdef STARPU_USE_GORDON
 				.gordon_func = kernel_incC_id,
 #endif

+ 77 - 0
tests/datawizard/sync_and_notify_data_opencl.c

@@ -0,0 +1,77 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <CL/cl.h>
+
+void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        unsigned *val = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+	id = starpu_get_worker_id();
+	devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incA", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=100;
+		size_t local=100;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}
+
+void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *val = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
+	cl_kernel kernel;
+	cl_command_queue queue;
+	int id, devid, err;
+
+	id = starpu_get_worker_id();
+	devid = starpu_get_worker_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, "tests/datawizard/sync_and_notify_data_opencl_codelet.cl", "incC", devid);
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = 0;
+	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=100;
+		size_t local=100;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+
+	starpu_opencl_release(kernel);
+}

+ 30 - 0
tests/datawizard/sync_and_notify_data_opencl_codelet.cl

@@ -0,0 +1,30 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void incA(__global unsigned* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 0)
+		input[i] ++;
+}
+
+__kernel void incC(__global unsigned* input) 
+{
+	const int i = get_global_id(0);
+	if (i == 2)
+		input[i] ++;
+}
+

+ 4 - 1
tests/datawizard/sync_with_data_with_mem.c

@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 }
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_codelet,
 #ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 #endif
+#ifdef STARPU_USE_OPENCL
+        .opencl_func = dummy_codelet,
+#endif
 	.nbuffers = 1
 };
 

+ 4 - 1
tests/datawizard/sync_with_data_with_mem_non_blocking.c

@@ -33,11 +33,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 }
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_codelet,
 #ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = dummy_codelet,
+#endif
 	.nbuffers = 1
 };
 

+ 4 - 1
tests/datawizard/unpartition.c

@@ -32,11 +32,14 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 }
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_codelet,
 #ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = dummy_codelet,
+#endif
 	.nbuffers = 1
 };
 

+ 19 - 1
tests/datawizard/write_only_tmp_buffer.c

@@ -24,6 +24,21 @@
 
 starpu_data_handle v_handle;
 
+#ifdef STARPU_USE_OPENCL
+#include <CL/cl.h>
+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	cl_mem buf = (cl_mem)STARPU_GET_VECTOR_PTR(descr[0]);
+        char ptr = 42;
+        cl_command_queue queue;
+        int id = starpu_get_worker_id();
+        int devid = starpu_get_worker_devid(id);
+
+        starpu_opencl_get_queue(devid, &queue);
+        clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, sizeof(char), &ptr, 0, NULL, NULL);
+}
+#endif
+
 #ifdef STARPU_USE_CUDA
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
@@ -51,11 +66,14 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 }
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = cpu_codelet_null,
 #ifdef STARPU_USE_CUDA
 	.cuda_func = cuda_codelet_null,
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func = opencl_codelet_null,
+#endif
 	.nbuffers = 1
 };
 

+ 2 - 1
tests/errorcheck/invalid_blocking_calls.c

@@ -38,9 +38,10 @@ static void wrong_func(void *descr[], void *arg)
 
 static starpu_codelet wrong_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = wrong_func,
 	.cuda_func = wrong_func,
+        .opencl_func = wrong_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 3 - 1
tests/errorcheck/starpu_init_noworker.c

@@ -29,9 +29,11 @@ int main(int argc, char **argv)
 		.sched_policy_name = NULL, /* default */
 		.ncpus = 0,
 		.ncuda = 0,
+                .nopencl = 0,
 		.nspus = 0,
 		.use_explicit_workers_bindid = 0,
-		.use_explicit_workers_gpuid = 0,
+		.use_explicit_workers_cuda_gpuid = 0,
+		.use_explicit_workers_opencl_gpuid = 0,
 		.calibrate = 0
 	};
 

+ 3 - 1
tests/helper/execute_on_all.c

@@ -32,12 +32,14 @@ int main(int argc, char **argv)
 
 	int arg = 0x42;
 
-	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA);
+	starpu_execute_on_each_worker(func, &arg, STARPU_CPU|STARPU_CUDA|STARPU_OPENCL);
 
 	starpu_execute_on_each_worker(func, &arg, STARPU_CPU);
 	
 	starpu_execute_on_each_worker(func, &arg, STARPU_CUDA);
 
+        starpu_execute_on_each_worker(func, &arg, STARPU_OPENCL);
+
 	starpu_shutdown();
 
 	return 0;

+ 2 - 1
tests/helper/starpu_create_sync_task.c

@@ -25,9 +25,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+        .opencl_func = dummy_func,
 	.nbuffers = 0
 };
 

+ 5 - 2
tests/microbenchs/async_tasks_overhead.c

@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+        .opencl_func = dummy_func,
 #ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 #endif
@@ -91,9 +92,11 @@ static struct starpu_conf conf = {
 	.sched_policy_name = NULL,
 	.ncpus = -1,
 	.ncuda = -1,
+        .nopencl = -1,
 	.nspus = -1,
 	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_gpuid = 0,
+	.use_explicit_workers_cuda_gpuid = 0,
+	.use_explicit_workers_opencl_gpuid = 0,
 	.calibrate = 0
 };
 

+ 2 - 1
tests/microbenchs/prefetch_data_on_node.c

@@ -75,9 +75,10 @@ static starpu_access_mode select_random_mode(void)
 
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = codelet_null,
 	.cuda_func = codelet_null,
+	.opencl_func = codelet_null,
 	.nbuffers = 1
 };
 

+ 7 - 2
tests/microbenchs/redundant_buffer.c

@@ -27,6 +27,10 @@
 starpu_data_handle v_handle;
 static unsigned *v;
 
+static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
+{
+}
+
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 }
@@ -36,10 +40,11 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 }
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = cpu_codelet_null,
 	.cuda_func = cuda_codelet_null,
-	.nbuffers = 2
+	.opencl_func = opencl_codelet_null,
+        .nbuffers = 2
 };
 
 

+ 2 - 1
tests/microbenchs/sync_tasks_overhead.c

@@ -28,9 +28,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+        .opencl_func = dummy_func,
 #ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 #endif

+ 2 - 1
tests/microbenchs/tasks_overhead.c

@@ -35,9 +35,10 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static starpu_codelet dummy_codelet = 
 {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
+	.opencl_func = dummy_func,
 	.model = NULL,
 	.nbuffers = 0
 };

+ 2 - 1
tests/overlap/overlap.c

@@ -59,9 +59,10 @@ static struct starpu_perfmodel_t model = {
 };
 
 static starpu_codelet cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_func = codelet_sleep,
 	.cuda_func = codelet_sleep,
+        .opencl_func = codelet_sleep,
 	.nbuffers = 1,
 	.model =  &model
 };