Andra Hugo лет назад: 14
Родитель
Сommit
a1079cd79e
74 измененных файлов с 795 добавлено и 241 удалено
  1. 5 0
      ChangeLog
  2. 2 3
      Makefile.am
  3. 5 0
      configure.ac
  4. 9 1
      doc/chapters/basic-api.texi
  5. 15 11
      doc/chapters/basic-examples.texi
  6. 10 10
      doc/chapters/c-extensions.texi
  7. 1 1
      doc/chapters/configuration.texi
  8. 6 5
      doc/chapters/perf-feedback.texi
  9. 17 6
      examples/Makefile.am
  10. 6 3
      examples/axpy/axpy.c
  11. 2 1
      examples/axpy/axpy_opencl.c
  12. 6 6
      examples/basic_examples/block_opencl.c
  13. 7 11
      examples/basic_examples/block_opencl_kernel.cl
  14. 2 1
      examples/basic_examples/multiformat.c
  15. 2 1
      examples/basic_examples/vector_scal.c
  16. 25 21
      examples/cpp/incrementer_cpp.cpp
  17. 4 2
      examples/interface/complex.c
  18. 5 5
      examples/interface/complex_kernels_opencl.c
  19. 168 7
      examples/reductions/dot_product.c
  20. 6 0
      examples/reductions/dot_product.h
  21. 40 0
      examples/reductions/dot_product_opencl_kernels.cl
  22. 5 0
      examples/tag_example/tag_restartable.c
  23. 8 4
      gcc-plugin/examples/vector_scal/vector_scal.c
  24. 3 0
      gcc-plugin/src/Makefile.am
  25. 2 0
      gcc-plugin/src/starpu-gcc-config.h.in
  26. 18 5
      gcc-plugin/src/starpu.c
  27. 5 0
      include/starpu.h
  28. 1 7
      include/starpu_config.h.in
  29. 9 3
      include/starpu_rand.h
  30. 16 0
      libstarpu.pc.in
  31. 3 1
      m4/gcc.m4
  32. 1 1
      socl/src/cl_enqueuemarker.c
  33. 2 2
      socl/src/command.c
  34. 3 3
      socl/src/debug.h
  35. 4 4
      socl/src/init.c
  36. 2 2
      src/common/list.h
  37. 0 1
      src/core/jobs.c
  38. 7 0
      src/core/perfmodel/perfmodel_bus.c
  39. 21 0
      src/core/workers.c
  40. 5 2
      src/core/workers.h
  41. 21 0
      src/datawizard/interfaces/data_interface.c
  42. 1 4
      tests/Makefile.am
  43. 4 2
      tests/datawizard/acquire_release.c
  44. 4 2
      tests/datawizard/acquire_release2.c
  45. 205 55
      tests/datawizard/gpu_register.c
  46. 7 0
      tests/datawizard/increment_redux.c
  47. 16 4
      tests/datawizard/increment_redux_lazy.c
  48. 3 1
      tests/datawizard/increment_redux_v2.c
  49. 2 1
      tests/datawizard/interfaces/bcsr/bcsr_opencl.c
  50. 2 1
      tests/datawizard/interfaces/block/block_opencl.c
  51. 2 1
      tests/datawizard/interfaces/csr/csr_opencl.c
  52. 2 1
      tests/datawizard/interfaces/matrix/matrix_opencl.c
  53. 2 1
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
  54. 2 1
      tests/datawizard/interfaces/multiformat/multiformat_opencl.c
  55. 5 0
      tests/datawizard/interfaces/test_interfaces.c
  56. 2 1
      tests/datawizard/interfaces/variable/variable_opencl.c
  57. 2 1
      tests/datawizard/interfaces/vector/test_vector_opencl.c
  58. 1 1
      tests/datawizard/reclaim.c
  59. 5 2
      tests/datawizard/scratch.c
  60. 13 0
      tests/datawizard/scratch_opencl.c
  61. 3 7
      tests/datawizard/scratch_opencl_kernel.cl
  62. 4 3
      tests/datawizard/user_interaction_implicit.c
  63. 1 1
      tests/datawizard/write_only_tmp_buffer.c
  64. 6 4
      tests/datawizard/wt_broadcast.c
  65. 6 4
      tests/datawizard/wt_host.c
  66. 4 4
      tests/errorcheck/invalid_tasks.c
  67. 1 1
      tests/main/execute_on_a_specific_worker.c
  68. 1 1
      tests/parallel_tasks/explicit_combined_worker.c
  69. 1 1
      tests/parallel_tasks/parallel_kernels.c
  70. 1 1
      tests/parallel_tasks/parallel_kernels_spmd.c
  71. 2 2
      tests/parallel_tasks/spmd_pgreedy.c
  72. 2 1
      tests/perfmodels/non_linear_regression_based.c
  73. 2 1
      tests/perfmodels/opencl_memset.c
  74. 2 1
      tests/perfmodels/regression_based.c

+ 5 - 0
ChangeLog

@@ -18,6 +18,11 @@ StarPU 1.0 (svn revision xxxx)
 ==============================================
 The extensions-again release
 
+  * Allow users to disable asynchronous data transfers between CPUs and
+	GPUs.
+  * Update OpenCL driver to enable CPU devices (the environment variable
+	STARPU_OPENCL_ON_CPUS must be set to a positive value when
+	executing an application)
   * struct starpu_data_interface_ops --- operations on a data
         interface --- define a new function pointer allocate_new_data
         which creates a new data interface of the given type based on

+ 2 - 3
Makefile.am

@@ -72,13 +72,12 @@ nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h
 
 if BUILD_STARPU_TOP
-starpu-top/starpu_top$(EXEEXT):
-	cd starpu-top ; $(QMAKE) ; $(MAKE)
+starpu-top/starpu_top$(EXEEXT): all-local
 all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
-	$(RM) starpu_top.1
+	$(RM) starpu_top.1 starpu-top/starpu_top
 # TODO: resources
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)

+ 5 - 0
configure.ac

@@ -140,6 +140,7 @@ AC_CHECK_FUNCS([memalign], [AC_DEFINE([STARPU_HAVE_MEMALIGN], [1], [Define to 1
 
 # Some systems don't have drand48
 AC_CHECK_FUNC([drand48], have_drand48=yes, have_drand48=no)
+AC_CHECK_FUNC([erand48_r], have_erand48_r=yes, have_erand48_r=no)
 # Maybe the user still does not want to use the provided drand48
 AC_ARG_ENABLE(default-drand48, [AS_HELP_STRING([--disable-default-drand48],
 				   [Do not use the default version of drand48])],
@@ -147,6 +148,9 @@ AC_ARG_ENABLE(default-drand48, [AS_HELP_STRING([--disable-default-drand48],
 if test x$have_drand48 = xyes -a x$enable_default_drand48 = xyes ; then
    AC_DEFINE([STARPU_USE_DRAND48], [1], [Define to 1 if drandr48 is available and should be used])
 fi
+if test x$have_erand48_r = xyes ; then
+   AC_DEFINE([STARPU_USE_ERAND48_R], [1], [Define to 1 if erandr48_r is available])
+fi
 
 # Some systems do not define strerror_r
 AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
@@ -1629,6 +1633,7 @@ AC_OUTPUT([
 	doc/Makefile
 	mpi/Makefile
 	starpu-top/StarPU-Top.pro
+	starpu-top/StarPU-Top-qwt-embed.pri
 	starpu-top/StarPU-Top-qwt-system.pri
         gcc-plugin/Makefile
 	gcc-plugin/src/Makefile

+ 9 - 1
doc/chapters/basic-api.texi

@@ -113,7 +113,15 @@ By default, StarPU parallel tasks concurrently.
 Some parallel libraries (e.g. most OpenMP implementations) however do
 not support concurrent calls to parallel code. In such case, setting this flag
 makes StarPU only start one parallel task at a time.
-@code{STARPU_SINGLE_COMBINED_WORKER} environment variable.
+This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environment variable.
+
+@item @code{int disable_asynchronous_copy} (default = 0)
+This flag should be set to 1 to disable asynchronous copies between
+CPUs and accelerators. This can also be specified with the
+@code{STARPU_DISABLE_ASYNCHRONOUS_COPY} environment variable.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
 @end table
 @end deftp
 

+ 15 - 11
doc/chapters/basic-examples.texi

@@ -38,6 +38,10 @@ LDFLAGS         +=      $$(pkg-config --libs starpu-1.0)
 @end example
 @end cartouche
 
+Make sure that @code{pkg-config --libs starpu-1.0} actually produces some output
+before going further: @code{PKG_CONFIG_PATH} has to point to the place where
+@code{starpu-1.0.pc} was installed during @code{make install}.
+
 Also pass the @code{--static} option if the application is to be linked statically.
 
 @node Hello World
@@ -55,8 +59,8 @@ Extensions}) or directly use the StarPU's API.
 @node Hello World using the C Extension
 @subsection Hello World using the C Extension
 
-Writing a task is both simpler and less error-prone when using the C
-extensions implemented by StarPU's GCC plug-in (@pxref{C Extensions}).
+GCC from version 4.5 permit to use the StarPU GCC plug-in (@pxref{C
+Extensions}). This makes writing a task both simpler and less error-prone.
 In a nutshell, all it takes is to declare a task, declare and define its
 implementations (for CPU, OpenCL, and/or CUDA), and invoke the task like
 a regular C function.  The example below defines @code{my_task}, which
@@ -68,7 +72,8 @@ has a single implementation for CPU:
 static void my_task (int x) __attribute__ ((task));
 
 /* Declaration of the CPU implementation of `my_task'.  */
-static void my_task_cpu (int x) __attribute__ ((task_implementation ("cpu", my_task)));
+static void my_task_cpu (int x)
+   __attribute__ ((task_implementation ("cpu", my_task)));
 
 /* Definition of said CPU implementation.  */
 static void my_task_cpu (int x)
@@ -327,20 +332,20 @@ has to be defined:
 @smallexample
 /* Declare the `vector_scal' task.  */
 
-static void vector_scal (size_t size, float vector[size],
+static void vector_scal (unsigned size, float vector[size],
                          float factor)
   __attribute__ ((task));
 
 /* Declare and define the standard CPU implementation.  */
 
-static void vector_scal_cpu (size_t size, float vector[size],
+static void vector_scal_cpu (unsigned size, float vector[size],
                              float factor)
   __attribute__ ((task_implementation ("cpu", vector_scal)));
 
 static void
-vector_scal_cpu (size_t size, float vector[size], float factor)
+vector_scal_cpu (unsigned size, float vector[size], float factor)
 @{
-  size_t i;
+  unsigned i;
   for (i = 0; i < size; i++)
     vector[i] *= factor;
 @}
@@ -447,13 +452,12 @@ in our C file like this:
 /* The OpenCL programs, loaded from `main' (see below).  */
 static struct starpu_opencl_program cl_programs;
 
-static void vector_scal_opencl (size_t size, float vector[size],
+static void vector_scal_opencl (unsigned size, float vector[size],
                                 float factor)
   __attribute__ ((task_implementation ("opencl", vector_scal)));
 
-@c TODO This example will not work : size cannot be a size_t in clSetKernelArg, and global should not be 1. Do we want to document the ugly hach we use, though ?
 static void
-vector_scal_opencl (size_t size, float vector[size], float factor)
+vector_scal_opencl (unsigned size, float vector[size], float factor)
 @{
   int id, devid, err;
   cl_kernel kernel;
@@ -523,7 +527,7 @@ declaration for the task implementation:
 
 @cartouche
 @smallexample
-extern void vector_scal_cuda (size_t size, float vector[size],
+extern void vector_scal_cuda (unsigned size, float vector[size],
                               float factor)
   __attribute__ ((task_implementation ("cuda", vector_scal)));
 @end smallexample

+ 10 - 10
doc/chapters/c-extensions.texi

@@ -120,20 +120,20 @@ Here is an example:
 
 static void matmul (const float *A, const float *B,
                     __output float *C,
-                    size_t nx, size_t ny, size_t nz)
+                    unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task));
 
 static void matmul_cpu (const float *A, const float *B,
                         __output float *C,
-                        size_t nx, size_t ny, size_t nz)
+                        unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("cpu", matmul)));
 
 
 static void
 matmul_cpu (const float *A, const float *B, __output float *C,
-            size_t nx, size_t ny, size_t nz)
+            unsigned nx, unsigned ny, unsigned nz)
 @{
-  size_t i, j, k;
+  unsigned i, j, k;
 
   for (j = 0; j < ny; j++)
     for (i = 0; i < nx; i++)
@@ -156,11 +156,11 @@ CUDA and OpenCL implementations can be declared in a similar way:
 @cartouche
 @smallexample
 static void matmul_cuda (const float *A, const float *B, float *C,
-                         size_t nx, size_t ny, size_t nz)
+                         unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("cuda", matmul)));
 
 static void matmul_opencl (const float *A, const float *B, float *C,
-                           size_t nx, size_t ny, size_t nz)
+                           unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("opencl", matmul)));
 @end smallexample
 @end cartouche
@@ -175,7 +175,7 @@ OpenCL under the hood, such as CUBLAS functions:
 @smallexample
 static void
 matmul_cuda (const float *A, const float *B, float *C,
-             size_t nx, size_t ny, size_t nz)
+             unsigned nx, unsigned ny, unsigned nz)
 @{
   cublasSgemm ('n', 'n', nx, ny, nz,
                1.0f, A, 0, B, 0,
@@ -356,12 +356,12 @@ in a way that allows it to be compiled without the GCC plug-in:
 #include <stdlib.h>
 
 static void matmul (const float *A, const float *B, float *C,
-                    size_t nx, size_t ny, size_t nz) __task;
+                    unsigned nx, unsigned ny, unsigned nz) __task;
 
 #ifdef STARPU_GCC_PLUGIN
 
 static void matmul_cpu (const float *A, const float *B, float *C,
-                        size_t nx, size_t ny, size_t nz)
+                        unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("cpu", matmul)));
 
 #endif
@@ -369,7 +369,7 @@ static void matmul_cpu (const float *A, const float *B, float *C,
 
 static void
 CPU_TASK_IMPL (matmul) (const float *A, const float *B, float *C,
-                        size_t nx, size_t ny, size_t nz)
+                        unsigned nx, unsigned ny, unsigned nz)
 @{
   /* Code of the CPU kernel here...  */
 @}

+ 1 - 1
doc/chapters/configuration.texi

@@ -197,7 +197,7 @@ Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
 default, it is enabled when an OpenCL implementation is found.
 
 @item --disable-starpu-top
-Disable the StarPU-Top interface (@pxref{starpu-top}).  By default, it
+Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
 is enabled when the required dependencies are found.
 
 @end table

+ 6 - 5
doc/chapters/perf-feedback.texi

@@ -93,7 +93,7 @@ Calling @code{starpu_worker_get_profiling_info} resets the profiling
 information associated to a worker.
 
 When an FxT trace is generated (see @ref{Generating traces}), it is also
-possible to use the @code{starpu_top} script (described in @ref{starpu-top}) to
+possible to use the @code{starpu_workers_activity} script (described in @ref{starpu-workers-activity}) to
 generate a graphic showing the evolution of these values during the time, for
 the different workers.
 
@@ -186,7 +186,8 @@ starpu_top_task_prevision(task, workerid, begin, end);
 @end smallexample
 @end cartouche
 
-Starting StarPU-Top and the application can be done two ways:
+Starting StarPU-Top@footnote{StarPU-Top is started via the binary
+@code{starpu_top}.} and the application can be done two ways:
 
 @itemize
 @item The application is started by hand on some machine (and thus already
@@ -218,7 +219,7 @@ and "localhost" should be used as IP Address to connect to.
 * Generating traces::           Generating traces with FxT
 * Gantt diagram::               Creating a Gantt Diagram
 * DAG::                         Creating a DAG with graphviz
-* starpu-top::                  Monitoring activity
+* starpu-workers-activity::     Monitoring activity
 @end menu
 
 @node Generating traces
@@ -301,7 +302,7 @@ graphical output of the graph by using the graphviz library:
 $ dot -Tpdf dag.dot -o output.pdf
 @end example
 
-@node starpu-top
+@node starpu-workers-activity
 @subsection Monitoring activity
 
 When the FxT trace file @code{filename} has been generated, it is possible to
@@ -314,7 +315,7 @@ This will create an @code{activity.data} file in the current
 directory. A profile of the application showing the activity of StarPU
 during the execution of the program can be generated:
 @example
-$ starpu_top activity.data
+$ starpu_workers_activity activity.data
 @end example
 
 This will create a file named @code{activity.eps} in the current directory.

+ 17 - 6
examples/Makefile.am

@@ -57,7 +57,8 @@ EXTRA_DIST = 					\
 	filters/fblock_opencl_kernel.cl		\
 	filters/custom_mf/conversion_opencl.cl  \
 	filters/custom_mf/custom_opencl.cl \
-	interface/complex_kernels.cl
+	interface/complex_kernels.cl \
+	reductions/dot_product_opencl_kernels.cl
 
 CLEANFILES = 					\
 	gordon/null_kernel_gordon.spuelf
@@ -141,7 +142,8 @@ noinst_HEADERS = 				\
 	pi/SobolQRNG/sobol.h			\
 	pi/SobolQRNG/sobol_gold.h		\
 	pi/SobolQRNG/sobol_gpu.h		\
-	pi/SobolQRNG/sobol_primitives.h
+	pi/SobolQRNG/sobol_primitives.h         \
+	reductions/dot_product.h
 
 #####################################
 # What to install and what to check #
@@ -194,6 +196,7 @@ examplebin_PROGRAMS +=				\
 	mandelbrot/mandelbrot			\
 	ppm_downscaler/ppm_downscaler		\
 	ppm_downscaler/yuv_downscaler
+
 if STARPU_HAVE_F77_H
 examplebin_PROGRAMS +=				\
 	basic_examples/vector_scal_fortran
@@ -693,10 +696,14 @@ endif
 cpp_incrementer_cpp_SOURCES	=	\
 	cpp/incrementer_cpp.cpp
 
-#if STARPU_USE_CUDA
-#cpp_incrementer_cpp_SOURCES +=	\
-#	incrementer/incrementer_kernels.cu
-#endif
+if STARPU_USE_CUDA
+cpp_incrementer_cpp_SOURCES +=	\
+	incrementer/incrementer_kernels.cu
+endif
+if STARPU_USE_OPENCL
+cpp_incrementer_cpp_SOURCES += \
+	incrementer/incrementer_kernels_opencl.c
+endif
 
 #######################
 # Incrementer example #
@@ -753,6 +760,10 @@ if STARPU_USE_CUDA
 reductions_dot_product_SOURCES +=		\
 	reductions/dot_product_kernels.cu
 endif
+if STARPU_USE_OPENCL
+nobase_STARPU_OPENCL_DATA_DATA += \
+	reductions/dot_product_opencl_kernels.cl
+endif
 
 ##################
 # Mandelbrot Set #

+ 6 - 3
examples/axpy/axpy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -101,8 +101,10 @@ check(void)
 	for (i = 0; i < N; i++)
 	{
 		TYPE expected_value = alpha * vec_x[i] + 4.0;
-		if (fabs(vec_y[i] - expected_value) > EPSILON)
+		if (fabs(vec_y[i] - expected_value) > expected_value * EPSILON) {
+			FPRINTF(stderr,"at %d, %f*%f+%f=%f, expected %f\n", i, alpha, vec_x[i], 4.0, vec_y[i], expected_value);
 			return EXIT_FAILURE;
+		}
 	}
 
 	return EXIT_SUCCESS;
@@ -213,7 +215,8 @@ enodev:
 	starpu_free((void *)vec_y);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	/* Stop StarPU */
 	starpu_shutdown();

+ 2 - 1
examples/axpy/axpy_opencl.c

@@ -16,12 +16,13 @@
 
 #include <starpu.h>
 #include <starpu_opencl.h>
+#include "axpy.h"
 
 extern struct starpu_opencl_program opencl_program;
 
 void axpy_opencl(void *buffers[], void *_args)
 {
-	float *alpha = _args;
+	TYPE *alpha = _args;
 	int id, devid;
         cl_int err;
 	cl_kernel kernel;

+ 6 - 6
examples/basic_examples/block_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,11 +36,11 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_event event;
 	int id, devid, err;
 	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
-	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
-	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
-	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
-        unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
-        unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
+	uint32_t nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
+	uint32_t ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
+	uint32_t nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
+        uint32_t ldy = STARPU_BLOCK_GET_LDY(descr[0]);
+        uint32_t ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
 
         id = starpu_worker_get_id();

+ 7 - 11
examples/basic_examples/block_opencl_kernel.cl

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,15 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void block(__global float *b, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
+#include <stdint.h>
+
+__kernel void block(__global float *b, uint32_t nx, uint32_t ny, uint32_t nz, uint32_t ldy, uint32_t ldz, float multiplier)
 {
-        int i, j, k;
-        for(k=0; k<nz ; k++)
-	{
-                for(j=0; j<ny ; j++)
-		{
-                        for(i=0; i<nx ; i++)
-                                b[(k*ldz)+(j*ldy)+i] *= multiplier;
-                }
-        }
+     const int i = get_global_id(0);
+     if (i < (nz*ldz)+(ny*ldy)+nx)
+	  b[i] = b[i] * multiplier;
 }

+ 2 - 1
examples/basic_examples/multiformat.c

@@ -314,7 +314,8 @@ main(void)
 	print_it();
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
         starpu_opencl_unload_opencl(&opencl_conversion_program);
 #endif
 	starpu_shutdown();

+ 2 - 1
examples/basic_examples/vector_scal.c

@@ -157,7 +157,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(vector_handle);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 
 	/* terminate StarPU, no task can be submitted after */

+ 25 - 21
examples/cpp/incrementer_cpp.cpp

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,22 +20,22 @@
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
-#warning todo: fix cuda and opencl
-//#ifdef STARPU_USE_CUDA
-//extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
-//#endif
+#ifdef STARPU_USE_CUDA
+extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
 
-//#ifdef STARPU_USE_OPENCL
-//#include <starpu_opencl.h>
-//extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
-//struct starpu_opencl_program opencl_program;
-//#endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+extern "C" void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+struct starpu_opencl_program opencl_program;
+#endif
 
 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	val[0] += 1.0f; val[1] += 1.0f;
+	val[0] += 1.0f;
+	val[1] += 1.0f;
 }
 
 int main(int argc, char **argv)
@@ -52,20 +53,19 @@ int main(int argc, char **argv)
 
 	starpu_vector_data_register(&float_array_handle, 0, (uintptr_t)&float_array, 4, sizeof(float));
 
-//#ifdef STARPU_USE_OPENCL
-//        ret = starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
-//	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
-//#endif
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
 
         starpu_codelet_init(&cl);
-        cl.where = STARPU_CPU;//|STARPU_CUDA;//|STARPU_OPENCL,
         cl.cpu_funcs[0] = cpu_codelet;
-//#ifdef STARPU_USE_CUDA
-//        cl.cuda_funcs[0] = cuda_codelet;
-//#endif
-//#ifdef STARPU_USE_OPENCL
-//	cl.opencl_funcs[0] = opencl_codelet;
-//#endif
+#ifdef STARPU_USE_CUDA
+        cl.cuda_funcs[0] = cuda_codelet;
+#endif
+#ifdef STARPU_USE_OPENCL
+	cl.opencl_funcs[0] = opencl_codelet;
+#endif
         cl.nbuffers = 1;
         cl.modes[0] = STARPU_RW;
 
@@ -95,6 +95,10 @@ int main(int argc, char **argv)
 		ret = 1;
 	}
 
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
 	starpu_shutdown();
 
 	return ret;

+ 4 - 2
examples/interface/complex.c

@@ -160,14 +160,16 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 	return 0;
 
 enodev:
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_data_unregister(handle1);
 	starpu_data_unregister(handle2);

+ 5 - 5
examples/interface/complex_kernels_opencl.c

@@ -31,12 +31,12 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 	cl_event event;
 
 	/* length of the vector */
-	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned n = STARPU_COMPLEX_GET_NX(buffers[0]);
 	/* OpenCL copy of the vector pointer */
-	cl_mem *i_real      = (cl_mem *) STARPU_COMPLEX_GET_REAL(buffers[0]);
-	cl_mem *i_imaginary = (cl_mem *) STARPU_COMPLEX_GET_IMAGINARY(buffers[0]);
-	cl_mem *o_real      = (cl_mem *) STARPU_COMPLEX_GET_REAL(buffers[1]);
-	cl_mem *o_imaginary = (cl_mem *) STARPU_COMPLEX_GET_IMAGINARY(buffers[1]);
+	cl_mem i_real      = (cl_mem) STARPU_COMPLEX_GET_REAL(buffers[0]);
+	cl_mem i_imaginary = (cl_mem) STARPU_COMPLEX_GET_IMAGINARY(buffers[0]);
+	cl_mem o_real      = (cl_mem) STARPU_COMPLEX_GET_REAL(buffers[1]);
+	cl_mem o_imaginary = (cl_mem) STARPU_COMPLEX_GET_IMAGINARY(buffers[1]);
 
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);

+ 168 - 7
examples/reductions/dot_product.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +17,9 @@
 
 #include <starpu.h>
 #include <assert.h>
+#include <math.h>
+
+#include <reductions/dot_product.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
@@ -23,25 +27,32 @@
 #include <starpu_cuda.h>
 #endif
 
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 static float *x;
 static float *y;
 static starpu_data_handle_t *x_handles;
 static starpu_data_handle_t *y_handles;
+#ifdef STARPU_USE_OPENCL
+static struct starpu_opencl_program opencl_program;
+#endif
 
 static unsigned nblocks = 4096;
 static unsigned entries_per_block = 1024;
 
-#define DOT_TYPE double
-
 static DOT_TYPE dot = 0.0f;
 static starpu_data_handle_t dot_handle;
 
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
-	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
+
 #ifdef STARPU_USE_CUDA
 	/* Cuda device */
 	const struct cudaDeviceProp *props;
@@ -73,14 +84,41 @@ void init_cuda_func(void *descr[], void *cl_arg)
 }
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void init_opencl_func(void *buffers[], void *args)
+{
+        cl_int err;
+	cl_command_queue queue;
+
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	starpu_opencl_get_current_queue(&queue);
+	DOT_TYPE zero = (DOT_TYPE) 0.0;
+
+	err = clEnqueueWriteBuffer(queue,
+			dot,
+			CL_TRUE,
+			0,
+			sizeof(DOT_TYPE),
+			&zero,
+			0,
+			NULL,
+			NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+}
+#endif
+
 static struct starpu_codelet init_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.can_execute = can_execute,
 	.cpu_funcs = {init_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {init_cuda_func, NULL},
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {init_opencl_func, NULL},
+#endif
 	.nbuffers = 1
 };
 
@@ -100,14 +138,67 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 extern void redux_cuda_func(void *descr[], void *_args);
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void redux_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
+	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
+	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
 static struct starpu_codelet redux_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.can_execute = can_execute,
 	.cpu_funcs = {redux_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_func, NULL},
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_func, NULL},
+#endif
 	.nbuffers = 2
 };
 
@@ -163,14 +254,71 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 }
 #endif
 
+#ifdef STARPU_USE_OPENCL
+void dot_opencl_func(void *buffers[], void *args)
+{
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
+	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
+	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
+	err|= clSetKernelArg(kernel, 3, sizeof(n), &n);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+                if (local > global)
+			local=global;
+
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}
+#endif
+
 static struct starpu_codelet dot_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.can_execute = can_execute,
 	.cpu_funcs = {dot_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_cuda_func, NULL},
 #endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dot_opencl_func, NULL},
+#endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_REDUX}
 };
@@ -188,6 +336,12 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
 	starpu_helper_cublas_init();
 
 	unsigned long nelems = nblocks*entries_per_block;
@@ -257,6 +411,10 @@ int main(int argc, char **argv)
 
 	starpu_helper_cublas_shutdown();
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
 	starpu_shutdown();
 
 	free(x);
@@ -264,7 +422,10 @@ int main(int argc, char **argv)
 	free(x_handles);
 	free(y_handles);
 
-	return 0;
+	if (fabs(reference_dot - dot) < reference_dot * 1e-6)
+		return EXIT_SUCCESS;
+	else
+		return EXIT_FAILURE;
 
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");

+ 6 - 0
examples/reductions/dot_product.h

@@ -0,0 +1,6 @@
+#ifndef DOT_PRODUCT_H__
+#define DOT_PRODUCT_H__
+
+#define DOT_TYPE double
+
+#endif /* DOT_PRODUCT_H__ */

+ 40 - 0
examples/reductions/dot_product_opencl_kernels.cl

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dot_product.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__kernel void _redux_opencl(__global DOT_TYPE *dota,
+			    __global DOT_TYPE *dotb)
+{
+        const int i = get_global_id(0);
+	*dota += *dotb;
+}
+
+__kernel void _dot_opencl(__global float *x,
+			  __global float *y,
+			  __global DOT_TYPE *dot,
+			  unsigned n)
+{
+	unsigned i;
+	__local double tmp;
+	tmp = 0.0;
+	for (i = 0; i < n ; i++)
+		tmp += x[i]*y[i];
+		
+	*dot += tmp;
+}

+ 5 - 0
examples/tag_example/tag_restartable.c

@@ -129,6 +129,11 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_SLOW_MACHINE
+	ni /= 4;
+	nk /= 16;
+#endif
+
 #ifdef STARPU_USE_GORDON
 	/* load an empty kernel and get its identifier */
 	unsigned gordon_null_kernel = load_gordon_null_kernel();

+ 8 - 4
gcc-plugin/examples/vector_scal/vector_scal.c

@@ -114,7 +114,7 @@ vector_scal_opencl (unsigned int size, float vector[size], float factor)
   if (err)
     STARPU_OPENCL_REPORT_ERROR (err);
 
-  size_t global = 1, local = 1;
+  size_t global = size, local = 1;
   err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,
 				NULL, &event);
   if (err != CL_SUCCESS)
@@ -148,9 +148,13 @@ check (size_t size, float vector[size], float factor)
   size_t i;
 
   for (i = 0; i < size; i++)
-    if (fabs(vector[i] - i * factor) > EPSILON)
-      return false;
-
+    {
+      if (fabs(vector[i] - i * factor) > i*factor*EPSILON)
+        {
+          fprintf(stderr, "%.2f != %.2f\n", vector[i], i*factor);
+          return false;
+        }
+    }
   return true;
 }
 

+ 3 - 0
gcc-plugin/src/Makefile.am

@@ -28,3 +28,6 @@ AM_CPPFLAGS =						\
   -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1
 
 AM_LDFLAGS = -module
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 2 - 0
gcc-plugin/src/starpu-gcc-config.h.in

@@ -26,6 +26,8 @@
 
 #undef HAVE_DECL_BUILD_ZERO_CST
 
+#undef HAVE_DECL_BUILTIN_DECL_EXPLICIT
+
 #undef HAVE_C_FAMILY_C_COMMON_H
 #undef HAVE_C_COMMON_H
 

+ 18 - 5
gcc-plugin/src/starpu.c

@@ -177,6 +177,19 @@ build_zero_cst (tree type)
 
 #endif
 
+#if !HAVE_DECL_BUILTIN_DECL_EXPLICIT
+
+/* This function was introduced in GCC 4.7 as a replacement for the
+   `built_in_decls' array.  */
+
+static inline tree
+builtin_decl_explicit (enum built_in_function fncode)
+{
+  return built_in_decls[fncode];
+}
+
+#endif
+
 
 /* Helpers.  */
 
@@ -307,7 +320,7 @@ build_printf (const char *fmt, ...)
 
   va_start (args, fmt);
   vasprintf (&str, fmt, args);
-  call = build_call_expr (built_in_decls[BUILT_IN_PUTS], 1,
+  call = build_call_expr (builtin_decl_explicit (BUILT_IN_PUTS), 1,
 	 		  build_string_literal (strlen (str) + 1, str));
   free (str);
   va_end (args);
@@ -359,7 +372,7 @@ build_error_statements (location_t loc, tree error_var, const char *fmt, ...)
       tree error_code =
 	build1 (NEGATE_EXPR, TREE_TYPE (error_var), error_var);
       print =
-	build_call_expr (built_in_decls[BUILT_IN_PRINTF], 2,
+	build_call_expr (builtin_decl_explicit (BUILT_IN_PRINTF), 2,
 			 build_string_literal (strlen (fmt_long) + 1,
 					       fmt_long),
 			 build_call_expr (strerror_fn, 1, error_code));
@@ -372,7 +385,7 @@ build_error_statements (location_t loc, tree error_var, const char *fmt, ...)
 		xloc.file, xloc.line, str);
 
       print =
-	build_call_expr (built_in_decls[BUILT_IN_PUTS], 1,
+	build_call_expr (builtin_decl_explicit (BUILT_IN_PUTS), 1,
 			 build_string_literal (strlen (fmt_long) + 1,
 					       fmt_long));
     }
@@ -383,8 +396,8 @@ build_error_statements (location_t loc, tree error_var, const char *fmt, ...)
 
   tree stmts = NULL;
   append_to_statement_list (print, &stmts);
-  append_to_statement_list (build_call_expr (built_in_decls[BUILT_IN_ABORT],
-					     0),
+  append_to_statement_list (build_call_expr
+			    (builtin_decl_explicit (BUILT_IN_ABORT), 0),
 			    &stmts);
 
   return stmts;

+ 5 - 0
include/starpu.h

@@ -80,6 +80,9 @@ struct starpu_conf
 
 	/* Create only one combined worker, containing all CPU workers */
 	int single_combined_worker;
+
+        /* indicate if the asynchronous copies should be disabled */
+	int disable_asynchronous_copy;
 };
 
 /* Initialize a starpu_conf structure with default values. */
@@ -104,6 +107,8 @@ unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_spu_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
 
+int starpu_disable_asynchronous_copy();
+
 /* Return the identifier of the thread in case this is associated to a worker.
  * This will return -1 if this function is called directly from the application
  * or if it is some SPU worker where a single thread controls different SPUs. */

+ 1 - 7
include/starpu_config.h.in

@@ -76,13 +76,6 @@
 
 #undef STARPU_HAVE_WINDOWS
 
-#undef starpu_drand48
-#undef starpu_srand48
-#undef starpu_erand48
-
-#undef starpu_srand48_r
-#undef starpu_erand48_r
-
 #ifdef _MSC_VER
 typedef long starpu_ssize_t;
 #define __starpu_func__ __FUNCTION__
@@ -94,5 +87,6 @@ typedef ssize_t starpu_ssize_t;
 
 #undef STARPU_SLOW_MACHINE
 #undef STARPU_USE_DRAND48
+#undef STARPU_USE_ERAND48_R
 
 #endif

+ 9 - 3
include/starpu_rand.h

@@ -21,12 +21,18 @@
 #include <starpu_config.h>
 
 #ifdef STARPU_USE_DRAND48
-typedef struct drand48_data starpu_drand48_data;
 #  define starpu_srand48(seed)				srand48(seed)
 #  define starpu_drand48()				drand48()
 #  define starpu_erand48(xsubi)				erand48(xsubi)
-#  define starpu_srand48_r(seed, buffer)		srand48_r(seed, buffer)
-#  define starpu_erand48_r(xsubi, buffer, result)	erand48_r(xsubi, buffer, result)
+#  ifdef STARPU_USE_ERAND48_R
+typedef struct drand48_data starpu_drand48_data;
+#    define starpu_srand48_r(seed, buffer)		srand48_r(seed, buffer)
+#    define starpu_erand48_r(xsubi, buffer, result)	erand48_r(xsubi, buffer, result)
+#else
+typedef int starpu_drand48_data;
+#    define starpu_srand48_r(seed, buffer)		srand48(seed)
+#    define starpu_erand48_r(xsubi, buffer, result)	do {*(result) = erand48(xsubi); } while (0)
+#  endif
 #else
 typedef int starpu_drand48_data;
 #  define starpu_srand48(seed)				srand(seed)

+ 16 - 0
libstarpu.pc.in

@@ -1,3 +1,19 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 prefix=@prefix@
 exec_prefix=@exec_prefix@
 libdir=@libdir@

+ 3 - 1
m4/gcc.m4

@@ -70,9 +70,11 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
     dnl   build_call_expr_loc_vec   -- likewise
     dnl   build_array_ref           -- present but undeclared in 4.6.1
     dnl   build_zero_cst            -- not in GCC 4.5.x; appears in 4.6
+    dnl   builtin_decl_explicit     -- new in 4.7, replaces `built_in_decls'
     _STARPU_WITH_GCC_PLUGIN_API([
       AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
-                      build_array_ref, build_zero_cst],
+                      build_array_ref, build_zero_cst,
+		      builtin_decl_explicit],
         [], [], [#include <gcc-plugin.h>
 	         #include <tree.h>])
 

+ 1 - 1
socl/src/cl_enqueuemarker.c

@@ -36,5 +36,5 @@ cl_int command_marker_submit(command_marker cmd) {
 	struct starpu_task *task;
 	task = task_create(CL_COMMAND_MARKER);
 
-	task_submit(task, cmd);
+	return task_submit(task, cmd);
 }

+ 2 - 2
socl/src/command.c

@@ -76,9 +76,9 @@ void command_graph_dump_ex(cl_command cmd) {
 	for (i=0; i<cmd->num_events; i++)
 		command_graph_dump_ex(cmd->events[i]->command);
 
-	printf("CMD %lx TYPE %d DEPS", cmd, cmd->typ);
+	printf("CMD %p TYPE %d DEPS", cmd, cmd->typ);
 	for (i=0; i<cmd->num_events; i++)
-		printf(" %lx", cmd->events[i]->command);
+		printf(" %p", cmd->events[i]->command);
 	printf("\n");
 
 }

+ 3 - 3
socl/src/debug.h

@@ -22,9 +22,9 @@
 #ifdef STARPU_VERBOSE
 #define DEBUG
 #include <stdio.h>
-   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__);} while (0);
-   #define DEBUG_MSG_NOHEAD(...) fprintf(stderr, __VA_ARGS__)
-   #define DEBUG_ERROR(...) do { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); exit(1); } while (0);
+  #define DEBUG_MSG(...) do { if (!getenv("STARPU_SILENT")) { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__);}} while (0);
+  #define DEBUG_MSG_NOHEAD(...) do { if (!getenv("STARPU_SILENT")) { fprintf(stderr, __VA_ARGS__);}} while (0);
+  #define DEBUG_ERROR(...) do { if (!getenv("STARPU_SILENT")) { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); } exit(1); } while (0);
 #else
    #define DEBUG_MSG(...) while(0);
    #define DEBUG_MSG_NOHEAD(...) while(0);

+ 4 - 4
socl/src/init.c

@@ -22,18 +22,18 @@
  * Initialize SOCL
  */
 __attribute__((constructor)) static void socl_init() {
-  
+
   struct starpu_conf conf;
   starpu_conf_init(&conf);
   conf.ncuda = 0;
   putenv("STARPU_NCUDA=0");
   putenv("STARPU_NOPENCL=1");
-  putenv("STARPU_NCPUS=0");
+  putenv("STARPU_NCPUS=1");
 
   mem_object_init();
 
   starpu_init(&conf);
-  
+
   /* Disable dataflow implicit dependencies */
   starpu_data_set_default_sequential_consistency_flag(0);
 
@@ -54,7 +54,7 @@ __attribute__((destructor)) static void socl_shutdown() {
   int active_entities = gc_active_entity_count();
 
   if (active_entities != 0)
-    fprintf(stderr, "Unreleased entities: %d\n", active_entities);
+    DEBUG_MSG("Unreleased entities: %d\n", active_entities);
 
   starpu_shutdown();
 }

+ 2 - 2
src/common/list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -137,7 +137,7 @@
       else if (l1->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l2->_head = l1->_head; } } \
   /** @internal */static inline void ENAME##_list_push_list_back(struct ENAME##_list *l1, struct ENAME##_list *l2) \
     { if(l1->_head == NULL) { l1->_head = l2->_head; l1->_tail = l2->_tail; } \
-      else if (l2->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l1->_tail = l2->_head; } } \
+      else if (l2->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l1->_tail = l2->_tail; } } \
   /** @internal */static inline struct ENAME *ENAME##_list_front(struct ENAME##_list *l) \
     { return l->_head; } \
   /** @internal */static inline struct ENAME *ENAME##_list_back(struct ENAME##_list *l) \

+ 0 - 1
src/core/jobs.c

@@ -250,7 +250,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j, int workerid)
 		int ret = _starpu_submit_job(j);
 		STARPU_ASSERT(!ret);
 	}
-
 	_starpu_decrement_nsubmitted_tasks();
 	_starpu_decrement_nready_tasks();
 

+ 7 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -254,6 +254,13 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
 
+	if (_starpu_opencl_get_device_type(dev) == CL_DEVICE_TYPE_CPU)
+	{
+		/* Let's not use too much RAM when running OpenCL on a CPU: it
+		 * would make the OS swap like crazy. */
+		size /= 2;
+	}
+
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 

+ 21 - 0
src/core/workers.c

@@ -378,6 +378,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 
+	conf->disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
+
 	return 0;
 }
 
@@ -454,6 +456,20 @@ int starpu_init(struct starpu_conf *user_conf)
 	 * initialization */
 	config.user_conf = user_conf;
 
+	if (user_conf)
+	{
+	     int disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
+	     if (disable_asynchronous_copy == 1)
+		  config.disable_asynchronous_copy = 1;
+	     else
+		  config.disable_asynchronous_copy = (user_conf->disable_asynchronous_copy == 1);
+	}
+	else
+	{
+	     int disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
+	     config.disable_asynchronous_copy = (disable_asynchronous_copy == 1);
+	}
+
 	_starpu_init_all_sched_ctxs(&config);
 	ret = _starpu_build_topology(&config);
 	if (ret)
@@ -705,6 +721,11 @@ unsigned starpu_spu_worker_get_count(void)
 	return config.topology.ngordon_spus;
 }
 
+int starpu_disable_asynchronous_copy()
+{
+	return config.disable_asynchronous_copy;
+}
+
 /* When analyzing performance, it is useful to see what is the processing unit
  * that actually performed the task. This function returns the id of the
  * processing unit actually executing it, therefore it makes no sense to use it

+ 5 - 2
src/core/workers.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -164,7 +164,10 @@ struct _starpu_machine_config
 
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
-	
+
+        /* indicate if the asynchronous copies should be disabled */
+        int disable_asynchronous_copy;
+
 	/* all the sched ctx of the current instance of starpu */
 	struct _starpu_sched_ctx sched_ctxs[STARPU_NMAX_SCHED_CTXS];
 };

+ 21 - 0
src/datawizard/interfaces/data_interface.c

@@ -288,6 +288,27 @@ void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 	*handleptr = handle;
 	handle->mf_node = home_node;
 
+	int disable_asynchronous_copy = starpu_disable_asynchronous_copy();
+	if (STARPU_UNLIKELY(disable_asynchronous_copy))
+	{
+#ifdef STARPU_USE_CUDA
+	     if (ops->copy_methods->ram_to_cuda_async)
+		  ((struct starpu_data_copy_methods *)ops->copy_methods)->ram_to_cuda_async = NULL;
+	     if (ops->copy_methods->cuda_to_ram_async)
+		  ((struct starpu_data_copy_methods *)ops->copy_methods)->cuda_to_ram_async = NULL;
+	     if (ops->copy_methods->cuda_to_cuda_async)
+		  ((struct starpu_data_copy_methods *)ops->copy_methods)->cuda_to_cuda_async = NULL;
+#endif
+#ifdef STARPU_USE_OPENCL
+	     if (ops->copy_methods->ram_to_opencl_async)
+		  ((struct starpu_data_copy_methods *)ops->copy_methods)->ram_to_opencl_async = NULL;
+	     if (ops->copy_methods->opencl_to_ram_async)
+		  ((struct starpu_data_copy_methods *)ops->copy_methods)->opencl_to_ram_async = NULL;
+	     if (ops->copy_methods->opencl_to_opencl_async)
+		  ((struct starpu_data_copy_methods *)ops->copy_methods)->opencl_to_opencl_async = NULL;
+#endif
+	}
+
 	/* fill the interface fields with the appropriate method */
 	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);

+ 1 - 4
tests/Makefile.am

@@ -192,6 +192,7 @@ noinst_PROGRAMS =				\
 	datawizard/gpu_register   		\
 	datawizard/wt_host			\
 	datawizard/wt_broadcast			\
+	datawizard/readonly			\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/invalid_tasks		\
@@ -282,8 +283,6 @@ endif
 if STARPU_USE_OPENCL
 datawizard_mpi_like_async_SOURCES +=			\
 	datawizard/opencl_codelet_unsigned_inc.c
-nobase_STARPU_OPENCL_DATA_DATA+= \
-	datawizard/opencl_codelet_unsigned_inc_kernel.cl
 endif
 
 datawizard_sync_and_notify_data_SOURCES =	\
@@ -539,8 +538,6 @@ perfmodels_non_linear_regression_based_SOURCES=\
 if STARPU_USE_OPENCL
 perfmodels_non_linear_regression_based_SOURCES+=\
 	perfmodels/opencl_memset.c
-nobase_STARPU_OPENCL_DATA_DATA += \
-	perfmodels/opencl_memset_kernel.cl
 endif
 
 showcheck:

+ 4 - 2
tests/datawizard/acquire_release.c

@@ -116,7 +116,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(token_handle);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 
@@ -133,7 +134,8 @@ enodev:
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;

+ 4 - 2
tests/datawizard/acquire_release2.c

@@ -115,7 +115,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(token_handle);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 
@@ -132,7 +133,8 @@ enodev:
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;

+ 205 - 55
tests/datawizard/gpu_register.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,42 +21,75 @@
 #include "../helper.h"
 #include "scal.h"
 
-int main(int argc, char **argv)
+static int
+submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 {
-	int ret;
+	int i, ret;
+
+	for (i = 0; i < pieces; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i%n;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			return -ENODEV;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	return 0;
+}
+
+static int
+find_a_worker(enum starpu_archtype type)
+{
+	int worker;
+	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	if (ret == 0)
+		return -ENODEV;
+	return worker;
+}
+
+static int
+check_result(unsigned *t, size_t size)
+{
+	int i;
+	for (i = 0; i < size; i++)
+	{
+		if (t[i] != i*2)
+		{
+			FPRINTF(stderr,"t[%d] is %d instead of %d\n", i, t[i], 2*i);
+			return 1;
+		}
+	}
+	return 0;
+}
+
 #ifdef STARPU_USE_CUDA
 #if CUDART_VERSION >= 4000
+static int
+test_cuda(void)
+{
+	int ret;
 	unsigned *foo_gpu;
 	unsigned *foo;
-	starpu_data_handle_t handle;
 	int n, i, size, pieces;
 	int devid;
-	unsigned workerid;
-	int chosen = -1;
+	int chosen;
 	cudaError_t cures;
-#endif
-#endif
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-#ifdef STARPU_USE_CUDA
-#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
-	/* TODO OpenCL, too */
-	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
-		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER) {
-			chosen = workerid;
-			break;
-		}
-	}
-
-	if (chosen == -1)
-		return STARPU_TEST_SKIPPED;
+	starpu_data_handle_t handle;
 
-#ifdef STARPU_USE_OPENCL
-	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
-#endif
+	/* Find a CUDA worker */
+	chosen = find_a_worker(STARPU_CUDA_WORKER);
+	if (chosen == -ENODEV)
+		return -ENODEV;
 
 	n = starpu_worker_get_count();
 	size = 10 * n;
@@ -92,21 +126,9 @@ int main(int argc, char **argv)
 
 	starpu_data_partition(handle, &f);
 
-	for (i = 0; i < pieces; i++) {
-		struct starpu_task *task = starpu_task_create();
-
-		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
-		task->cl = &scal_codelet;
-		task->execute_on_a_specific_worker = 1;
-		task->workerid = i%n;
-
-		ret = starpu_task_submit(task);
-		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	}
-
-	ret = starpu_task_wait_for_all();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	ret = submit_tasks(handle, pieces, n);
+	if (ret == -ENODEV)
+		return -ENODEV;
 
 	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
@@ -116,24 +138,152 @@ int main(int argc, char **argv)
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	starpu_shutdown();
+	return check_result(foo, size);
+}
+#endif
+#endif
 
-	for (i = 0; i < size; i++) {
-		if (foo[i] != i*2) {
-			fprintf(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
-			return EXIT_FAILURE;
-		}
-	}
+#ifdef STARPU_USE_OPENCL
+static int
+test_opencl(void)
+{
+	int i;
+	int ret;
+	int chosen;
+	int n;
+	int size;
+	int pieces;
+	cl_mem foo_gpu;
+	starpu_data_handle_t handle;
+
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	/* Find an OpenCL worker */
+	chosen = find_a_worker(STARPU_OPENCL_WORKER);
+	if (chosen == -ENODEV)
+		return -ENODEV;
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	int devid;
+	cl_int err;
+	cl_context context;
+	cl_command_queue queue;
+
+	devid = starpu_worker_get_devid(chosen);
+
+	starpu_opencl_get_context(devid, &context);
+	starpu_opencl_get_queue(devid, &queue);
+
+	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	unsigned int *foo = malloc(size*sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	err = clEnqueueWriteBuffer(queue,
+				foo_gpu,
+				CL_TRUE,
+				0,
+				size*sizeof(int),
+				foo,
+				0,
+				NULL,
+				NULL);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	
+	starpu_vector_data_register(&handle,
+				    starpu_worker_get_memory_node(chosen),
+				    (uintptr_t)foo_gpu,
+				    size,
+				    sizeof(int));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	/* Even with just one worker, split in at least two */
+	if (n == 1)
+		pieces = 2;
+	else
+		pieces = n;
 
-        return EXIT_SUCCESS;
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = pieces,
+	};
+
+	starpu_data_partition(handle, &f);
+	
+	ret = submit_tasks(handle, pieces, n);
+	if (ret == -ENODEV)
+		return -ENODEV;
 
-enodev:
+	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
+
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	err = clEnqueueReadBuffer(queue,
+		foo_gpu,
+		CL_TRUE,
+		0,
+		size*sizeof(*foo),
+		foo,
+		0,
+		NULL,
+		NULL);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+	return check_result(foo, size);
+}
+#endif /* !STARPU_USE_OPENCL */
+
+int main(int argc, char **argv)
+{
+	int skipped_cuda = 1, skipped_opencl = 1;
+	int ret;
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+	ret = test_cuda();
+	if (ret == 1)
+		goto fail;
+	else if (ret == 0)
+		skipped_cuda = 0;
+#endif
 #endif
+
+#ifdef STARPU_USE_OPENCL
+	ret = test_opencl();
+	if (ret == 1)
+		goto fail;
+	else if (ret == 0)
+		skipped_opencl = 0;
 #endif
-	fprintf(stderr, "WARNING: No one can execute this task\n");
-	/* yes, we do not perform the computation but we did detect that no one
- 	 * could perform the kernel, so this is not an error from StarPU */
+
+	starpu_shutdown();
+
+	if (skipped_cuda == 1 && skipped_opencl == 1)
+		return STARPU_TEST_SKIPPED;
+
+	return EXIT_SUCCESS;
+
+fail:
 	starpu_shutdown();
-	return STARPU_TEST_SKIPPED;
+	return EXIT_FAILURE;
 }

+ 7 - 0
tests/datawizard/increment_redux.c

@@ -87,6 +87,7 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 	h_dst += h_src;
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clFinish(queue);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
@@ -100,6 +101,7 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 	starpu_opencl_get_current_queue(&queue);
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 
@@ -167,6 +169,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 
@@ -248,6 +251,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
 		if (var != ntasks * (loop+1))
 		{
+			FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var, ntasks * (loop+1));
 			starpu_data_release(handle);
 			starpu_data_unregister(handle);
 			goto err;
@@ -257,7 +261,10 @@ int main(int argc, char **argv)
 
 	starpu_data_unregister(handle);
 	if (var != ntasks * nloops)
+	{
+		FPRINTF(stderr, "Value %u != Expected value %u\n", var, ntasks * (loop+1));
 		goto err;
+	}
 
 	starpu_shutdown();
 

+ 16 - 4
tests/datawizard/increment_redux_lazy.c

@@ -228,20 +228,32 @@ int main(int argc, char **argv)
 		ret = starpu_data_acquire(handle, STARPU_R);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
 		var = (unsigned*) starpu_variable_get_local_ptr(handle);
-		STARPU_ASSERT(*var == ntasks*(loop + 1));
 		starpu_data_release(handle);
+
+		if (*var != ntasks*(loop + 1))
+		{
+			ret = EXIT_FAILURE;
+			FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", *var, ntasks * (loop+1));
+			goto err;
+		}
 	}
 
 	ret = starpu_data_acquire(handle, STARPU_R);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
 	var = (unsigned*) starpu_variable_get_local_ptr(handle);
-	STARPU_ASSERT(*var == ntasks*nloops);
 	starpu_data_release(handle);
 	starpu_data_unregister(handle);
 
-	starpu_shutdown();
+	if (*var != ntasks*nloops)
+	{
+		ret = EXIT_FAILURE;
+		FPRINTF(stderr, "Value %u != Expected value %u\n", *var, ntasks * (loop+1));
+		goto err;
+	}
 
-	STARPU_RETURN(EXIT_SUCCESS);
+err:
+	starpu_shutdown();
+	STARPU_RETURN(ret);
 
 enodev:
 	starpu_data_unregister(handle);

+ 3 - 1
tests/datawizard/increment_redux_v2.c

@@ -86,6 +86,7 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 	h_dst += h_src;
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clFinish(queue);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
@@ -99,6 +100,7 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 	starpu_opencl_get_current_queue(&queue);
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 
@@ -166,6 +168,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 
@@ -282,7 +285,6 @@ int main(int argc, char **argv)
 		_STARPU_DEBUG("%d != %d\n", var, ntasks*nloops);
 		goto err;
 	}
-	
 
 	starpu_shutdown();
 

+ 2 - 1
tests/datawizard/interfaces/bcsr/bcsr_opencl.c

@@ -126,5 +126,6 @@ test_bcsr_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }

+ 2 - 1
tests/datawizard/interfaces/block/block_opencl.c

@@ -116,5 +116,6 @@ test_block_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }

+ 2 - 1
tests/datawizard/interfaces/csr/csr_opencl.c

@@ -126,5 +126,6 @@ test_csr_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }

+ 2 - 1
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -124,6 +124,7 @@ void test_matrix_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&matrix_program);
+        ret = starpu_opencl_unload_opencl(&matrix_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }
 

+ 2 - 1
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c

@@ -107,5 +107,6 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&opencl_conversion_program);
+        ret = starpu_opencl_unload_opencl(&opencl_conversion_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }

+ 2 - 1
tests/datawizard/interfaces/multiformat/multiformat_opencl.c

@@ -130,5 +130,6 @@ void test_multiformat_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&multiformat_program);
+        ret = starpu_opencl_unload_opencl(&multiformat_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }

+ 5 - 0
tests/datawizard/interfaces/test_interfaces.c

@@ -589,6 +589,11 @@ out:
 static void
 run_async(void)
 {
+	int async = starpu_disable_asynchronous_copy();
+	if (async == 1) {
+		FPRINTF(stderr, "Asynchronous copies have been disabled\n");
+		return;
+	}
 #ifdef STARPU_USE_CUDA
 	run_cuda(1);
 #endif /* !STARPU_USE_CUDA */

+ 2 - 1
tests/datawizard/interfaces/variable/variable_opencl.c

@@ -123,6 +123,7 @@ void test_variable_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 	return;
 }

+ 2 - 1
tests/datawizard/interfaces/vector/test_vector_opencl.c

@@ -126,5 +126,6 @@ test_vector_opencl_func(void *buffers[], void *args)
 	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 }

+ 1 - 1
tests/datawizard/reclaim.c

@@ -112,7 +112,7 @@ int main(int argc, char **argv)
 		if (host_ptr_array[i] == NULL)
 		{
 			mb = i;
-			fprintf(stderr, "Cannot allocate more than %d buffers\n", mb);
+			FPRINTF(stderr, "Cannot allocate more than %d buffers\n", mb);
 			break;
 		}
 		starpu_variable_data_register(&handle_array[i], 0, (uintptr_t)host_ptr_array[i], BLOCK_SIZE);

+ 5 - 2
tests/datawizard/scratch.c

@@ -114,7 +114,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(B_handle);
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 
@@ -125,6 +126,7 @@ int main(int argc, char **argv)
 	{
 		if (A[i] != NLOOPS)
 		{
+			FPRINTF(stderr, "Error: Incorrect value A[%d] = %u != %d\n", i, A[i], NLOOPS);
 			ret = EXIT_FAILURE;
 			break;
 		}
@@ -137,7 +139,8 @@ enodev:
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(B_handle);
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();
 	/* yes, we do not perform the computation but we did detect that no one

+ 13 - 0
tests/datawizard/scratch_opencl.c

@@ -29,6 +29,7 @@ void opencl_f(void *buffers[], void *args)
 	cl_event event;
 
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned elemsize = STARPU_VECTOR_GET_ELEMSIZE(buffers[0]);
 	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 	cl_mem tmp = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
 
@@ -39,6 +40,18 @@ void opencl_f(void *buffers[], void *args)
 	if (err != CL_SUCCESS)
 		STARPU_OPENCL_REPORT_ERROR(err);
 
+	err = clEnqueueCopyBuffer(queue,
+		val,
+		tmp,
+		0,           /* offset in val */
+		0,           /* offset in tmp */
+		n * elemsize,
+		0,           /* num_events_in_wait_list */
+		NULL,        /* event_wait_list */
+		NULL);       /* event */
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
 	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	err|= clSetKernelArg(kernel, 1, sizeof(tmp), &tmp);
 	err|= clSetKernelArg(kernel, 2, sizeof(n), &n);

+ 3 - 7
tests/datawizard/scratch_opencl_kernel.cl

@@ -18,12 +18,8 @@ __kernel void increment_vector_opencl(__global unsigned *val,
 				      __global unsigned *tmp,
 				      unsigned nx)
 {
-        const int tid = get_global_id(0);
-	const uint nthreads = get_local_size(0);
+        const int id = get_global_id(0);
 
-	int i;
-	for (i = tid; i < nx; i += nthreads)
-	{
-		val[i] = tmp[i] + 1;
-	}
+	if (id < nx)
+		val[id] = tmp[id] + 1;
 }

+ 4 - 3
tests/datawizard/user_interaction_implicit.c

@@ -72,6 +72,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 	/* do some cleanup */
+	ret = EXIT_SUCCESS;
 	for (b = 0; b < NBUFFERS; b++)
 	{
 		starpu_data_unregister(buffers[b].handle);
@@ -79,12 +80,12 @@ int main(int argc, char **argv)
 		/* check result */
 		if (buffers[b].val != NITER)
 		{
-			fprintf(stderr, "buffer[%u] = %u should be %d\n", b, buffers[b].val, NITER);
-			STARPU_ABORT();
+			FPRINTF(stderr, "buffer[%u] = %u should be %d\n", b, buffers[b].val, NITER);
+			ret = EXIT_FAILURE;
 		}
 	}
 
 	starpu_shutdown();
 
-	return EXIT_SUCCESS;
+	return ret;
 }

+ 1 - 1
tests/datawizard/write_only_tmp_buffer.c

@@ -72,7 +72,7 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	if (*buf != 42)
 	{
-		fprintf(stderr, "Value = %c (should be %c)\n", *buf, 42);
+		FPRINTF(stderr, "Value = %c (should be %c)\n", *buf, 42);
 		exit(-1);
 	}
 }

+ 6 - 4
tests/datawizard/wt_broadcast.c

@@ -128,14 +128,16 @@ int main(int argc, char **argv)
 
 	starpu_data_unregister(handle);
 
+	ret = EXIT_SUCCESS;
 	if (var != ntasks*nloops)
-		fprintf(stderr, "VAR is %d should be %d\n", var, ntasks);
-
-	STARPU_ASSERT(var == ntasks*nloops);
+	{
+		FPRINTF(stderr, "VAR is %d should be %d\n", var, ntasks);
+		ret = EXIT_FAILURE;
+	}
 
 	starpu_shutdown();
 
-	return EXIT_SUCCESS;
+	STARPU_RETURN(ret);
 
 enodev:
 	starpu_data_unregister(handle);

+ 6 - 4
tests/datawizard/wt_host.c

@@ -128,14 +128,16 @@ int main(int argc, char **argv)
 
 	starpu_data_unregister(handle);
 
+	ret = EXIT_SUCCESS;
 	if (var != ntasks*nloops)
-		fprintf(stderr, "VAR is %d should be %d\n", var, ntasks);
-
-	STARPU_ASSERT(var == ntasks*nloops);
+	{
+		ret = EXIT_FAILURE;
+		FPRINTF(stderr, "VAR is %d should be %d\n", var, ntasks);
+	}
 
 	starpu_shutdown();
 
-	return EXIT_SUCCESS;
+	STARPU_RETURN(ret);
 
 enodev:
 	starpu_data_unregister(handle);

+ 4 - 4
tests/errorcheck/invalid_tasks.c

@@ -31,10 +31,10 @@ static void dummy_func(void *descr[], void *arg)
 {
 }
 
-static struct starpu_codelet cuda_only_cl =
+static struct starpu_codelet gpu_only_cl =
 {
-	.where = STARPU_CUDA,
 	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
 	.model = NULL,
 	.nbuffers = 0
 };
@@ -56,7 +56,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	struct starpu_task *task = starpu_task_create();
-	task->cl = &cuda_only_cl;
+	task->cl = &gpu_only_cl;
 
 	/* Only a CUDA device could execute that task ! */
 	ret = starpu_task_submit(task);
@@ -66,7 +66,7 @@ int main(int argc, char **argv)
 	starpu_task_destroy(task);
 
 	struct starpu_task *task_specific = starpu_task_create();
-	task_specific->cl = &cuda_only_cl;
+	task_specific->cl = &gpu_only_cl;
 	task_specific->execute_on_a_specific_worker = 1;
 	task_specific->workerid = 0;
 

+ 1 - 1
tests/main/execute_on_a_specific_worker.c

@@ -55,7 +55,7 @@ static void callback(void *arg)
 static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 //	int id = starpu_worker_get_id();
-//	fprintf(stderr, "worker #%d\n", id);
+//	FPRINTF(stderr, "worker #%d\n", id);
 }
 
 static struct starpu_codelet cl_r =

+ 1 - 1
tests/parallel_tasks/explicit_combined_worker.c

@@ -34,7 +34,7 @@ static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 #if 0
 	int id = starpu_worker_get_id();
 	int combined_id = starpu_combined_worker_get_id();
-	fprintf(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
 #endif
 }
 

+ 1 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -34,7 +34,7 @@ static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 #if 0
 	int id = starpu_worker_get_id();
 	int combined_id = starpu_combined_worker_get_id();
-	fprintf(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
 #endif
 }
 

+ 1 - 1
tests/parallel_tasks/parallel_kernels_spmd.c

@@ -36,7 +36,7 @@ static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 	int id = starpu_worker_get_id();
 	int combined_id = starpu_combined_worker_get_id();
 	int rank = starpu_combined_worker_get_rank();
-	fprintf(stderr, "worker id %d - combined id %d - worker size %d - SPMD rank %d\n", id, combined_id, worker_size, rank);
+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d - SPMD rank %d\n", id, combined_id, worker_size, rank);
 #endif
 }
 

+ 2 - 2
tests/parallel_tasks/spmd_pgreedy.c

@@ -33,14 +33,14 @@ static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 	int worker_size = starpu_combined_worker_get_size();
 	STARPU_ASSERT(worker_size > 0);
 
-//	fprintf(stderr, "WORKERSIZE : %d\n", worker_size);
+//	FPRINTF(stderr, "WORKERSIZE : %d\n", worker_size);
 
 	usleep(1000/worker_size);
 #if 0
 	int id = starpu_worker_get_id();
 	int combined_id = starpu_combined_worker_get_id();
 	int rank = starpu_combined_worker_get_rank();
-	fprintf(stderr, "worker id %d - combined id %d - worker size %d - SPMD rank %d\n", id, combined_id, worker_size, rank);
+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d - SPMD rank %d\n", id, combined_id, worker_size, rank);
 #endif
 }
 

+ 2 - 1
tests/perfmodels/non_linear_regression_based.c

@@ -126,7 +126,8 @@ int main(int argc, char **argv)
 	}
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 
 	starpu_shutdown();

+ 2 - 1
tests/perfmodels/opencl_memset.c

@@ -21,7 +21,7 @@ extern struct starpu_opencl_program opencl_program;
 
 void memset_opencl(void *buffers[], void *args)
 {
-	(void) *args;
+	(void) args;
 	int id, devid;
         cl_int err;
 	cl_kernel kernel;
@@ -39,6 +39,7 @@ void memset_opencl(void *buffers[], void *args)
 		STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
+	err|= clSetKernelArg(kernel, 1, sizeof(n), &n);
 	if (err)
 		STARPU_OPENCL_REPORT_ERROR(err);
 

+ 2 - 1
tests/perfmodels/regression_based.c

@@ -190,7 +190,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(handle);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_unload_opencl(&opencl_program);
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
 	starpu_shutdown();