Andra Hugo 13 年 前
コミット
5f656f76cf
共有61 個のファイルを変更した1142 個の追加489 個の削除を含む
  1. 5 1
      ChangeLog
  2. 4 0
      Makefile.am
  3. 4 4
      STARPU-VERSION
  4. 17 3
      configure.ac
  5. 30 30
      doc/chapters/basic-api.texi
  6. 4 14
      doc/chapters/basic-examples.texi
  7. 91 29
      doc/chapters/c-extensions.texi
  8. 9 7
      examples/cpp/incrementer_cpp.cpp
  9. 16 17
      examples/filters/custom_mf/custom_interface.c
  10. 32 32
      examples/interface/complex_interface.c
  11. 5 0
      examples/reductions/minmax_reduction.c
  12. 9 12
      gcc-plugin/examples/vector_scal/vector_scal.c
  13. 193 50
      gcc-plugin/src/starpu.c
  14. 22 1
      gcc-plugin/tests/base.c
  15. 3 2
      gcc-plugin/tests/lib-user.c
  16. 5 7
      gcc-plugin/tests/my-lib.c
  17. 4 4
      gcc-plugin/tests/my-lib.h
  18. 3 5
      gcc-plugin/tests/pointer-tasks.c
  19. 3 4
      gcc-plugin/tests/scalar-tasks.c
  20. 2 9
      gcc-plugin/tests/task-errors.c
  21. 410 0
      include/pthread_win32/pthread.h
  22. 67 0
      include/pthread_win32/semaphore.h
  23. 1 1
      include/starpu_deprecated_api.h
  24. 2 6
      include/starpu_opencl.h
  25. 2 1
      socl/src/gc.c
  26. 3 0
      socl/src/task.c
  27. 0 2
      src/core/task.c
  28. 0 15
      src/core/topology.c
  29. 1 0
      src/datawizard/coherency.c
  30. 8 6
      src/datawizard/interfaces/bcsr_interface.c
  31. 10 10
      src/datawizard/interfaces/block_interface.c
  32. 8 6
      src/datawizard/interfaces/csr_interface.c
  33. 6 6
      src/datawizard/interfaces/matrix_interface.c
  34. 16 16
      src/datawizard/interfaces/multiformat_interface.c
  35. 4 4
      src/datawizard/interfaces/variable_interface.c
  36. 5 5
      src/datawizard/interfaces/vector_interface.c
  37. 4 0
      src/drivers/cuda/driver_cuda.c
  38. 25 81
      src/drivers/opencl/driver_opencl.c
  39. 2 0
      src/top/starpu_top_core.h
  40. 1 11
      tests/datawizard/acquire_release_opencl.c
  41. 10 10
      tests/datawizard/data_invalidation.c
  42. 24 19
      tests/datawizard/gpu_register.c
  43. 1 1
      tests/datawizard/interfaces/bcsr/bcsr_interface.c
  44. 1 1
      tests/datawizard/interfaces/block/block_interface.c
  45. 1 1
      tests/datawizard/interfaces/csr/csr_interface.c
  46. 1 1
      tests/datawizard/interfaces/matrix/matrix_interface.c
  47. 1 1
      tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release.c
  48. 2 1
      tests/datawizard/interfaces/multiformat/multiformat_interface.c
  49. 2 1
      tests/datawizard/interfaces/variable/variable_interface.c
  50. 1 1
      tests/datawizard/interfaces/vector/test_vector_interface.c
  51. 1 1
      tests/datawizard/interfaces/void/void_interface.c
  52. 20 20
      tests/datawizard/lazy_allocation.c
  53. 6 3
      tests/datawizard/manual_reduction.c
  54. 2 2
      tests/datawizard/mpi_like.c
  55. 1 11
      tests/datawizard/opencl_codelet_unsigned_inc.c
  56. 1 1
      tests/datawizard/opencl_codelet_unsigned_inc_kernel.cl
  57. 2 1
      tests/datawizard/reclaim.c
  58. 14 3
      tests/errorcheck/starpu_init_noworker.c
  59. 10 6
      tests/helper/starpu_data_cpy.c
  60. 3 2
      tests/main/task_wait_api.c
  61. 2 1
      tools/dev/internal/rename_internal.sed

+ 5 - 1
ChangeLog

@@ -14,7 +14,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-StarPU 1.0 (svn revision xxxx)
+StarPU 1.0.0 (svn revision 6306)
 ==============================================
 The extensions-again release
 
@@ -61,6 +61,10 @@ Changes:
         To port code to use new names (this is not mandatory), the
         tools/dev/rename.sh script can be used, and pkg-config starpu-1.0 should
         be used.
+  * The communication cost in the heft and dmda scheduling strategies now
+        take into account the contention brought by the number of GPUs. This
+        changes the meaning of the beta factor, whose default 1.0 value should
+        now be good enough in most case.
 
 Small features:
   * Allow users to disable asynchronous data transfers between CPUs and

+ 4 - 0
Makefile.am

@@ -71,6 +71,10 @@ versinclude_HEADERS = 				\
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h
 
+noinst_HEADERS = \
+	include/pthread_win32/pthread.h		\
+	include/pthread_win32/semaphore.h
+
 if BUILD_STARPU_TOP
 starpu-top/starpu_top$(EXEEXT): all-local
 all-local:

+ 4 - 4
STARPU-VERSION

@@ -3,19 +3,19 @@
 # Versioning (SONAMEs) for StarPU libraries.
 
 # Libtool interface versioning (info "(libtool) Versioning").
-LIBSTARPU_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPU_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change
 LIBSTARPU_INTERFACE_AGE=0	# set to CURRENT - PREVIOUS interface
 STARPU_EFFECTIVE_VERSION=1.0
 
-LIBSTARPUFFT_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUFFT_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPUFFT_INTERFACE_REVISION=0	# increment upon implementation change
 LIBSTARPUFFT_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
 
-LIBSTARPUMPI_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUMPI_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPUMPI_INTERFACE_REVISION=0	# increment upon implementation change
 LIBSTARPUMPI_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
 
-LIBSOCL_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSOCL_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSOCL_INTERFACE_REVISION=0	# increment upon implementation change
 LIBSOCL_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface

+ 17 - 3
configure.ac

@@ -16,7 +16,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AC_INIT([StarPU],1.0.0rc3, [starpu-devel@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU],1.0.0, [starpu-devel@lists.gforge.inria.fr], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 
@@ -116,7 +116,21 @@ case "${srcdir}" in
      AC_SUBST(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})") ;;
 esac
 
-AC_CHECK_LIB([pthread], [pthread_create])
+
+case "$target" in
+*-*-mingw*|*-*-cygwin*)
+    AC_ARG_ENABLE(native-winthreads, [AS_HELP_STRING([--enable-native-winthreads],
+    				   [Use native windows threads instead of pthread])],
+    				   enable_native_winthreads=$enableval, enable_native_winthreads=no)
+    ;;
+esac
+if test x"$enable_native_winthreads" = xyes
+then
+    CPPFLAGS+=" -I$STARPU_SRC_DIR/include/pthread_win32 "
+else
+    AC_CHECK_LIB([pthread], [pthread_create])
+fi
+
 AC_COMPILE_IFELSE(
   AC_LANG_PROGRAM([[
     #include <pthread.h>
@@ -1044,7 +1058,7 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
-running_mpi_check=0
+running_mpi_check=no
 if test -d "$srcdir/.svn" ; then
     running_mpi_check=yes
 fi

ファイルの差分が大きいため隠しています
+ 30 - 30
doc/chapters/basic-api.texi


+ 4 - 14
doc/chapters/basic-examples.texi

@@ -71,12 +71,8 @@ has a single implementation for CPU:
 /* Task declaration.  */
 static void my_task (int x) __attribute__ ((task));
 
-/* Declaration of the CPU implementation of `my_task'.  */
-static void my_task_cpu (int x)
-   __attribute__ ((task_implementation ("cpu", my_task)));
-
-/* Definition of said CPU implementation.  */
-static void my_task_cpu (int x)
+/* Definition of the CPU implementation of `my_task'.  */
+static void my_task (int x)
 @{
   printf ("Hello, world!  With x = %d\n", x);
 @}
@@ -331,19 +327,13 @@ has to be defined:
 @cartouche
 @smallexample
 /* Declare the `vector_scal' task.  */
-
 static void vector_scal (unsigned size, float vector[size],
                          float factor)
   __attribute__ ((task));
 
-/* Declare and define the standard CPU implementation.  */
-
-static void vector_scal_cpu (unsigned size, float vector[size],
-                             float factor)
-  __attribute__ ((task_implementation ("cpu", vector_scal)));
-
+/* Define the standard CPU implementation.  */
 static void
-vector_scal_cpu (unsigned size, float vector[size], float factor)
+vector_scal (unsigned size, float vector[size], float factor)
 @{
   unsigned i;
   for (i = 0; i < size; i++)

+ 91 - 29
doc/chapters/c-extensions.texi

@@ -87,8 +87,11 @@ The following function attributes are provided:
 @item task
 @cindex @code{task} attribute
 Declare the given function as a StarPU task.  Its return type must be
-@code{void}, and it must not be defined---instead, a definition will
-automatically be provided by the compiler.
+@code{void}.  When a function declared as @code{task} has a user-defined
+body, that body is interpreted as the @dfn{implicit definition of the
+task's CPU implementation} (see example below).  In all cases, the
+actual definition of a task's body is automatically generated by the
+compiler.
 
 Under the hood, declaring a task leads to the declaration of the
 corresponding @code{codelet} (@pxref{Codelet and Tasks}).  If one or
@@ -157,6 +160,44 @@ A @code{matmult} task is defined; it has only one implementation,
 @var{B} are input buffers, whereas @var{C} is considered an input/output
 buffer.
 
+@cindex implicit task CPU implementation
+For convenience, when a function declared with the @code{task} attribute
+has a user-defined body, that body is assumed to be that of the CPU
+implementation of a task, which we call an @dfn{implicit task CPU
+implementation}.  Thus, the above snippet can be simplified like this:
+
+@cartouche
+@smallexample
+#define __output  __attribute__ ((output))
+
+static void matmul (const float *A, const float *B,
+                    __output float *C,
+                    unsigned nx, unsigned ny, unsigned nz)
+  __attribute__ ((task));
+
+/* Implicit definition of the CPU implementation of the
+   `matmul' task.  */
+static void
+matmul (const float *A, const float *B, __output float *C,
+        unsigned nx, unsigned ny, unsigned nz)
+@{
+  unsigned i, j, k;
+
+  for (j = 0; j < ny; j++)
+    for (i = 0; i < nx; i++)
+      @{
+        for (k = 0; k < nz; k++)
+          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
+      @}
+@}
+@end smallexample
+@end cartouche
+
+@noindent
+Use of implicit CPU task implementations as above has the advantage that
+the code is valid sequential code when StarPU's GCC plug-in is not used
+(@pxref{Conditional Extensions}).
+
 CUDA and OpenCL implementations can be declared in a similar way:
 
 @cartouche
@@ -349,36 +390,37 @@ The code below illustrates how to define a task and its implementations
 in a way that allows it to be compiled without the GCC plug-in:
 
 @smallexample
-/* The macros below abstract over the attributes specific to
-   StarPU-GCC and the name of the CPU implementation.  */
-#ifdef STARPU_GCC_PLUGIN
-# define __task  __attribute__ ((task))
-# define CPU_TASK_IMPL(task)  task ## _cpu
-#else
-# define __task
-# define CPU_TASK_IMPL(task)  task
-#endif
+/* This program is valid, whether or not StarPU's GCC plug-in
+   is being used.  */
 
 #include <stdlib.h>
 
-static void matmul (const float *A, const float *B, float *C,
-                    unsigned nx, unsigned ny, unsigned nz) __task;
-
-#ifdef STARPU_GCC_PLUGIN
+/* The attribute below is ignored when GCC is not used.  */
+static void matmul (const float *A, const float *B, float * C,
+                    unsigned nx, unsigned ny, unsigned nz)
+  __attribute__ ((task));
 
-static void matmul_cpu (const float *A, const float *B, float *C,
-                        unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task_implementation ("cpu", matmul)));
+static void
+matmul (const float *A, const float *B, float * C,
+        unsigned nx, unsigned ny, unsigned nz)
+@{
+  /* Code of the CPU kernel here...  */
+@}
 
-#endif
+#ifdef STARPU_GCC_PLUGIN
+/* Optional OpenCL task implementation.  */
 
+static void matmul_opencl (const float *A, const float *B, float * C,
+                           unsigned nx, unsigned ny, unsigned nz)
+  __attribute__ ((task_implementation ("opencl", matmul)));
 
 static void
-CPU_TASK_IMPL (matmul) (const float *A, const float *B, float *C,
-                        unsigned nx, unsigned ny, unsigned nz)
+matmul_opencl (const float *A, const float *B, float * C,
+               unsigned nx, unsigned ny, unsigned nz)
 @{
-  /* Code of the CPU kernel here...  */
+  /* Code that invokes the OpenCL kernel here...  */
 @}
+#endif
 
 int
 main (int argc, char *argv[])
@@ -395,7 +437,7 @@ main (int argc, char *argv[])
 
   /* When StarPU-GCC is used, the call below is asynchronous;
      otherwise, it is synchronous.  */
-  matmul (A, B, C, 123, 42, 7);
+  matmul ((float *) A, (float *) B, (float *) C, 123, 42, 7);
 
 #pragma starpu wait
 #pragma starpu shutdown
@@ -404,12 +446,32 @@ main (int argc, char *argv[])
 @}
 @end smallexample
 
-Note that attributes such as @code{task} are simply ignored by GCC when
-the StarPU plug-in is not loaded, so the @code{__task} macro could be
-omitted altogether.  However, @command{gcc -Wall} emits a warning for
-unknown attributes, which can be inconvenient, and other compilers may
-be unable to parse the attribute syntax.  Thus, using macros such as
-@code{__task} above is recommended.
+@noindent
+The above program is a valid StarPU program when StarPU's GCC plug-in is
+used; it is also a valid sequential program when the plug-in is not
+used.
+
+Note that attributes such as @code{task} as well as @code{starpu}
+pragmas are simply ignored by GCC when the StarPU plug-in is not loaded.
+However, @command{gcc -Wall} emits a warning for unknown attributes and
+pragmas, which can be inconvenient.  In addition, other compilers may be
+unable to parse the attribute syntax@footnote{In practice, Clang and
+several proprietary compilers implement attributes.}, so you may want to
+wrap attributes in macros like this:
+
+@smallexample
+/* Use the `task' attribute only when StarPU's GCC plug-in
+   is available.   */
+#ifdef STARPU_GCC_PLUGIN
+# define __task  __attribute__ ((task))
+#else
+# define __task
+#endif
+
+static void matmul (const float *A, const float *B, float *C,
+                    unsigned nx, unsigned ny, unsigned nz) __task;
+@end smallexample
+
 
 @c Local Variables:
 @c TeX-master: "../starpu.texi"

+ 9 - 7
examples/cpp/incrementer_cpp.cpp

@@ -89,17 +89,19 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
                 float_array[1], float_array[2], float_array[3]);
 
-	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
-	{
-		FPRINTF(stderr, "Incorrect result\n");
-		ret = 1;
-	}
-
 #ifdef STARPU_USE_OPENCL
 	ret = starpu_opencl_unload_opencl(&opencl_program);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
 #endif
+
 	starpu_shutdown();
 
-	return ret;
+	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
+	{
+		FPRINTF(stderr, "Incorrect result\n");
+		return EXIT_FAILURE;
+	}
+
+
+	return EXIT_SUCCESS;
 }

+ 16 - 17
examples/filters/custom_mf/custom_interface.c

@@ -542,14 +542,14 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 				size, CL_MEM_READ_WRITE);
 		assert(ret == CL_SUCCESS);
 	}
-	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_custom->cpu_ptr,
-							  src_node,
-							  dst_custom->cpu_ptr,
-							  dst_node,
-							  size,
-							  0,
-							  NULL,
-							  &ret);
+	err = starpu_opencl_copy_ram_to_opencl(src_custom->cpu_ptr,
+					       src_node,
+					       dst_custom->cpu_ptr,
+					       dst_node,
+					       size,
+					       0,
+					       NULL,
+					       &ret);
 	assert(err == 0);
 	return 0;
 }
@@ -584,15 +584,14 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 		assert(dst_custom->opencl_ptr != NULL);
 	}
 
-	err = starpu_opencl_copy_opencl_to_ram_async_sync(
-			src_custom->opencl_ptr,
-			src_node,
-			dst_custom->opencl_ptr,
-			dst_node,
-			size,
-			0,
-			NULL,
-			&ret);
+	err = starpu_opencl_copy_opencl_to_ram(src_custom->opencl_ptr,
+					       src_node,
+					       dst_custom->opencl_ptr,
+					       dst_node,
+					       size,
+					       0,
+					       NULL,
+					       &ret);
 	assert(err == 0);
 	return 0;
 }

+ 32 - 32
examples/interface/complex_interface.c

@@ -204,25 +204,25 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
 
 	cl_int err;
 
-	err = starpu_opencl_copy_ram_to_opencl(
-		src_complex->real,
-		src_node,
-		(cl_mem) dst_complex->real,
-		dst_node,
-		src_complex->nx * sizeof(src_complex->real[0]),
-		0,
-		NULL);
+	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
+					       src_node,
+					       (cl_mem) dst_complex->real,
+					       dst_node,
+					       src_complex->nx * sizeof(src_complex->real[0]),
+					       0,
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_ram_to_opencl(
-		src_complex->imaginary,
-		src_node,
-		(cl_mem) dst_complex->imaginary,
-		dst_node,
-		src_complex->nx * sizeof(src_complex->imaginary[0]),
-		0,
-		NULL);
+	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
+					       src_node,
+					       (cl_mem) dst_complex->imaginary,
+					       dst_node,
+					       src_complex->nx * sizeof(src_complex->imaginary[0]),
+					       0,
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -236,25 +236,25 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
 	struct starpu_complex_interface *dst_complex = dst_interface;
 
 	cl_int err;
-	err = starpu_opencl_copy_opencl_to_ram(
-		(cl_mem) src_complex->real,
-		src_node,
-		dst_complex->real,
-		dst_node,
-		src_complex->nx * sizeof(src_complex->real[0]),
-		0,
-		NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
+					       src_node,
+					       dst_complex->real,
+					       dst_node,
+					       src_complex->nx * sizeof(src_complex->real[0]),
+					       0,
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_opencl_to_ram(
-		(cl_mem) src_complex->imaginary,
-		src_node,
-		dst_complex->imaginary,
-		dst_node,
-		src_complex->nx * sizeof(src_complex->imaginary[0]),
-		0,
-		NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
+					       src_node,
+					       dst_complex->imaginary,
+					       dst_node,
+					       src_complex->nx * sizeof(src_complex->imaginary[0]),
+					       0,
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 

+ 5 - 0
examples/reductions/minmax_reduction.c

@@ -19,8 +19,13 @@
 #include <limits.h>
 #include <starpu.h>
 
+#ifdef STARPU_SLOW_MACHINE
+static unsigned nblocks = 512;
+static unsigned entries_per_bock = 64;
+#else
 static unsigned nblocks = 8192;
 static unsigned entries_per_bock = 1024;
+#endif
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 

+ 9 - 12
gcc-plugin/examples/vector_scal/vector_scal.c

@@ -13,13 +13,12 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-#ifndef STARPU_GCC_PLUGIN
-# error must be compiled with the StarPU GCC plug-in
-#endif
 
-/* This examples showcases features of the StarPU GCC plug-in.  It defines a
+/* This example showcases features of the StarPU GCC plug-in.  It defines a
    "vector scaling" task with multiple CPU implementations, an OpenCL
-   implementation, and a CUDA implementation.   */
+   implementation, and a CUDA implementation.
+
+   Compiling it without `-fplugin=starpu.so' yields valid sequential code.  */
 
 #include <math.h>
 #include <stdbool.h>
@@ -32,11 +31,9 @@
 static void vector_scal (unsigned int size, float vector[size], float factor)
   __attribute__ ((task));
 
-static void vector_scal_cpu (unsigned int size, float vector[size], float factor)
-  __attribute__ ((task_implementation ("cpu", vector_scal)));
-
+/* The CPU implementation.  */
 static void
-vector_scal_cpu (unsigned int size, float vector[size], float factor)
+vector_scal (unsigned int size, float vector[size], float factor)
 {
   unsigned int i;
   for (i = 0; i < size; i++)
@@ -44,7 +41,7 @@ vector_scal_cpu (unsigned int size, float vector[size], float factor)
 }
 
 
-#ifdef __SSE__
+#if defined STARPU_GCC_PLUGIN && defined __SSE__
 /* The SSE-capable CPU implementation.  */
 
 #include <xmmintrin.h>
@@ -78,7 +75,7 @@ vector_scal_sse (unsigned int size, float vector[size], float factor)
 
 /* Declaration and definition of the OpenCL implementation.  */
 
-#ifdef STARPU_USE_OPENCL
+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
 
 #include <starpu_opencl.h>
 
@@ -166,7 +163,7 @@ main (void)
 
 #pragma starpu initialize
 
-#ifdef STARPU_USE_OPENCL
+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
   starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
 				       &cl_programs, "");
 #endif

+ 193 - 50
gcc-plugin/src/starpu.c

@@ -133,11 +133,15 @@ static tree unpack_fn, data_lookup_fn;
 /* Forward declarations.  */
 
 static tree build_codelet_declaration (tree task_decl);
+static tree build_cpu_codelet_identifier (const_tree task);
 static void define_task (tree task_decl);
 static tree build_pointer_lookup (tree pointer);
 
 static bool task_p (const_tree decl);
 static bool task_implementation_p (const_tree decl);
+static tree task_implementation_task (const_tree task_impl);
+static bool implicit_cpu_task_implementation_p (const_tree fn);
+
 
 static int task_implementation_target_to_int (const_tree target);
 
@@ -990,6 +994,31 @@ register_pragmas (void *gcc_data, void *user_data)
 /* Attributes.  */
 
 
+/* Turn FN into a task, and push its associated codelet declaration.  */
+
+static void
+taskify_function (tree fn)
+{
+  gcc_assert (TREE_CODE (fn) == FUNCTION_DECL);
+
+  /* Add a `task' attribute and an empty `task_implementation_list'
+     attribute.  */
+  DECL_ATTRIBUTES (fn) =
+    tree_cons (get_identifier (task_implementation_list_attribute_name),
+	       NULL_TREE,
+	       tree_cons (get_identifier (task_attribute_name), NULL_TREE,
+			  DECL_ATTRIBUTES (fn)));
+
+  /* Push a declaration for the corresponding `struct starpu_codelet' object and
+     add it as an attribute of FN.  */
+  tree cl = build_codelet_declaration (fn);
+  DECL_ATTRIBUTES (fn) =
+    tree_cons (get_identifier (task_codelet_attribute_name), cl,
+	       DECL_ATTRIBUTES (fn));
+
+  pushdecl (cl);
+}
+
 /* Handle the `task' function attribute.  */
 
 static tree
@@ -1020,23 +1049,8 @@ handle_task_attribute (tree *node, tree name, tree args,
 	error_at (DECL_SOURCE_LOCATION (fn),
 		  "maximum number of pointer parameters exceeded");
 
-      /* This is a function declaration for something local to this
-	 translation unit, so add the `task' attribute to FN.  */
-      *no_add_attrs = false;
-
-      /* Add an empty `task_implementation_list' attribute.  */
-      DECL_ATTRIBUTES (fn) =
-	tree_cons (get_identifier (task_implementation_list_attribute_name),
-		   NULL_TREE,
-		   NULL_TREE);
-
-      /* Push a declaration for the corresponding `struct starpu_codelet' object and
-	 add it as an attribute of FN.  */
-      tree cl = build_codelet_declaration (fn);
-      DECL_ATTRIBUTES (fn) =
-	tree_cons (get_identifier (task_codelet_attribute_name), cl,
-		   DECL_ATTRIBUTES (fn));
-      pushdecl (cl);
+      /* Turn FN into an actual task.  */
+      taskify_function (fn);
     }
 
   /* Lookup & cache function declarations for later reuse.  */
@@ -1067,6 +1081,7 @@ validate_opencl_argument_type (location_t loc, const_tree type)
 	      /* Scalar types defined in OpenCL 1.2.  See
 		 <http://www.khronos.org/files/opencl-1-2-quick-reference-card.pdf>.  */
 	      { "char", "cl_char" },
+	      { "signed char", "cl_char" },
 	      { "unsigned char", "cl_uchar" },
 	      { "uchar", "cl_uchar" },
 	      { "short int", "cl_short" },
@@ -1135,6 +1150,43 @@ validate_opencl_argument_type (location_t loc, const_tree type)
     }
 }
 
+/* Add FN to the list of implementations of TASK_DECL.  */
+
+static void
+add_task_implementation (tree task_decl, tree fn, const_tree where)
+{
+  location_t loc;
+  tree attr, impls;
+
+  attr = lookup_attribute (task_implementation_list_attribute_name,
+			   DECL_ATTRIBUTES (task_decl));
+  gcc_assert (attr != NULL_TREE);
+
+  gcc_assert (TREE_CODE (where) == STRING_CST);
+
+  loc = DECL_SOURCE_LOCATION (fn);
+
+  impls = tree_cons (NULL_TREE, fn, TREE_VALUE (attr));
+  TREE_VALUE (attr) = impls;
+
+  TREE_USED (fn) = true;
+
+  /* Check the `where' argument to raise a warning if needed.  */
+  if (task_implementation_target_to_int (where) == 0)
+    warning_at (loc, 0,
+		"unsupported target %E; task implementation won't be used",
+		where);
+  else if (task_implementation_target_to_int (where) == STARPU_OPENCL)
+    {
+      local_define (void, validate, (tree t))
+	{
+	  validate_opencl_argument_type (loc, t);
+	};
+
+      for_each (validate, TYPE_ARG_TYPES (TREE_TYPE (fn)));
+    }
+}
+
 /* Handle the `task_implementation (WHERE, TASK)' attribute.  WHERE is a
    string constant ("cpu", "cuda", etc.), and TASK is the identifier of a
    function declared with the `task' attribute.  */
@@ -1155,6 +1207,12 @@ handle_task_implementation_attribute (tree *node, tree name, tree args,
   where = TREE_VALUE (args);
   task_decl = TREE_VALUE (TREE_CHAIN (args));
 
+  if (implicit_cpu_task_implementation_p (task_decl))
+    /* TASK_DECL is actually a CPU implementation.  Implicit CPU task
+       implementations can lead to this situation, because the task is
+       renamed and modified to become a CPU implementation.  */
+    task_decl = task_implementation_task (task_decl);
+
   loc = DECL_SOURCE_LOCATION (fn);
 
   /* Get rid of the `task_implementation' attribute by default so that FN
@@ -1184,30 +1242,7 @@ handle_task_implementation_attribute (tree *node, tree name, tree args,
   else
     {
       /* Add FN to the list of implementations of TASK_DECL.  */
-
-      tree attr, impls;
-
-      attr = lookup_attribute (task_implementation_list_attribute_name,
-			       DECL_ATTRIBUTES (task_decl));
-      impls = tree_cons (NULL_TREE, fn, TREE_VALUE (attr));
-      TREE_VALUE (attr) = impls;
-
-      TREE_USED (fn) = TREE_USED (task_decl);
-
-      /* Check the `where' argument to raise a warning if needed.  */
-      if (task_implementation_target_to_int (where) == 0)
-	warning_at (loc, 0,
-		    "unsupported target %E; task implementation won't be used",
-		    where);
-      else if (task_implementation_target_to_int (where) == STARPU_OPENCL)
-	{
-	  local_define (void, validate, (tree t))
-	  {
-	    validate_opencl_argument_type (loc, t);
-	  };
-
-	  for_each (validate, TYPE_ARG_TYPES (TREE_TYPE (fn)));
-	}
+      add_task_implementation (task_decl, fn, where);
 
       /* Keep the attribute.  */
       *no_add_attrs = false;
@@ -1455,7 +1490,7 @@ task_where (const_tree task_decl)
 static tree
 task_implementation_task (const_tree task_impl)
 {
-  tree impl_attr, args;
+  tree impl_attr, args, task;
 
   gcc_assert (TREE_CODE (task_impl) == FUNCTION_DECL);
 
@@ -1465,7 +1500,13 @@ task_implementation_task (const_tree task_impl)
 
   args = TREE_VALUE (impl_attr);
 
-  return TREE_VALUE (TREE_CHAIN (args));
+  task = TREE_VALUE (TREE_CHAIN (args));
+  if (task_implementation_p (task))
+    /* TASK is an implicit CPU task implementation, so return its real
+       task.  */
+    return task_implementation_task (task);
+
+  return task;
 }
 
 /* Return the FUNCTION_DECL of the wrapper generated for TASK_IMPL.  */
@@ -1484,6 +1525,23 @@ task_implementation_wrapper (const_tree task_impl)
   return TREE_VALUE (attr);
 }
 
+/* Return true when FN is an implicit CPU task implementation.  */
+
+static bool
+implicit_cpu_task_implementation_p (const_tree fn)
+{
+  if (task_implementation_p (fn)
+      && task_implementation_where (fn) == STARPU_CPU)
+    {
+      /* XXX: Hackish heuristic.  */
+      const_tree cpu_id;
+      cpu_id = build_cpu_codelet_identifier (task_implementation_task (fn));
+      return cpu_id == DECL_NAME (fn);
+    }
+
+  return false;
+}
+
 /* Return true when VAR_DECL has the `heap_allocated' attribute.  */
 
 static bool
@@ -2063,6 +2121,27 @@ declare_codelet (tree task_decl)
   return cl_decl;
 }
 
+/* Return the identifier for an automatically-generated CPU codelet of
+   TASK.  */
+
+static tree
+build_cpu_codelet_identifier (const_tree task)
+{
+  static const char suffix[] = ".cpu_implementation";
+
+  tree id;
+  char *cl_name;
+  const char *task_name;
+
+  id = DECL_NAME (task);
+  task_name = IDENTIFIER_POINTER (id);
+
+  cl_name = (char *) alloca (IDENTIFIER_LENGTH (id) + strlen (suffix) + 1);
+  memcpy (cl_name, task_name, IDENTIFIER_LENGTH (id));
+  strcpy (&cl_name[IDENTIFIER_LENGTH (id)], suffix);
+
+  return get_identifier (cl_name);
+}
 
 static void
 handle_pre_genericize (void *gcc_data, void *user_data)
@@ -2072,10 +2151,56 @@ handle_pre_genericize (void *gcc_data, void *user_data)
   gcc_assert (TREE_CODE (fn) == FUNCTION_DECL);
 
   if (task_p (fn) && TREE_STATIC (fn))
-    /* The user defined a body for task FN, which is forbidden.  */
-    error_at (DECL_SOURCE_LOCATION (fn),
-	      "task %qE must not have a body", DECL_NAME (fn));
-  else if (task_implementation_p (fn))
+    {
+      /* The user defined a body for task FN, which we interpret as being the
+	 body of an implicit CPU task implementation for FN.  Thus, rename FN
+	 and turn it into the "cpu" implementation of a task that we create
+	 under FN's original name (this is easier than moving the body to a
+	 different function, which would require traversing the body to
+	 rewrite all references to FN to point to the new function.)  Later,
+	 `lower_starpu' rewrites calls to FN as calls to the newly created
+	 task.  */
+
+      tree task_name = DECL_NAME (fn);
+
+      tree cpu_impl = fn;
+      DECL_NAME (cpu_impl) = build_cpu_codelet_identifier (fn);
+      if (verbose_output_p)
+	inform (DECL_SOURCE_LOCATION (fn),
+		"implicit CPU implementation renamed from %qE to %qE",
+		task_name, DECL_NAME (cpu_impl));
+
+      tree task = build_decl (DECL_SOURCE_LOCATION (fn), FUNCTION_DECL,
+			      task_name, TREE_TYPE (fn));
+
+      TREE_PUBLIC (task) = TREE_PUBLIC (fn);
+      TREE_PUBLIC (cpu_impl) = false;
+
+      taskify_function (task);
+
+      /* Inherit the task implementation list from FN.  */
+      tree impls = lookup_attribute (task_implementation_list_attribute_name,
+				     DECL_ATTRIBUTES (fn));
+      gcc_assert (impls != NULL_TREE);
+      impls = TREE_VALUE (impls);
+
+      DECL_ATTRIBUTES (task) =
+	tree_cons (get_identifier (task_implementation_list_attribute_name),
+		   impls, DECL_ATTRIBUTES (task));
+
+      /* Make CPU_IMPL an implementation of FN.  */
+      DECL_ATTRIBUTES (cpu_impl) =
+	tree_cons (get_identifier (task_implementation_attribute_name),
+		   tree_cons (NULL_TREE, build_string (3, "cpu"),
+			      tree_cons (NULL_TREE, task, NULL_TREE)),
+		   NULL_TREE);
+
+      add_task_implementation (task, cpu_impl, build_string (3, "cpu"));
+
+      /* And now, process CPU_IMPL.  */
+    }
+
+  if (task_implementation_p (fn))
     {
       tree task = task_implementation_task (fn);
 
@@ -2117,6 +2242,7 @@ handle_pre_genericize (void *gcc_data, void *user_data)
 	  rest_of_decl_compilation (task, true, 0);
 	  allocate_struct_function (task, false);
 	  cgraph_finalize_function (task, false);
+	  cgraph_mark_needed_node (cgraph_get_node (task));
 	}
     }
 }
@@ -2265,6 +2391,7 @@ define_task (tree task_decl)
   DECL_SAVED_TREE (task_decl) = bind;
   TREE_STATIC (task_decl) = true;
   DECL_EXTERNAL (task_decl) = false;
+  DECL_ARTIFICIAL (task_decl) = true;
   DECL_INITIAL (task_decl) =
     build_block (error_var, NULL_TREE, task_decl, NULL_TREE);
   DECL_RESULT (task_decl) =
@@ -2398,12 +2525,28 @@ lower_starpu (void)
     {
       gcc_assert (callee->callee != NULL);
 
-      tree callee_decl;
+      tree callee_decl, caller_decl;
 
       callee_decl = callee->callee->decl;
+      caller_decl = callee->caller->decl;
+
+      if (implicit_cpu_task_implementation_p (callee_decl)
+	  && !DECL_ARTIFICIAL (caller_decl))
+	{
+	  /* Rewrite the call to point to the actual task beneath
+	     CALLEE_DECL.  */
+	  callee_decl = task_implementation_task (callee_decl);
+	  if (verbose_output_p)
+	    inform (gimple_location (callee->call_stmt),
+		    "call to %qE rewritten as a call to task %qE",
+		    DECL_NAME (callee->callee->decl),
+		    DECL_NAME (callee_decl));
+
+	  gimple_call_set_fn (callee->call_stmt,
+			      build_addr (callee_decl, callee->caller->decl));
+	}
 
-      if (lookup_attribute (task_attribute_name,
-			    DECL_ATTRIBUTES (callee_decl)))
+      if (task_p (callee_decl))
 	{
 	  if (verbose_output_p)
 	    inform (gimple_location (callee->call_stmt),

+ 22 - 1
gcc-plugin/tests/base.c

@@ -72,7 +72,26 @@ my_other_task_opencl (int x)
   printf ("opencl\n");
 }
 
+
+/* Task with a body.  */
+
+static void my_task_with_body (int x) __attribute__ ((task));
+
+static void
+my_task_with_body (int x)
+{
+  /* This body is implicitly the "cpu" implementation of the task.  */
+  int y = x + 2;
+  printf ("body: %i\n", y - 2);
+}
 
+static void my_task_with_body_opencl (int x)
+  __attribute__ ((task_implementation ("opencl", my_task_with_body)));
+
+static void
+my_task_with_body_opencl (int x)
+{
+}
 
 
 int
@@ -124,7 +143,9 @@ main (int argc, char *argv[])
 
   my_other_task (42);
 
-  assert (tasks_submitted == 10);
+  my_task_with_body (42);
+
+  assert (tasks_submitted == 11);
 
 #pragma starpu shutdown
   assert (initialized == 0);

+ 3 - 2
gcc-plugin/tests/lib-user.c

@@ -1,5 +1,5 @@
 /* GCC-StarPU
-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 
    GCC-StarPU is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,7 +30,8 @@ main (int argc, char *argv[])
 
   /* Align X so that the assumptions behind `dummy_pointer_to_handle'
      hold.  */
-  static const char x[] __attribute__ ((aligned (8))) = { 0, 1, 2, 3, 4, 5 };
+  static const signed char x[] __attribute__ ((aligned (8))) =
+    { 0, 1, 2, 3, 4, 5 };
 
   float y[sizeof x];
 

+ 5 - 7
gcc-plugin/tests/my-lib.c

@@ -1,5 +1,5 @@
 /* GCC-StarPU
-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 
    GCC-StarPU is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -24,11 +24,9 @@
 /* Task implementations: one is `static', one is global.  The codelet
    wrapper, codelet, and task body should be instantiated in this file.  */
 
-static void my_task_opencl (char, const char *, float *, int)
-  __attribute__ ((task_implementation ("opencl", my_task)));
-
+/* The implicit CPU implementation of `my_task'.  */
 void
-my_task_cpu (char a, const char *p, float *q, int b)
+my_task (signed char a, const signed char *p, float *q, int b)
 {
   int i;
 
@@ -36,8 +34,8 @@ my_task_cpu (char a, const char *p, float *q, int b)
     *q = *p + a;
 }
 
-static void
-my_task_opencl (char a, const char *p, float *q, int b)
+void
+my_task_opencl (signed char a, const signed char *p, float *q, int b)
 {
   int i;
 

+ 4 - 4
gcc-plugin/tests/my-lib.h

@@ -1,5 +1,5 @@
 /* GCC-StarPU
-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 
    GCC-StarPU is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -19,12 +19,12 @@
 #ifndef MY_LIB_H
 #define MY_LIB_H
 
-extern void my_task (char, const char *, float *, int)
+extern void my_task (signed char, const signed char *, float *, int)
   __attribute__ ((task));
 
 /* One of the implementations of MY_TASK.  Since it's `extern', this should
    not trigger generation of the codelet, wrapper, etc.  */
-extern void my_task_cpu (char, const char *, float *, int)
-  __attribute__ ((task_implementation ("cpu", my_task)));
+extern void my_task_opencl (signed char, const signed char *, float *, int)
+  __attribute__ ((task_implementation ("opencl", my_task)));
 
 #endif /* MY_LIB_H */

+ 3 - 5
gcc-plugin/tests/pointer-tasks.c

@@ -1,5 +1,5 @@
 /* GCC-StarPU
-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 
    GCC-StarPU is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -28,17 +28,15 @@
 static void my_pointer_task (const int *x, char a, long long *y, int b)
   __attribute__ ((task));
 
-static void my_pointer_task_cpu (const int *x, char a, long long *y, int b)
-  __attribute__ ((task_implementation ("cpu", my_pointer_task), noinline));
-
 static int implementations_called;
 
 /* The input arguments.  */
 static const int pointer_arg1[] = { 42, 1, 2, 3, 4, 5 };
 static long long *pointer_arg2;
 
+/* CPU implementation of `my_pointer_task'.  */
 static void
-my_pointer_task_cpu (const int *x, char a, long long *y, int b)
+my_pointer_task (const int *x, char a, long long *y, int b)
 {
   implementations_called |= STARPU_CPU;
   assert (x == pointer_arg1);

+ 3 - 4
gcc-plugin/tests/scalar-tasks.c

@@ -1,5 +1,5 @@
 /* GCC-StarPU
-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 
    GCC-StarPU is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -26,15 +26,14 @@
 
 static void my_scalar_task (int x, int y) __attribute__ ((task));
 
-static void my_scalar_task_cpu (int, int)
-  __attribute__ ((task_implementation ("cpu", my_scalar_task), noinline));
 static void my_scalar_task_opencl (int, int)
   __attribute__ ((task_implementation ("opencl", my_scalar_task), noinline));
 
 static int implementations_called;
 
+/* CPU implementation of `my_scalar_task'.  */
 static void
-my_scalar_task_cpu (int x, int y)
+my_scalar_task (int x, int y)
 {
   implementations_called |= STARPU_CPU;
   assert (x == 42);

+ 2 - 9
gcc-plugin/tests/task-errors.c

@@ -18,7 +18,8 @@
 
 extern void my_external_task (int foo, char *bar) __attribute__ ((task));
 
-static void my_task (int foo, char *bar) __attribute__ ((task));
+void my_task (int foo, char *bar) /* (error "none of the implementations") */
+  __attribute__ ((task));
 static void my_task_cpu (int foo, float *bar)    /* (error "type differs") */
   __attribute__ ((task_implementation ("cpu", my_task)));
 
@@ -48,9 +49,6 @@ static void my_task_wrong_task_arg (int foo, char *bar)   /* (error "not a funct
 static void my_task_wrong_target_arg (int foo, char *bar) /* (error "string constant expected") */
   __attribute__ ((task_implementation (123, my_task)));
 
-static void my_task_with_a_body (int foo, char *bar)
-  __attribute__ ((task, unused));
-
 extern int my_task_not_void (int foo) /* (error "return type") */
   __attribute__ ((task));
 
@@ -116,11 +114,6 @@ my_task_wrong_target_arg (int foo, char *bar)
 {
 }
 
-static void
-my_task_with_a_body (int foo, char *bar)  /* (error "must not have a body") */
-{
-}
-
 void
 my_task_that_invokes_task_cpu (int x, char *y)
 {

+ 410 - 0
include/pthread_win32/pthread.h

@@ -0,0 +1,410 @@
+/*
+ * StarPU
+ * Copyright (C) Université Bordeaux 1, CNRS 2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This is a minimal pthread implementation based on windows functions.
+ * It is *not* intended to be complete - just complete enough to get
+ * StarPU running.
+ */
+
+#ifndef __STARPU_PTHREAD_H__
+#define __STARPU_PTHREAD_H__
+
+/* TODO:
+ * pthread_rwlock_*
+ * pthread_spinlock_*
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <windows.h>
+#undef interface
+#include <stdio.h>
+#include <errno.h>
+
+#ifdef __CYGWIN32__
+#include <sys/cygwin.h>
+#define unixErrno() cygwin_internal(CW_GET_ERRNO_FROM_WINERROR, (GetLastError())
+#else
+#define unixErrno() EIO
+#endif
+#if 0
+#define setSystemErrno() do { fprintf(stderr,"%s:%d: win %d\n", __FILE__, __LINE__, GetLastError()); errno = unixErrno(); } while (0)
+#define winPthreadAssertWindows(expr) do { if (!(expr)) { fprintf(stderr,"%s:%d: %d\n", __FILE__, __LINE__, unixErrno()); return unixErrno(); } } while (0)
+#define winPthreadAssertPthread(expr) do { int ret = (expr); if (ret) { fprintf(stderr,"%s:%d: %d\n", __FILE__, __LINE__, ret); return ret; } } while (0)
+#define winPthreadAssert(expr) do { if (!(expr)) { fprintf(stderr,"%s:%d: %d\n", __FILE__, __LINE__, errno); return EIO; } } while (0)
+#else
+#define setSystemErrno() errno = unixErrno()
+#define winPthreadAssertWindows(expr) do { if (!(expr)) { return unixErrno(); } } while (0)
+#define winPthreadAssertPthread(expr) do { int ret = (expr); if (ret) return ret; } while (0)
+#define winPthreadAssert(expr) do { if (!(expr)) return EIO; } while (0)
+#endif
+#if 0
+#else
+#endif
+
+/***********
+ * threads *
+ ***********/
+
+typedef DWORD pthread_attr_t;
+typedef HANDLE pthread_t;
+
+static inline pthread_t pthread_self(void) {
+  return GetCurrentThread();
+}
+
+static inline int pthread_equal(pthread_t t1, pthread_t t2) {
+  return t1 == t2;
+}
+
+static inline int pthread_attr_init (pthread_attr_t *attr) {
+  *attr = 0;
+  return 0;
+}
+
+#define PTHREAD_CREATE_DETACHED 1
+static inline int pthread_attr_setdetachstate (pthread_attr_t *attr, int yes) {
+  (void)attr;
+  (void)yes;
+  /* not supported, ignore */
+  return 0;
+}
+
+static inline int pthread_attr_setstacksize (pthread_attr_t *attr, size_t stacksize) {
+  (void)attr;
+  (void)stacksize;
+  /* not supported, ignore */
+  return 0;
+}
+
+static inline int pthread_attr_destroy (pthread_attr_t *attr) {
+  (void)attr;
+  return 0;
+}
+
+/* "real" cleanup handling not yet implemented */
+typedef struct {
+  void (*routine) (void *);
+  void *arg;
+} __pthread_cleanup_handler;
+
+void pthread_cleanup_push (void (*routine) (void *), void *arg);
+#define pthread_cleanup_push(routine, arg) do { \
+  __pthread_cleanup_handler __cleanup_handler = {routine, arg};
+
+void pthread_cleanup_pop (int execute);
+#define pthread_cleanup_pop(execute) \
+  if (execute) __cleanup_handler.routine(__cleanup_handler.arg); \
+} while (0);
+
+static inline int pthread_create (
+  pthread_t *thread, const pthread_attr_t *attr,
+  void * (*fun) (void *), void *arg
+) {
+  if (attr && *attr)
+    return EINVAL;
+  winPthreadAssertWindows(*thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) fun, arg, 0, NULL));
+  return 0;
+}
+
+static inline int pthread_setcancelstate (int state, int *oldstate) {
+  (void)state;
+  (void)oldstate;
+  /* not yet implemented :( */
+  return 0;
+}
+
+static inline int pthread_cancel (pthread_t thread) {
+  /* This is quite harsh :( */
+  winPthreadAssertWindows(TerminateThread(thread, 0));
+  return 0;
+}
+
+static inline void pthread_exit (void *res) {
+  ExitThread((DWORD) res);
+}
+
+static inline int pthread_join (pthread_t thread, void **res) {
+again:
+  switch (WaitForSingleObject(thread, INFINITE)) {
+    default:
+    case WAIT_FAILED:
+      return unixErrno();
+    case WAIT_ABANDONED:
+    case WAIT_OBJECT_0:
+      break;
+    case WAIT_TIMEOUT:
+      goto again;
+  }
+  if (res) {
+    DWORD _res;
+    if (GetExitCodeThread(thread, &_res))
+      *res = (void *)_res;
+  }
+  return 0;
+}
+
+/***********
+ * mutexes *
+ ***********/
+
+#define PTHREAD_MUTEX_INITIALIZER NULL
+typedef HANDLE pthread_mutex_t;
+#define PTHREAD_MUTEX_RECURSIVE 1
+#define PTHREAD_MUTEX_ERRORCHECK 2
+typedef int pthread_mutexattr_t;
+
+static inline int pthread_mutexattr_init(pthread_mutexattr_t *attr) {
+  *attr = PTHREAD_MUTEX_RECURSIVE;
+  return 0;
+}
+
+static inline int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type) {
+  if (type != PTHREAD_MUTEX_RECURSIVE && type != PTHREAD_MUTEX_ERRORCHECK)
+    return EINVAL;
+  *attr = type;
+  return 0;
+}
+
+static inline int pthread_mutex_init (pthread_mutex_t *mutex, pthread_mutexattr_t *attr) {
+  if (attr && *attr!=PTHREAD_MUTEX_RECURSIVE)
+    return EINVAL;
+  winPthreadAssertWindows(*mutex = CreateMutex(NULL, FALSE, NULL));
+  return 0;
+}
+
+static inline int pthread_mutex_unlock (pthread_mutex_t *mutex) {
+  winPthreadAssertWindows(ReleaseMutex(*mutex));
+  return 0;
+}
+
+static inline int pthread_mutex_lock (pthread_mutex_t *mutex);
+static inline int __pthread_mutex_alloc_concurrently (pthread_mutex_t *mutex) {
+  HANDLE mutex_init_mutex;
+  /* Get access to one global named mutex to serialize mutex initialization */
+  winPthreadAssertWindows((mutex_init_mutex = CreateMutex(NULL, FALSE, "StarPU mutex init")));
+  winPthreadAssertPthread(pthread_mutex_lock(&mutex_init_mutex));
+  /* Now we are the one that can initialize it */
+  if (!*mutex)
+    winPthreadAssertPthread(pthread_mutex_init(mutex,NULL));
+  winPthreadAssertPthread(pthread_mutex_unlock(&mutex_init_mutex));
+  winPthreadAssertWindows(CloseHandle(mutex_init_mutex));
+  return 0;
+}
+
+static inline int pthread_mutex_lock (pthread_mutex_t *mutex) {
+  if (!*mutex)
+    __pthread_mutex_alloc_concurrently (mutex);
+again:
+  switch (WaitForSingleObject(*mutex, INFINITE)) {
+    default:
+    case WAIT_FAILED:
+      return unixErrno();;
+    case WAIT_ABANDONED:
+    case WAIT_OBJECT_0:
+      return 0;
+    case WAIT_TIMEOUT:
+      goto again;
+  }
+}
+
+static inline int pthread_mutex_trylock (pthread_mutex_t *mutex) {
+  if (!*mutex)
+    __pthread_mutex_alloc_concurrently (mutex);
+  switch (WaitForSingleObject(*mutex, 0)) {
+    default:
+    case WAIT_FAILED:
+      return unixErrno();
+    case WAIT_ABANDONED:
+    case WAIT_OBJECT_0:
+      return 0;
+    case WAIT_TIMEOUT:
+      return EBUSY;
+  }
+}
+
+static inline int pthread_mutex_destroy (pthread_mutex_t *mutex) {
+  winPthreadAssertWindows(CloseHandle(*mutex));
+  *mutex = INVALID_HANDLE_VALUE;
+  return 0;
+}
+
+/********************************************
+ * rwlock                                   *
+ * VERY LAZY, don't even look at it please! *
+ * Should be fine unoptimized for now.      *
+ * TODO: FIXME, using conds for instance?   *
+ ********************************************/
+
+#define PTHREAD_RWLOCK_INITIALIZER NULL
+typedef pthread_mutex_t pthread_rwlock_t;
+#define pthread_rwlock_init(lock, attr) pthread_mutex_init(lock, NULL)
+#define pthread_rwlock_wrlock(lock) pthread_mutex_lock(lock)
+#define pthread_rwlock_rdlock(lock) pthread_mutex_lock(lock)
+#define pthread_rwlock_unlock(lock) pthread_mutex_unlock(lock)
+#define pthread_rwlock_destroy(lock) pthread_mutex_destroy(lock)
+
+/**************
+ * conditions *
+ **************/
+
+typedef struct {
+  HANDLE sem;
+  volatile unsigned nbwait;
+} pthread_cond_t;
+#define PTHREAD_COND_INITIALIZER { NULL, 0}
+
+#ifndef STARPU_TIMESPEC_DEFINED
+#define STARPU_TIMESPEC_DEFINED 1
+struct timespec {
+  time_t  tv_sec;  /* Seconds */
+  long    tv_nsec; /* Nanoseconds */
+};
+#endif /* STARPU_TIMESPEC_DEFINED */
+
+typedef unsigned pthread_condattr_t;
+
+static inline int pthread_cond_init (pthread_cond_t *cond, const pthread_condattr_t *attr) {
+  if (attr)
+    return EINVAL;
+  winPthreadAssertWindows(cond->sem = CreateSemaphore(NULL, 0, MAXLONG, NULL));
+  cond->nbwait = 0;
+  return 0;
+}
+
+static inline int pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *time) {
+  if (!cond->sem)
+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
+  cond->nbwait++;
+  winPthreadAssertPthread(pthread_mutex_unlock(mutex));
+again:
+  switch (WaitForSingleObject(cond->sem, time->tv_sec*1000+time->tv_nsec/1000)) {
+    default:
+    case WAIT_FAILED:
+    {
+      int error = unixErrno();
+      winPthreadAssertPthread(pthread_mutex_lock(mutex));
+      return error;
+    }
+    case WAIT_TIMEOUT:
+      goto again;
+    case WAIT_ABANDONED:
+    case WAIT_OBJECT_0:
+      break;
+  }
+  winPthreadAssertPthread(pthread_mutex_lock(mutex));
+  cond->nbwait--;
+  return 0;
+}
+
+static inline int pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex) {
+  if (!cond->sem)
+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
+  cond->nbwait++;
+  winPthreadAssertPthread(pthread_mutex_unlock(mutex));
+again:
+  switch (WaitForSingleObject(cond->sem, INFINITE)) {
+    case WAIT_FAILED:
+    {
+      int error;
+      error = unixErrno();
+      winPthreadAssertPthread(pthread_mutex_lock(mutex));
+      return error;
+    }
+    case WAIT_TIMEOUT:
+      goto again;
+    case WAIT_ABANDONED:
+    case WAIT_OBJECT_0:
+      break;
+  }
+  winPthreadAssertPthread(pthread_mutex_lock(mutex));
+  cond->nbwait--;
+  return 0;
+}
+
+static inline int pthread_cond_signal (pthread_cond_t *cond) {
+  if (!cond->sem)
+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
+  if (cond->nbwait)
+    ReleaseSemaphore(cond->sem, 1, NULL);
+  return 0;
+}
+
+static inline int pthread_cond_broadcast (pthread_cond_t *cond) {
+  if (!cond->sem)
+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
+  ReleaseSemaphore(cond->sem, cond->nbwait, NULL);
+  return 0;
+}
+
+static inline int pthread_cond_destroy (pthread_cond_t *cond) {
+  if (cond->sem) {
+    winPthreadAssertWindows(CloseHandle(cond->sem));
+    cond->sem = NULL;
+  }
+  return 0;
+}
+
+/*******
+ * TLS *
+ *******/
+
+typedef DWORD pthread_key_t;
+#define PTHREAD_ONCE_INIT {PTHREAD_MUTEX_INITIALIZER, 0}
+typedef struct {
+  pthread_mutex_t mutex;
+  unsigned done;
+} pthread_once_t;
+
+static inline int pthread_once (pthread_once_t *once, void (*oncefun)(void)) {
+  winPthreadAssertPthread(pthread_mutex_lock(&once->mutex));
+  if (!once->done) {
+    oncefun();
+    once->done = 1;
+  }
+  winPthreadAssertPthread(pthread_mutex_unlock(&once->mutex));
+  return 0;
+}
+
+static inline int pthread_key_create (pthread_key_t *key, void (*freefun)(void *)) {
+  (void)freefun;
+  DWORD res;
+  winPthreadAssertWindows((res = TlsAlloc()) != 0xFFFFFFFF);
+  *key = res;
+  return 0;
+}
+
+static inline int pthread_key_delete (pthread_key_t key) {
+  winPthreadAssertWindows(TlsFree(key));
+  return 0;
+}
+
+static inline void *pthread_getspecific (pthread_key_t key) {
+  return TlsGetValue(key);
+}
+
+static inline int pthread_setspecific (pthread_key_t key, const void *data) {
+  winPthreadAssertWindows(TlsSetValue(key, (LPVOID) data));
+  return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __STARPU_PTHREAD_H__ */

+ 67 - 0
include/pthread_win32/semaphore.h

@@ -0,0 +1,67 @@
+/*
+ * StarPU
+ * Copyright (C) Université Bordeaux 1, CNRS 2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This is a minimal pthread implementation based on windows functions.
+ * It is *not* intended to be complete - just complete enough to get
+ * StarPU running.
+ */
+
+#ifndef __STARPU_SEMAPHORE_H__
+#define __STARPU_SEMAPHORE_H__
+
+#include "pthread.h"
+
+/**************
+ * semaphores *
+ **************/
+
+typedef HANDLE sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
+  (void)pshared;
+  winPthreadAssertWindows(*sem = CreateSemaphore(NULL, value, MAXLONG, NULL));
+  return 0;
+}
+
+static inline int do_sem_wait(sem_t *sem, DWORD timeout) {
+  switch (WaitForSingleObject(*sem, timeout)) {
+    default:
+    case WAIT_FAILED:
+      setSystemErrno();
+      return -1;
+    case WAIT_TIMEOUT:
+      errno = EAGAIN;
+      return -1;
+    case WAIT_ABANDONED:
+    case WAIT_OBJECT_0:
+      return 0;
+  }
+}
+
+#define sem_wait(sem) do_sem_wait(sem, INFINITE)
+#define sem_trywait(sem) do_sem_wait(sem, 0)
+
+static inline int sem_post(sem_t *sem) {
+  winPthreadAssertWindows(ReleaseSemaphore(*sem, 1, NULL));
+  return 0;
+}
+
+static inline int sem_destroy(sem_t *sem) {
+  winPthreadAssertWindows(CloseHandle(*sem));
+  return 0;
+}
+
+#endif /* __STARPU_SEMAPHORE_H__ */

+ 1 - 1
include/starpu_deprecated_api.h

@@ -23,7 +23,7 @@ extern "C"
 {
 #endif
 
-#warning "Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh"
+#warning Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh.
 
 typedef starpu_data_handle_t starpu_data_handle;
 typedef struct starpu_block_interface starpu_block_interface_t;

+ 2 - 6
include/starpu_opencl.h

@@ -94,13 +94,9 @@ int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
 cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
 
-cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event);
+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
-cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event);
-
-cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
-
-cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
 #ifdef __cplusplus
 }

+ 2 - 1
socl/src/gc.c

@@ -61,7 +61,8 @@ static void * gc_thread_routine(void *UNUSED(arg)) {
 
       /* Release entity */
       entity next = r->next;
-      free(r);
+#warning FIXME: free memory
+//      free(r);
 
       r = next;
     }

+ 3 - 0
socl/src/task.c

@@ -120,10 +120,13 @@ static void cputask_task(__attribute__((unused)) void *descr[], void *args) {
 
   arg->callback(arg->arg);
 
+#warning FIXME: free memory
+/*
   if (arg->free_arg)
     free(arg->arg);
 
   free(arg);
+*/
 }
 
 static struct starpu_codelet cputask_codelet = {

+ 0 - 2
src/core/task.c

@@ -339,8 +339,6 @@ void _starpu_task_check_deprecated_fields(struct starpu_task *task)
 				task->handles[i] = task->buffers[i].handle;
 				task->cl->modes[i] = task->buffers[i].mode;
 			}
-			task->buffers[i].handle = NULL;
-			task->buffers[i].mode = STARPU_NONE;
 		}
 	}
 }

+ 0 - 15
src/core/topology.c

@@ -300,13 +300,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config)
 		{
 			/* Nothing was specified, so let's choose ! */
 			ncuda = nb_devices;
-			if (ncuda > STARPU_MAXCUDADEVS)
-			{
-				fprintf(stderr,
-					"# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
-					nb_devices, STARPU_MAXCUDADEVS);
-				ncuda = STARPU_MAXCUDADEVS;
-			}
 		}
 		else
 		{
@@ -318,14 +311,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config)
 					ncuda, nb_devices);
 				ncuda = nb_devices;
 			}
-			/* Let's make sure this value is OK. */
-			if (ncuda > STARPU_MAXCUDADEVS)
-			{
-				fprintf(stderr,
-					"# Warning: %d CUDA devices requested. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
-					ncuda, STARPU_MAXCUDADEVS);
-				ncuda = STARPU_MAXCUDADEVS;
-			}
 		}
 	}
 

+ 1 - 0
src/datawizard/coherency.c

@@ -683,6 +683,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	return 0;
 
 enomem:
+	_STARPU_TRACE_END_FETCH_INPUT(NULL);
 	/* try to unreference all the input that were successfully taken */
 	/* XXX broken ... */
 	_STARPU_DISP("something went wrong with buffer %u\n", index);

+ 8 - 6
src/datawizard/interfaces/bcsr_interface.c

@@ -350,6 +350,7 @@ fail_rowptr:
 	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_colind);
+			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
 			cudaFree((void*)addr_colind);
@@ -369,6 +370,7 @@ fail_colind:
 	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_nzval);
+			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
 			cudaFree((void*)addr_nzval);
@@ -483,15 +485,15 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -514,15 +516,15 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 

+ 10 - 10
src/datawizard/interfaces/block_interface.c

@@ -621,9 +621,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
-                                                                           src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
-                                                                           dst_block->offset, (cl_event*)_event, &ret);
+                        err = starpu_opencl_copy_ram_to_opencl((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
+							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+							       dst_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
                                 STARPU_OPENCL_REPORT_ERROR(err);
                 }
@@ -644,9 +644,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 			{
                                 void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
                                 err = starpu_opencl_copy_ram_to_opencl(ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
-                                                                        src_block->nx*src_block->elemsize,
-                                                                        layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
-                                                                        + dst_block->offset, NULL);
+								       src_block->nx*src_block->elemsize,
+								       layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
+								       + dst_block->offset, NULL, NULL);
                                 if (STARPU_UNLIKELY(err))
                                         STARPU_OPENCL_REPORT_ERROR(err);
                         }
@@ -687,9 +687,9 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
-                                                                           src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
-                                                                           src_block->offset, (cl_event*)_event, &ret);
+                        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
+							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+							       src_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
                                 STARPU_OPENCL_REPORT_ERROR(err);
                 }
@@ -713,7 +713,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
                                 err = starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, src_node, ptr, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
-                                                                        src_block->offset, NULL);
+								       src_block->offset, NULL, NULL);
 				if (STARPU_UNLIKELY(err))
 					STARPU_OPENCL_REPORT_ERROR(err);
                         }

+ 8 - 6
src/datawizard/interfaces/csr_interface.c

@@ -324,6 +324,7 @@ fail_rowptr:
 	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_colind);
+			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
 		{
@@ -348,6 +349,7 @@ fail_colind:
 	{
 		case STARPU_CPU_RAM:
 			free((void *)addr_nzval);
+			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
 		{
@@ -666,15 +668,15 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -694,15 +696,15 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
+	err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 

+ 6 - 6
src/datawizard/interfaces/matrix_interface.c

@@ -607,9 +607,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 
-	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
-                                                           src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
-                                                           dst_matrix->offset, (cl_event*)_event, &ret);
+	err = starpu_opencl_copy_ram_to_opencl((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
+					       src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+					       dst_matrix->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -626,9 +626,9 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 
-        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
-                                                           src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
-                                                           src_matrix->offset, (cl_event*)_event, &ret);
+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
+					       src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
+					       src_matrix->offset, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);

+ 16 - 16
src/datawizard/interfaces/multiformat_interface.c

@@ -637,14 +637,14 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
 
 
-	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_multiformat->cpu_ptr,
-							   src_node,
-							   (cl_mem) dst_multiformat->cpu_ptr,
-							   dst_node,
-							   size,
-							   0,
-							   (cl_event *) _event,
-							   &ret);
+	err = starpu_opencl_copy_ram_to_opencl(src_multiformat->cpu_ptr,
+					       src_node,
+					       (cl_mem) dst_multiformat->cpu_ptr,
+					       dst_node,
+					       size,
+					       0,
+					       (cl_event *) _event,
+					       &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -675,14 +675,14 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 		/* XXX : it is weird that we might have to allocate memory here... */
 		dst_multiformat->opencl_ptr = malloc(dst_multiformat->nx * dst_multiformat->ops->opencl_elemsize);
 	}
-	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_multiformat->opencl_ptr,
-							   src_node,
-							   dst_multiformat->opencl_ptr,
-							   dst_node,
-							   size,
-                                                           0,
-							   (cl_event *)_event,
-							   &ret);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_multiformat->opencl_ptr,
+					       src_node,
+					       dst_multiformat->opencl_ptr,
+					       dst_node,
+					       size,
+					       0,
+					       (cl_event *)_event,
+					       &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 

+ 4 - 4
src/datawizard/interfaces/variable_interface.c

@@ -440,8 +440,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 	struct starpu_variable_interface *dst_variable = dst_interface;
         int err,ret;
 
-        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
-                                                           0, (cl_event*)_event, &ret);
+        err = starpu_opencl_copy_ram_to_opencl((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
+					       0, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -456,8 +456,8 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 	struct starpu_variable_interface *dst_variable = dst_interface;
         int err, ret;
 
-	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
-                                                           0, (cl_event*)_event, &ret);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
+					       0, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);

+ 5 - 5
src/datawizard/interfaces/vector_interface.c

@@ -483,9 +483,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 	struct starpu_vector_interface *dst_vector = dst_interface;
         int err, ret;
 
-	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
-                                                           src_vector->nx*src_vector->elemsize,
-                                                           dst_vector->offset, (cl_event*)_event, &ret);
+	err = starpu_opencl_copy_ram_to_opencl((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
+					       src_vector->nx*src_vector->elemsize,
+					       dst_vector->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -501,8 +501,8 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 	struct starpu_vector_interface *dst_vector = dst_interface;
         int err, ret;
 
-	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
-                                                           src_vector->offset, (cl_event *)_event, &ret);
+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
+					       src_vector->offset, (cl_event *)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 

+ 4 - 0
src/drivers/cuda/driver_cuda.c

@@ -165,6 +165,10 @@ static void deinit_context(int workerid, int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
+
+/* Return the number of devices usable in the system.
+ * The value returned cannot be greater than MAXCUDADEVS */
+
 unsigned _starpu_get_cuda_device_count(void)
 {
 	int cnt;

+ 25 - 81
src/drivers/opencl/driver_opencl.c

@@ -203,8 +203,9 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flag
 	 */
 	char dummy = 0;
 	err = clEnqueueWriteBuffer(queues[worker->devid], memory, CL_TRUE,
-				0, sizeof(dummy), &dummy,
-				0, NULL, NULL);
+				   0, sizeof(dummy), &dummy,
+				   0, NULL, NULL);
+	clFinish(queues[worker->devid]);
 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
 		return err;
 	if (err != CL_SUCCESS)
@@ -214,111 +215,54 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flag
         return CL_SUCCESS;
 }
 
-cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
-        cl_bool blocking;
-
-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
 
         if (event)
                 _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
         if (event)
                 _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
-                *ret = (event == NULL) ? 0 : -EAGAIN;
-                return CL_SUCCESS;
-        }
-        else
-	{
-                if (event != NULL)
-		{
-                        /* The asynchronous copy has failed, try to copy synchronously */
-                        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
-                }
-                if (STARPU_LIKELY(err == CL_SUCCESS))
+		if (event == NULL)
 		{
-                        *ret = 0;
-                        return CL_SUCCESS;
-                }
-                else
+			/* We want a synchronous copy, let's synchronise the queue */
+			clFinish(transfer_queues[worker->devid]);
+		}
+		if (ret)
 		{
-                        STARPU_OPENCL_REPORT_ERROR(err);
-                        return err;
-                }
-        }
-}
-
-cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
-{
-        cl_int err;
-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
-        cl_bool blocking;
-
-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        if (event)
-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
-        if (event)
-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-        return CL_SUCCESS;
+			*ret = (event == NULL) ? 0 : -EAGAIN;
+		}
+	}
+	return err;
 }
 
-cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
-        cl_bool blocking;
 
-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
         if (event)
                 _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
         if (event)
                 _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
-                *ret = (event == NULL) ? 0 : -EAGAIN;
-                return CL_SUCCESS;
-        }
-        else
-	{
-                if (event != NULL)
-                        /* The asynchronous copy has failed, try to copy synchronously */
-                        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
-                if (STARPU_LIKELY(err == CL_SUCCESS))
+		if (event == NULL)
 		{
-                        *ret = 0;
-                        return CL_SUCCESS;
-                }
-                else
+			/* We want a synchronous copy, let's synchronise the queue */
+			clFinish(transfer_queues[worker->devid]);
+		}
+		if (ret)
 		{
-                        STARPU_OPENCL_REPORT_ERROR(err);
-                        return err;
-                }
-        }
-}
-
-cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
-{
-        cl_int err;
-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
-        cl_bool blocking;
-
-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        if (event)
-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
-        if (event)
-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-        return CL_SUCCESS;
+			*ret = (event == NULL) ? 0 : -EAGAIN;
+		}
+	}
+	return err;
 }
 
 #if 0

+ 2 - 0
src/top/starpu_top_core.h

@@ -15,6 +15,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <time.h>
+
 struct starpu_task;
 
 /*

+ 1 - 11
tests/datawizard/acquire_release_opencl.c

@@ -42,17 +42,7 @@ void increment_opencl(void *buffers[], void *args)
 
 	{
 		size_t global=1;
-		size_t local;
-                size_t s;
-                cl_device_id device;
-
-                starpu_opencl_get_device(devid, &device);
-
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
+		size_t local=1;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS)

+ 10 - 10
tests/datawizard/data_invalidation.c

@@ -70,15 +70,15 @@ static void opencl_memset_codelet(void *buffers[], void *args)
 	memset(v, 42, length);
 
 	clEnqueueWriteBuffer(queue,
-			buffer,
-			CL_TRUE,
-			0,      /* offset */
-			length, /* sizeof (char) */
-			v,
-			0,      /* num_events_in_wait_list */
-			NULL,   /* event_wait_list */
-			NULL    /* event */);
-			
+			     buffer,
+			     CL_FALSE,
+			     0,      /* offset */
+			     length, /* sizeof (char) */
+			     v,
+			     0,      /* num_events_in_wait_list */
+			     NULL,   /* event_wait_list */
+			     NULL    /* event */);
+	clFinish(queue);
 }
 #endif /* !STARPU_USE_OPENCL */
 
@@ -121,7 +121,7 @@ static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) vo
 	{
 		if (buf[i] != 42)
 		{
-			FPRINTF(stderr, "buf[%u] is %c while it should be %c\n", i, buf[i], 42);
+			FPRINTF(stderr, "buf[%u] is '%c' while it should be '%c'\n", i, buf[i], 42);
 			exit(-1);
 		}
 	}

+ 24 - 19
tests/datawizard/gpu_register.c

@@ -186,19 +186,18 @@ test_opencl(void)
 		foo[i] = i;
 
 	err = clEnqueueWriteBuffer(queue,
-				foo_gpu,
-				CL_TRUE,
-				0,
-				size*sizeof(int),
-				foo,
-				0,
-				NULL,
-				NULL);
+				   foo_gpu,
+				   CL_FALSE,
+				   0,
+				   size*sizeof(int),
+				   foo,
+				   0,
+				   NULL,
+				   NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
+	clFinish(queue);
 
-
-	
 	starpu_vector_data_register(&handle,
 				    starpu_worker_get_memory_node(chosen),
 				    (uintptr_t)foo_gpu,
@@ -222,7 +221,7 @@ test_opencl(void)
 	};
 
 	starpu_data_partition(handle, &f);
-	
+
 	ret = submit_tasks(handle, pieces, n);
 	if (ret == -ENODEV)
 		return -ENODEV;
@@ -235,16 +234,17 @@ test_opencl(void)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 
 	err = clEnqueueReadBuffer(queue,
-		foo_gpu,
-		CL_TRUE,
-		0,
-		size*sizeof(*foo),
-		foo,
-		0,
-		NULL,
-		NULL);
+				  foo_gpu,
+				  CL_FALSE,
+				  0,
+				  size*sizeof(*foo),
+				  foo,
+				  0,
+				  NULL,
+				  NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
+	clFinish(queue);
 	return check_result(foo, size);
 }
 #endif /* !STARPU_USE_OPENCL */
@@ -258,6 +258,11 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
 #ifdef STARPU_USE_CUDA
 #if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
 	ret = test_cuda();

+ 1 - 1
tests/datawizard/interfaces/bcsr/bcsr_interface.c

@@ -178,7 +178,7 @@ main(void)
 		.nopencl = 1
 	};
 
-	if (starpu_init(&conf) == -ENODEV)
+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		return STARPU_TEST_SKIPPED;
 
 	register_data();

+ 1 - 1
tests/datawizard/interfaces/block/block_interface.c

@@ -140,7 +140,7 @@ main(void)
 		.nopencl = 1
 	};
 
-	if (starpu_init(&conf) == -ENODEV)
+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;
 
 	register_data();

+ 1 - 1
tests/datawizard/interfaces/csr/csr_interface.c

@@ -148,7 +148,7 @@ main(void)
 		.nopencl = 1
 	};
 
-	if (starpu_init(&conf) == -ENODEV)
+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;
 
 	register_data();

+ 1 - 1
tests/datawizard/interfaces/matrix/matrix_interface.c

@@ -123,7 +123,7 @@ main(void)
 		.nopencl = 1
 	};
 
-	if (starpu_init(&conf) == -ENODEV)
+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;
 
 	register_data();

+ 1 - 1
tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release.c

@@ -131,7 +131,7 @@ main(void)
 	};
 	memset(&global_stats, 0, sizeof(global_stats));
 	ret = starpu_init(&conf);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	register_handle();

+ 2 - 1
tests/datawizard/interfaces/multiformat/multiformat_interface.c

@@ -144,7 +144,8 @@ main(void)
 	};
 
 	ret = starpu_init(&conf);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
+		return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	register_data();

+ 2 - 1
tests/datawizard/interfaces/variable/variable_interface.c

@@ -101,7 +101,8 @@ main(void)
 	};
 
 	ret = starpu_init(&conf);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
+		return STARPU_TEST_SKIPPED;
 
 	register_data();
 

+ 1 - 1
tests/datawizard/interfaces/vector/test_vector_interface.c

@@ -110,7 +110,7 @@ main(void)
 		.nopencl = 1
 	};
 
-	if (starpu_init(&conf) == -ENODEV)
+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;
 
 	register_data();

+ 1 - 1
tests/datawizard/interfaces/void/void_interface.c

@@ -66,7 +66,7 @@ main(void)
 		.nopencl = 1
 	};
 
-	if (starpu_init(&conf) == -ENODEV)
+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;
 
 	register_data();

+ 20 - 20
tests/datawizard/lazy_allocation.c

@@ -65,15 +65,15 @@ static void opencl_memset_codelet(void *buffers[], void *args)
 	memset(v, 42, length);
 
 	clEnqueueWriteBuffer(queue,
-			buffer,
-			CL_TRUE,
-			0,      /* offset */
-			length, /* sizeof (char) */
-			v,
-			0,      /* num_events_in_wait_list */
-			NULL,   /* event_wait_list */
-			NULL    /* event */);
-			
+			     buffer,
+			     CL_FALSE,
+			     0,      /* offset */
+			     length, /* sizeof (char) */
+			     v,
+			     0,      /* num_events_in_wait_list */
+			     NULL,   /* event_wait_list */
+			     NULL    /* event */);
+	clFinish(queue);
 }
 #endif
 
@@ -160,19 +160,19 @@ static void opencl_check_content_codelet(void *buffers[], void *args)
 	for (i = 0; i < length; i++)
 	{
 		char dst;
-		clEnqueueReadBuffer(
-			queue,
-			buf,
-			CL_TRUE,
-			i * sizeof(dst),
-			sizeof(dst),
-			&dst,
-			0,      /* num_events_in_wait_list */
-			NULL,   /* event_wait_list */
-			NULL    /* event */);
+		clEnqueueReadBuffer(queue,
+				    buf,
+				    CL_FALSE,
+				    i * sizeof(dst),
+				    sizeof(dst),
+				    &dst,
+				    0,      /* num_events_in_wait_list */
+				    NULL,   /* event_wait_list */
+				    NULL    /* event */);
+		clFinish(queue);
 		if (dst != 42)
 		{
-			FPRINTF(stderr, "buf[%u] is %c while it should be %c\n", i, dst, 42);
+			FPRINTF(stderr, "buf[%u] is '%c' while it should be '%c'\n", i, dst, 42);
 			exit(-1);
 		}
 	}

+ 6 - 3
tests/datawizard/manual_reduction.c

@@ -57,7 +57,8 @@ static void initialize_per_worker_handle(void *arg __attribute__((unused)))
 			cl_mem ptr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(variable), NULL, NULL);
 			/* Poor's man memset */
 			unsigned zero = 0;
-			clEnqueueWriteBuffer(queue, ptr, CL_TRUE, 0, sizeof(variable), (void *)&zero, 0, NULL, NULL);
+			clEnqueueWriteBuffer(queue, ptr, CL_FALSE, 0, sizeof(variable), (void *)&zero, 0, NULL, NULL);
+			clFinish(queue);
 			per_worker[workerid] = (uintptr_t)ptr;
 			}
 
@@ -142,9 +143,11 @@ static void opencl_func_incr(void *descr[], void *cl_arg __attribute__((unused))
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueReadBuffer(queue, d_val, CL_TRUE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
+	clEnqueueReadBuffer(queue, d_val, CL_FALSE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
+	clFinish(queue);
 	h_val++;
-	clEnqueueWriteBuffer(queue, d_val, CL_TRUE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
+	clEnqueueWriteBuffer(queue, d_val, CL_FALSE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 

+ 2 - 2
tests/datawizard/mpi_like.c

@@ -138,7 +138,7 @@ static void send_handle(struct thread_data *thread_data)
 	/* wait until it's received (ie. neighbour's recv_flag is set back to 0) */
 	while (neighbour_data->recv_flag)
 		_STARPU_PTHREAD_COND_WAIT(&neighbour_data->recv_cond, &neighbour_data->recv_mutex);
-	
+
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&neighbour_data->recv_mutex);
 
 	starpu_data_release(thread_data->handle);
@@ -159,7 +159,7 @@ static void *thread_func(void *arg)
 		{
 			recv_handle(thread_data);
 		}
-		
+
 		increment_handle(thread_data);
 
 		if (!((index == (NTHREADS - 1)) && (iter == (NITER - 1))))

+ 1 - 11
tests/datawizard/opencl_codelet_unsigned_inc.c

@@ -44,17 +44,7 @@ void opencl_codelet_unsigned_inc(void *buffers[], void *args)
 
 	{
 		size_t global=1;
-		size_t local;
-                size_t s;
-                cl_device_id device;
-
-                starpu_opencl_get_device(devid, &device);
-
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
+		size_t local=1;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS)

+ 1 - 1
tests/datawizard/opencl_codelet_unsigned_inc_kernel.cl

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void _opencl_unsigned_inc(__global unsigned int *val)
+__kernel void _opencl_unsigned_inc(__global unsigned *val)
 {
 	val[0]++;
 }

+ 2 - 1
tests/datawizard/reclaim.c

@@ -30,11 +30,12 @@
 
 #ifdef STARPU_SLOW_MACHINE
 #  define BLOCK_SIZE (64*1024)
+static unsigned ntasks = 250;
 #else
 #  define BLOCK_SIZE (64*1024*1024)
+static unsigned ntasks = 1000;
 #endif
 
-static unsigned ntasks = 1000;
 
 #ifdef STARPU_HAVE_HWLOC
 static uint64_t get_total_memory_size(void)

+ 14 - 3
tests/errorcheck/starpu_init_noworker.c

@@ -33,7 +33,7 @@ static void unset_env_variables(void)
 {
 	(void) unsetenv("STARPU_NCPUS");
 	(void) unsetenv("STARPU_NCUDA");
-	(void) unsetenv("STARPU_NNOPENCL");
+	(void) unsetenv("STARPU_NOPENCL");
 }
 
 int main(int argc, char **argv)
@@ -58,9 +58,20 @@ int main(int argc, char **argv)
 
 	/* starpu_init should return -ENODEV */
 	ret = starpu_init(&conf);
-	if (ret != -ENODEV)
+	if (ret == -ENODEV)
+	     return EXIT_SUCCESS;
+	else
+	{
+	     	unsigned ncpu = starpu_cpu_worker_get_count();
+		unsigned ncuda = starpu_cuda_worker_get_count();
+		unsigned nopencl = starpu_opencl_worker_get_count();
+		FPRINTF(stderr, "StarPU has found :\n");
+		FPRINTF(stderr, "\t%d CPU cores\n", ncpu);
+		FPRINTF(stderr, "\t%d CUDA devices\n", ncuda);
+		FPRINTF(stderr, "\t%d OpenCL devices\n", nopencl);
 		return EXIT_FAILURE;
+	}
+
 
-	return EXIT_SUCCESS;
 }
 #endif

+ 10 - 6
tests/helper/starpu_data_cpy.c

@@ -17,19 +17,18 @@
 #include <starpu.h>
 #include "../helper.h"
 
-int var1, var2;
-starpu_data_handle_t var1_handle, var2_handle;
-
 int main(int argc, char **argv)
 {
 	int ret;
+	int var1, var2;
+	starpu_data_handle_t var1_handle, var2_handle;
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	var1 = 42;
-	var2 = 0;
+	var2 = 12;
 
 	starpu_variable_data_register(&var1_handle, 0, (uintptr_t)&var1, sizeof(var1));
 	starpu_variable_data_register(&var2_handle, 0, (uintptr_t)&var2, sizeof(var2));
@@ -38,12 +37,17 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_cpy");
 
 	starpu_data_acquire(var2_handle, STARPU_R);
-	STARPU_ASSERT(var2 == 42);
+	ret = EXIT_SUCCESS;
+	if (var2 != var1)
+	{
+	     FPRINTF(stderr, "var2 is %d but it should be %d\n", var2, var1);
+	     ret = EXIT_FAILURE;
+	}
 	starpu_data_release(var2_handle);
 
 	starpu_data_unregister(var1_handle);
 	starpu_data_unregister(var2_handle);
 	starpu_shutdown();
 
-	STARPU_RETURN(EXIT_SUCCESS);
+	STARPU_RETURN(ret);
 }

+ 3 - 2
tests/main/task_wait_api.c

@@ -120,8 +120,9 @@ int main(int argc, char **argv)
 	ret = starpu_task_wait(taskK); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 	ret = starpu_task_wait(taskL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 
-	/* Destroy all the tasks that were not detached and that we did not
-	 * wait for previously */
+	starpu_task_wait_for_all();
+
+	/* Destroy all the tasks that were not detached */
 	starpu_task_destroy(taskA);
 	starpu_task_destroy(taskC);
 	starpu_task_destroy(taskD);

+ 2 - 1
tools/dev/internal/rename_internal.sed

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -14,6 +14,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
+s/\bstruct starpu_priority_taskq_s\b/struct _starpu_priority_taskq/g
 s/\bSTARPU_FUT_APPS_KEY\b/_STARPU_FUT_APPS_KEY/g
 s/\bSTARPU_FUT_CPU_KEY\b/_STARPU_FUT_CPU_KEY/g
 s/\bSTARPU_FUT_CUDA_KEY\b/_STARPU_FUT_CUDA_KEY/g