13 years ago · 5f656f76cf
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,7 +14,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-StarPU 1.0 (svn revision xxxx)
			
 
				+StarPU 1.0.0 (svn revision 6306)
			
 
				 ==============================================
			
 
				 The extensions-again release
			
 
				 
			
@@ -61,6 +61,10 @@ Changes:
 
				         To port code to use new names (this is not mandatory), the
			
 
				         tools/dev/rename.sh script can be used, and pkg-config starpu-1.0 should
			
 
				         be used.
			
 
				+  * The communication cost in the heft and dmda scheduling strategies now
			
 
				+        take into account the contention brought by the number of GPUs. This
			
 
				+        changes the meaning of the beta factor, whose default 1.0 value should
			
 
				+        now be good enough in most case.
			
 
				 
			
 
				 Small features:
			
 
				   * Allow users to disable asynchronous data transfers between CPUs and
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -71,6 +71,10 @@ versinclude_HEADERS = 				\
 
				 nodist_versinclude_HEADERS = 			\
			
 
				 	include/starpu_config.h
			
 
				 
			
 
				+noinst_HEADERS = \
			
 
				+	include/pthread_win32/pthread.h		\
			
 
				+	include/pthread_win32/semaphore.h
			
 
				+
			
 
				 if BUILD_STARPU_TOP
			
 
				 starpu-top/starpu_top$(EXEEXT): all-local
			
 
				 all-local:
			
--- a/STARPU-VERSION
+++ b/STARPU-VERSION
@@ -3,19 +3,19 @@
 
				 # Versioning (SONAMEs) for StarPU libraries.
			
 
				 
			
 
				 # Libtool interface versioning (info "(libtool) Versioning").
			
 
				-LIBSTARPU_INTERFACE_CURRENT=0	# increment upon ABI change
			
 
				+LIBSTARPU_INTERFACE_CURRENT=1	# increment upon ABI change
			
 
				 LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change
			
 
				 LIBSTARPU_INTERFACE_AGE=0	# set to CURRENT - PREVIOUS interface
			
 
				 STARPU_EFFECTIVE_VERSION=1.0
			
 
				 
			
 
				-LIBSTARPUFFT_INTERFACE_CURRENT=0	# increment upon ABI change
			
 
				+LIBSTARPUFFT_INTERFACE_CURRENT=1	# increment upon ABI change
			
 
				 LIBSTARPUFFT_INTERFACE_REVISION=0	# increment upon implementation change
			
 
				 LIBSTARPUFFT_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
			
 
				 
			
 
				-LIBSTARPUMPI_INTERFACE_CURRENT=0	# increment upon ABI change
			
 
				+LIBSTARPUMPI_INTERFACE_CURRENT=1	# increment upon ABI change
			
 
				 LIBSTARPUMPI_INTERFACE_REVISION=0	# increment upon implementation change
			
 
				 LIBSTARPUMPI_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
			
 
				 
			
 
				-LIBSOCL_INTERFACE_CURRENT=0	# increment upon ABI change
			
 
				+LIBSOCL_INTERFACE_CURRENT=1	# increment upon ABI change
			
 
				 LIBSOCL_INTERFACE_REVISION=0	# increment upon implementation change
			
 
				 LIBSOCL_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
			
--- a/configure.ac
+++ b/configure.ac
@@ -16,7 +16,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AC_INIT([StarPU],1.0.0rc3, [starpu-devel@lists.gforge.inria.fr], starpu)
			
 
				+AC_INIT([StarPU],1.0.0, [starpu-devel@lists.gforge.inria.fr], starpu)
			
 
				 AC_CONFIG_SRCDIR(include/starpu.h)
			
 
				 AC_CONFIG_AUX_DIR([build-aux])
			
 
				 
			
@@ -116,7 +116,21 @@ case "${srcdir}" in
 
				      AC_SUBST(STARPU_SRC_DIR, "$(eval echo $PWD/${srcdir})") ;;
			
 
				 esac
			
 
				 
			
 
				-AC_CHECK_LIB([pthread], [pthread_create])
			
 
				+
			
 
				+case "$target" in
			
 
				+*-*-mingw*|*-*-cygwin*)
			
 
				+    AC_ARG_ENABLE(native-winthreads, [AS_HELP_STRING([--enable-native-winthreads],
			
 
				+    				   [Use native windows threads instead of pthread])],
			
 
				+    				   enable_native_winthreads=$enableval, enable_native_winthreads=no)
			
 
				+    ;;
			
 
				+esac
			
 
				+if test x"$enable_native_winthreads" = xyes
			
 
				+then
			
 
				+    CPPFLAGS+=" -I$STARPU_SRC_DIR/include/pthread_win32 "
			
 
				+else
			
 
				+    AC_CHECK_LIB([pthread], [pthread_create])
			
 
				+fi
			
 
				+
			
 
				 AC_COMPILE_IFELSE(
			
 
				   AC_LANG_PROGRAM([[
			
 
				     #include <pthread.h>
			
@@ -1044,7 +1058,7 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
				 
			
 
				 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
			
 
				 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
			
 
				-running_mpi_check=0
			
 
				+running_mpi_check=no
			
 
				 if test -d "$srcdir/.svn" ; then
			
 
				     running_mpi_check=yes
			
 
				 fi
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
--- a/doc/chapters/basic-examples.texi
+++ b/doc/chapters/basic-examples.texi
@@ -71,12 +71,8 @@ has a single implementation for CPU:
 
				 /* Task declaration.  */
			
 
				 static void my_task (int x) __attribute__ ((task));
			
 
				 
			
 
				-/* Declaration of the CPU implementation of `my_task'.  */
			
 
				-static void my_task_cpu (int x)
			
 
				-   __attribute__ ((task_implementation ("cpu", my_task)));
			
 
				-
			
 
				-/* Definition of said CPU implementation.  */
			
 
				-static void my_task_cpu (int x)
			
 
				+/* Definition of the CPU implementation of `my_task'.  */
			
 
				+static void my_task (int x)
			
 
				 @{
			
 
				   printf ("Hello, world!  With x = %d\n", x);
			
 
				 @}
			
@@ -331,19 +327,13 @@ has to be defined:
 
				 @cartouche
			
 
				 @smallexample
			
 
				 /* Declare the `vector_scal' task.  */
			
 
				-
			
 
				 static void vector_scal (unsigned size, float vector[size],
			
 
				                          float factor)
			
 
				   __attribute__ ((task));
			
 
				 
			
 
				-/* Declare and define the standard CPU implementation.  */
			
 
				-
			
 
				-static void vector_scal_cpu (unsigned size, float vector[size],
			
 
				-                             float factor)
			
 
				-  __attribute__ ((task_implementation ("cpu", vector_scal)));
			
 
				-
			
 
				+/* Define the standard CPU implementation.  */
			
 
				 static void
			
 
				-vector_scal_cpu (unsigned size, float vector[size], float factor)
			
 
				+vector_scal (unsigned size, float vector[size], float factor)
			
 
				 @{
			
 
				   unsigned i;
			
 
				   for (i = 0; i < size; i++)
			
--- a/doc/chapters/c-extensions.texi
+++ b/doc/chapters/c-extensions.texi
@@ -87,8 +87,11 @@ The following function attributes are provided:
 
				 @item task
			
 
				 @cindex @code{task} attribute
			
 
				 Declare the given function as a StarPU task.  Its return type must be
			
 
				-@code{void}, and it must not be defined---instead, a definition will
			
 
				-automatically be provided by the compiler.
			
 
				+@code{void}.  When a function declared as @code{task} has a user-defined
			
 
				+body, that body is interpreted as the @dfn{implicit definition of the
			
 
				+task's CPU implementation} (see example below).  In all cases, the
			
 
				+actual definition of a task's body is automatically generated by the
			
 
				+compiler.
			
 
				 
			
 
				 Under the hood, declaring a task leads to the declaration of the
			
 
				 corresponding @code{codelet} (@pxref{Codelet and Tasks}).  If one or
			
@@ -157,6 +160,44 @@ A @code{matmult} task is defined; it has only one implementation,
 
				 @var{B} are input buffers, whereas @var{C} is considered an input/output
			
 
				 buffer.
			
 
				 
			
 
				+@cindex implicit task CPU implementation
			
 
				+For convenience, when a function declared with the @code{task} attribute
			
 
				+has a user-defined body, that body is assumed to be that of the CPU
			
 
				+implementation of a task, which we call an @dfn{implicit task CPU
			
 
				+implementation}.  Thus, the above snippet can be simplified like this:
			
 
				+
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				+#define __output  __attribute__ ((output))
			
 
				+
			
 
				+static void matmul (const float *A, const float *B,
			
 
				+                    __output float *C,
			
 
				+                    unsigned nx, unsigned ny, unsigned nz)
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				+/* Implicit definition of the CPU implementation of the
			
 
				+   `matmul' task.  */
			
 
				+static void
			
 
				+matmul (const float *A, const float *B, __output float *C,
			
 
				+        unsigned nx, unsigned ny, unsigned nz)
			
 
				+@{
			
 
				+  unsigned i, j, k;
			
 
				+
			
 
				+  for (j = 0; j < ny; j++)
			
 
				+    for (i = 0; i < nx; i++)
			
 
				+      @{
			
 
				+        for (k = 0; k < nz; k++)
			
 
				+          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
			
 
				+      @}
			
 
				+@}
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				+
			
 
				+@noindent
			
 
				+Use of implicit CPU task implementations as above has the advantage that
			
 
				+the code is valid sequential code when StarPU's GCC plug-in is not used
			
 
				+(@pxref{Conditional Extensions}).
			
 
				+
			
 
				 CUDA and OpenCL implementations can be declared in a similar way:
			
 
				 
			
 
				 @cartouche
			
@@ -349,36 +390,37 @@ The code below illustrates how to define a task and its implementations
 
				 in a way that allows it to be compiled without the GCC plug-in:
			
 
				 
			
 
				 @smallexample
			
 
				-/* The macros below abstract over the attributes specific to
			
 
				-   StarPU-GCC and the name of the CPU implementation.  */
			
 
				-#ifdef STARPU_GCC_PLUGIN
			
 
				-# define __task  __attribute__ ((task))
			
 
				-# define CPU_TASK_IMPL(task)  task ## _cpu
			
 
				-#else
			
 
				-# define __task
			
 
				-# define CPU_TASK_IMPL(task)  task
			
 
				-#endif
			
 
				+/* This program is valid, whether or not StarPU's GCC plug-in
			
 
				+   is being used.  */
			
 
				 
			
 
				 #include <stdlib.h>
			
 
				 
			
 
				-static void matmul (const float *A, const float *B, float *C,
			
 
				-                    unsigned nx, unsigned ny, unsigned nz) __task;
			
 
				-
			
 
				-#ifdef STARPU_GCC_PLUGIN
			
 
				+/* The attribute below is ignored when GCC is not used.  */
			
 
				+static void matmul (const float *A, const float *B, float * C,
			
 
				+                    unsigned nx, unsigned ny, unsigned nz)
			
 
				+  __attribute__ ((task));
			
 
				 
			
 
				-static void matmul_cpu (const float *A, const float *B, float *C,
			
 
				-                        unsigned nx, unsigned ny, unsigned nz)
			
 
				-  __attribute__ ((task_implementation ("cpu", matmul)));
			
 
				+static void
			
 
				+matmul (const float *A, const float *B, float * C,
			
 
				+        unsigned nx, unsigned ny, unsigned nz)
			
 
				+@{
			
 
				+  /* Code of the CPU kernel here...  */
			
 
				+@}
			
 
				 
			
 
				-#endif
			
 
				+#ifdef STARPU_GCC_PLUGIN
			
 
				+/* Optional OpenCL task implementation.  */
			
 
				 
			
 
				+static void matmul_opencl (const float *A, const float *B, float * C,
			
 
				+                           unsigned nx, unsigned ny, unsigned nz)
			
 
				+  __attribute__ ((task_implementation ("opencl", matmul)));
			
 
				 
			
 
				 static void
			
 
				-CPU_TASK_IMPL (matmul) (const float *A, const float *B, float *C,
			
 
				-                        unsigned nx, unsigned ny, unsigned nz)
			
 
				+matmul_opencl (const float *A, const float *B, float * C,
			
 
				+               unsigned nx, unsigned ny, unsigned nz)
			
 
				 @{
			
 
				-  /* Code of the CPU kernel here...  */
			
 
				+  /* Code that invokes the OpenCL kernel here...  */
			
 
				 @}
			
 
				+#endif
			
 
				 
			
 
				 int
			
 
				 main (int argc, char *argv[])
			
@@ -395,7 +437,7 @@ main (int argc, char *argv[])
 
				 
			
 
				   /* When StarPU-GCC is used, the call below is asynchronous;
			
 
				      otherwise, it is synchronous.  */
			
 
				-  matmul (A, B, C, 123, 42, 7);
			
 
				+  matmul ((float *) A, (float *) B, (float *) C, 123, 42, 7);
			
 
				 
			
 
				 #pragma starpu wait
			
 
				 #pragma starpu shutdown
			
@@ -404,12 +446,32 @@ main (int argc, char *argv[])
 
				 @}
			
 
				 @end smallexample
			
 
				 
			
 
				-Note that attributes such as @code{task} are simply ignored by GCC when
			
 
				-the StarPU plug-in is not loaded, so the @code{__task} macro could be
			
 
				-omitted altogether.  However, @command{gcc -Wall} emits a warning for
			
 
				-unknown attributes, which can be inconvenient, and other compilers may
			
 
				-be unable to parse the attribute syntax.  Thus, using macros such as
			
 
				-@code{__task} above is recommended.
			
 
				+@noindent
			
 
				+The above program is a valid StarPU program when StarPU's GCC plug-in is
			
 
				+used; it is also a valid sequential program when the plug-in is not
			
 
				+used.
			
 
				+
			
 
				+Note that attributes such as @code{task} as well as @code{starpu}
			
 
				+pragmas are simply ignored by GCC when the StarPU plug-in is not loaded.
			
 
				+However, @command{gcc -Wall} emits a warning for unknown attributes and
			
 
				+pragmas, which can be inconvenient.  In addition, other compilers may be
			
 
				+unable to parse the attribute syntax@footnote{In practice, Clang and
			
 
				+several proprietary compilers implement attributes.}, so you may want to
			
 
				+wrap attributes in macros like this:
			
 
				+
			
 
				+@smallexample
			
 
				+/* Use the `task' attribute only when StarPU's GCC plug-in
			
 
				+   is available.   */
			
 
				+#ifdef STARPU_GCC_PLUGIN
			
 
				+# define __task  __attribute__ ((task))
			
 
				+#else
			
 
				+# define __task
			
 
				+#endif
			
 
				+
			
 
				+static void matmul (const float *A, const float *B, float *C,
			
 
				+                    unsigned nx, unsigned ny, unsigned nz) __task;
			
 
				+@end smallexample
			
 
				+
			
 
				 
			
 
				 @c Local Variables:
			
 
				 @c TeX-master: "../starpu.texi"
			
--- a/examples/cpp/incrementer_cpp.cpp
+++ b/examples/cpp/incrementer_cpp.cpp
@@ -89,17 +89,19 @@ int main(int argc, char **argv)
 
				 	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
			
 
				                 float_array[1], float_array[2], float_array[3]);
			
 
				 
			
 
				-	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
			
 
				-	{
			
 
				-		FPRINTF(stderr, "Incorrect result\n");
			
 
				-		ret = 1;
			
 
				-	}
			
 
				-
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	ret = starpu_opencl_unload_opencl(&opencl_program);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
			
 
				 #endif
			
 
				+
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	return ret;
			
 
				+	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
			
 
				+	{
			
 
				+		FPRINTF(stderr, "Incorrect result\n");
			
 
				+		return EXIT_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				 }
			
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -542,14 +542,14 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 
				 				size, CL_MEM_READ_WRITE);
			
 
				 		assert(ret == CL_SUCCESS);
			
 
				 	}
			
 
				-	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_custom->cpu_ptr,
			
 
				-							  src_node,
			
 
				-							  dst_custom->cpu_ptr,
			
 
				-							  dst_node,
			
 
				-							  size,
			
 
				-							  0,
			
 
				-							  NULL,
			
 
				-							  &ret);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl(src_custom->cpu_ptr,
			
 
				+					       src_node,
			
 
				+					       dst_custom->cpu_ptr,
			
 
				+					       dst_node,
			
 
				+					       size,
			
 
				+					       0,
			
 
				+					       NULL,
			
 
				+					       &ret);
			
 
				 	assert(err == 0);
			
 
				 	return 0;
			
 
				 }
			
@@ -584,15 +584,14 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 
				 		assert(dst_custom->opencl_ptr != NULL);
			
 
				 	}
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram_async_sync(
			
 
				-			src_custom->opencl_ptr,
			
 
				-			src_node,
			
 
				-			dst_custom->opencl_ptr,
			
 
				-			dst_node,
			
 
				-			size,
			
 
				-			0,
			
 
				-			NULL,
			
 
				-			&ret);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram(src_custom->opencl_ptr,
			
 
				+					       src_node,
			
 
				+					       dst_custom->opencl_ptr,
			
 
				+					       dst_node,
			
 
				+					       size,
			
 
				+					       0,
			
 
				+					       NULL,
			
 
				+					       &ret);
			
 
				 	assert(err == 0);
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/interface/complex_interface.c
+++ b/examples/interface/complex_interface.c
@@ -204,25 +204,25 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
 
				 
			
 
				 	cl_int err;
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl(
			
 
				-		src_complex->real,
			
 
				-		src_node,
			
 
				-		(cl_mem) dst_complex->real,
			
 
				-		dst_node,
			
 
				-		src_complex->nx * sizeof(src_complex->real[0]),
			
 
				-		0,
			
 
				-		NULL);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
			
 
				+					       src_node,
			
 
				+					       (cl_mem) dst_complex->real,
			
 
				+					       dst_node,
			
 
				+					       src_complex->nx * sizeof(src_complex->real[0]),
			
 
				+					       0,
			
 
				+					       NULL,
			
 
				+					       NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl(
			
 
				-		src_complex->imaginary,
			
 
				-		src_node,
			
 
				-		(cl_mem) dst_complex->imaginary,
			
 
				-		dst_node,
			
 
				-		src_complex->nx * sizeof(src_complex->imaginary[0]),
			
 
				-		0,
			
 
				-		NULL);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
			
 
				+					       src_node,
			
 
				+					       (cl_mem) dst_complex->imaginary,
			
 
				+					       dst_node,
			
 
				+					       src_complex->nx * sizeof(src_complex->imaginary[0]),
			
 
				+					       0,
			
 
				+					       NULL,
			
 
				+					       NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -236,25 +236,25 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
 
				 	struct starpu_complex_interface *dst_complex = dst_interface;
			
 
				 
			
 
				 	cl_int err;
			
 
				-	err = starpu_opencl_copy_opencl_to_ram(
			
 
				-		(cl_mem) src_complex->real,
			
 
				-		src_node,
			
 
				-		dst_complex->real,
			
 
				-		dst_node,
			
 
				-		src_complex->nx * sizeof(src_complex->real[0]),
			
 
				-		0,
			
 
				-		NULL);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
			
 
				+					       src_node,
			
 
				+					       dst_complex->real,
			
 
				+					       dst_node,
			
 
				+					       src_complex->nx * sizeof(src_complex->real[0]),
			
 
				+					       0,
			
 
				+					       NULL,
			
 
				+					       NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram(
			
 
				-		(cl_mem) src_complex->imaginary,
			
 
				-		src_node,
			
 
				-		dst_complex->imaginary,
			
 
				-		dst_node,
			
 
				-		src_complex->nx * sizeof(src_complex->imaginary[0]),
			
 
				-		0,
			
 
				-		NULL);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
			
 
				+					       src_node,
			
 
				+					       dst_complex->imaginary,
			
 
				+					       dst_node,
			
 
				+					       src_complex->nx * sizeof(src_complex->imaginary[0]),
			
 
				+					       0,
			
 
				+					       NULL,
			
 
				+					       NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -19,8 +19,13 @@
 
				 #include <limits.h>
			
 
				 #include <starpu.h>
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+static unsigned nblocks = 512;
			
 
				+static unsigned entries_per_bock = 64;
			
 
				+#else
			
 
				 static unsigned nblocks = 8192;
			
 
				 static unsigned entries_per_bock = 1024;
			
 
				+#endif
			
 
				 
			
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 
			
--- a/gcc-plugin/examples/vector_scal/vector_scal.c
+++ b/gcc-plugin/examples/vector_scal/vector_scal.c
@@ -13,13 +13,12 @@
 
				  *
			
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				-#ifndef STARPU_GCC_PLUGIN
			
 
				-# error must be compiled with the StarPU GCC plug-in
			
 
				-#endif
			
 
				 
			
 
				-/* This examples showcases features of the StarPU GCC plug-in.  It defines a
			
 
				+/* This example showcases features of the StarPU GCC plug-in.  It defines a
			
 
				    "vector scaling" task with multiple CPU implementations, an OpenCL
			
 
				-   implementation, and a CUDA implementation.   */
			
 
				+   implementation, and a CUDA implementation.
			
 
				+
			
 
				+   Compiling it without `-fplugin=starpu.so' yields valid sequential code.  */
			
 
				 
			
 
				 #include <math.h>
			
 
				 #include <stdbool.h>
			
@@ -32,11 +31,9 @@
 
				 static void vector_scal (unsigned int size, float vector[size], float factor)
			
 
				   __attribute__ ((task));
			
 
				 
			
 
				-static void vector_scal_cpu (unsigned int size, float vector[size], float factor)
			
 
				-  __attribute__ ((task_implementation ("cpu", vector_scal)));
			
 
				-
			
 
				+/* The CPU implementation.  */
			
 
				 static void
			
 
				-vector_scal_cpu (unsigned int size, float vector[size], float factor)
			
 
				+vector_scal (unsigned int size, float vector[size], float factor)
			
 
				 {
			
 
				   unsigned int i;
			
 
				   for (i = 0; i < size; i++)
			
@@ -44,7 +41,7 @@ vector_scal_cpu (unsigned int size, float vector[size], float factor)
 
				 }
			
 
				 
			
 
				 
			
 
				-#ifdef __SSE__
			
 
				+#if defined STARPU_GCC_PLUGIN && defined __SSE__
			
 
				 /* The SSE-capable CPU implementation.  */
			
 
				 
			
 
				 #include <xmmintrin.h>
			
@@ -78,7 +75,7 @@ vector_scal_sse (unsigned int size, float vector[size], float factor)
 
				 
			
 
				 /* Declaration and definition of the OpenCL implementation.  */
			
 
				 
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
			
 
				 
			
 
				 #include <starpu_opencl.h>
			
 
				 
			
@@ -166,7 +163,7 @@ main (void)
 
				 
			
 
				 #pragma starpu initialize
			
 
				 
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
			
 
				   starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
			
 
				 				       &cl_programs, "");
			
 
				 #endif
			
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
@@ -133,11 +133,15 @@ static tree unpack_fn, data_lookup_fn;
 
				 /* Forward declarations.  */
			
 
				 
			
 
				 static tree build_codelet_declaration (tree task_decl);
			
 
				+static tree build_cpu_codelet_identifier (const_tree task);
			
 
				 static void define_task (tree task_decl);
			
 
				 static tree build_pointer_lookup (tree pointer);
			
 
				 
			
 
				 static bool task_p (const_tree decl);
			
 
				 static bool task_implementation_p (const_tree decl);
			
 
				+static tree task_implementation_task (const_tree task_impl);
			
 
				+static bool implicit_cpu_task_implementation_p (const_tree fn);
			
 
				+
			
 
				 
			
 
				 static int task_implementation_target_to_int (const_tree target);
			
 
				 
			
@@ -990,6 +994,31 @@ register_pragmas (void *gcc_data, void *user_data)
 
				 /* Attributes.  */
			
 
				 
			
 
				 
			
 
				+/* Turn FN into a task, and push its associated codelet declaration.  */
			
 
				+
			
 
				+static void
			
 
				+taskify_function (tree fn)
			
 
				+{
			
 
				+  gcc_assert (TREE_CODE (fn) == FUNCTION_DECL);
			
 
				+
			
 
				+  /* Add a `task' attribute and an empty `task_implementation_list'
			
 
				+     attribute.  */
			
 
				+  DECL_ATTRIBUTES (fn) =
			
 
				+    tree_cons (get_identifier (task_implementation_list_attribute_name),
			
 
				+	       NULL_TREE,
			
 
				+	       tree_cons (get_identifier (task_attribute_name), NULL_TREE,
			
 
				+			  DECL_ATTRIBUTES (fn)));
			
 
				+
			
 
				+  /* Push a declaration for the corresponding `struct starpu_codelet' object and
			
 
				+     add it as an attribute of FN.  */
			
 
				+  tree cl = build_codelet_declaration (fn);
			
 
				+  DECL_ATTRIBUTES (fn) =
			
 
				+    tree_cons (get_identifier (task_codelet_attribute_name), cl,
			
 
				+	       DECL_ATTRIBUTES (fn));
			
 
				+
			
 
				+  pushdecl (cl);
			
 
				+}
			
 
				+
			
 
				 /* Handle the `task' function attribute.  */
			
 
				 
			
 
				 static tree
			
@@ -1020,23 +1049,8 @@ handle_task_attribute (tree *node, tree name, tree args,
 
				 	error_at (DECL_SOURCE_LOCATION (fn),
			
 
				 		  "maximum number of pointer parameters exceeded");
			
 
				 
			
 
				-      /* This is a function declaration for something local to this
			
 
				-	 translation unit, so add the `task' attribute to FN.  */
			
 
				-      *no_add_attrs = false;
			
 
				-
			
 
				-      /* Add an empty `task_implementation_list' attribute.  */
			
 
				-      DECL_ATTRIBUTES (fn) =
			
 
				-	tree_cons (get_identifier (task_implementation_list_attribute_name),
			
 
				-		   NULL_TREE,
			
 
				-		   NULL_TREE);
			
 
				-
			
 
				-      /* Push a declaration for the corresponding `struct starpu_codelet' object and
			
 
				-	 add it as an attribute of FN.  */
			
 
				-      tree cl = build_codelet_declaration (fn);
			
 
				-      DECL_ATTRIBUTES (fn) =
			
 
				-	tree_cons (get_identifier (task_codelet_attribute_name), cl,
			
 
				-		   DECL_ATTRIBUTES (fn));
			
 
				-      pushdecl (cl);
			
 
				+      /* Turn FN into an actual task.  */
			
 
				+      taskify_function (fn);
			
 
				     }
			
 
				 
			
 
				   /* Lookup & cache function declarations for later reuse.  */
			
@@ -1067,6 +1081,7 @@ validate_opencl_argument_type (location_t loc, const_tree type)
 
				 	      /* Scalar types defined in OpenCL 1.2.  See
			
 
				 		 <http://www.khronos.org/files/opencl-1-2-quick-reference-card.pdf>.  */
			
 
				 	      { "char", "cl_char" },
			
 
				+	      { "signed char", "cl_char" },
			
 
				 	      { "unsigned char", "cl_uchar" },
			
 
				 	      { "uchar", "cl_uchar" },
			
 
				 	      { "short int", "cl_short" },
			
@@ -1135,6 +1150,43 @@ validate_opencl_argument_type (location_t loc, const_tree type)
 
				     }
			
 
				 }
			
 
				 
			
 
				+/* Add FN to the list of implementations of TASK_DECL.  */
			
 
				+
			
 
				+static void
			
 
				+add_task_implementation (tree task_decl, tree fn, const_tree where)
			
 
				+{
			
 
				+  location_t loc;
			
 
				+  tree attr, impls;
			
 
				+
			
 
				+  attr = lookup_attribute (task_implementation_list_attribute_name,
			
 
				+			   DECL_ATTRIBUTES (task_decl));
			
 
				+  gcc_assert (attr != NULL_TREE);
			
 
				+
			
 
				+  gcc_assert (TREE_CODE (where) == STRING_CST);
			
 
				+
			
 
				+  loc = DECL_SOURCE_LOCATION (fn);
			
 
				+
			
 
				+  impls = tree_cons (NULL_TREE, fn, TREE_VALUE (attr));
			
 
				+  TREE_VALUE (attr) = impls;
			
 
				+
			
 
				+  TREE_USED (fn) = true;
			
 
				+
			
 
				+  /* Check the `where' argument to raise a warning if needed.  */
			
 
				+  if (task_implementation_target_to_int (where) == 0)
			
 
				+    warning_at (loc, 0,
			
 
				+		"unsupported target %E; task implementation won't be used",
			
 
				+		where);
			
 
				+  else if (task_implementation_target_to_int (where) == STARPU_OPENCL)
			
 
				+    {
			
 
				+      local_define (void, validate, (tree t))
			
 
				+	{
			
 
				+	  validate_opencl_argument_type (loc, t);
			
 
				+	};
			
 
				+
			
 
				+      for_each (validate, TYPE_ARG_TYPES (TREE_TYPE (fn)));
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 /* Handle the `task_implementation (WHERE, TASK)' attribute.  WHERE is a
			
 
				    string constant ("cpu", "cuda", etc.), and TASK is the identifier of a
			
 
				    function declared with the `task' attribute.  */
			
@@ -1155,6 +1207,12 @@ handle_task_implementation_attribute (tree *node, tree name, tree args,
 
				   where = TREE_VALUE (args);
			
 
				   task_decl = TREE_VALUE (TREE_CHAIN (args));
			
 
				 
			
 
				+  if (implicit_cpu_task_implementation_p (task_decl))
			
 
				+    /* TASK_DECL is actually a CPU implementation.  Implicit CPU task
			
 
				+       implementations can lead to this situation, because the task is
			
 
				+       renamed and modified to become a CPU implementation.  */
			
 
				+    task_decl = task_implementation_task (task_decl);
			
 
				+
			
 
				   loc = DECL_SOURCE_LOCATION (fn);
			
 
				 
			
 
				   /* Get rid of the `task_implementation' attribute by default so that FN
			
@@ -1184,30 +1242,7 @@ handle_task_implementation_attribute (tree *node, tree name, tree args,
 
				   else
			
 
				     {
			
 
				       /* Add FN to the list of implementations of TASK_DECL.  */
			
 
				-
			
 
				-      tree attr, impls;
			
 
				-
			
 
				-      attr = lookup_attribute (task_implementation_list_attribute_name,
			
 
				-			       DECL_ATTRIBUTES (task_decl));
			
 
				-      impls = tree_cons (NULL_TREE, fn, TREE_VALUE (attr));
			
 
				-      TREE_VALUE (attr) = impls;
			
 
				-
			
 
				-      TREE_USED (fn) = TREE_USED (task_decl);
			
 
				-
			
 
				-      /* Check the `where' argument to raise a warning if needed.  */
			
 
				-      if (task_implementation_target_to_int (where) == 0)
			
 
				-	warning_at (loc, 0,
			
 
				-		    "unsupported target %E; task implementation won't be used",
			
 
				-		    where);
			
 
				-      else if (task_implementation_target_to_int (where) == STARPU_OPENCL)
			
 
				-	{
			
 
				-	  local_define (void, validate, (tree t))
			
 
				-	  {
			
 
				-	    validate_opencl_argument_type (loc, t);
			
 
				-	  };
			
 
				-
			
 
				-	  for_each (validate, TYPE_ARG_TYPES (TREE_TYPE (fn)));
			
 
				-	}
			
 
				+      add_task_implementation (task_decl, fn, where);
			
 
				 
			
 
				       /* Keep the attribute.  */
			
 
				       *no_add_attrs = false;
			
@@ -1455,7 +1490,7 @@ task_where (const_tree task_decl)
 
				 static tree
			
 
				 task_implementation_task (const_tree task_impl)
			
 
				 {
			
 
				-  tree impl_attr, args;
			
 
				+  tree impl_attr, args, task;
			
 
				 
			
 
				   gcc_assert (TREE_CODE (task_impl) == FUNCTION_DECL);
			
 
				 
			
@@ -1465,7 +1500,13 @@ task_implementation_task (const_tree task_impl)
 
				 
			
 
				   args = TREE_VALUE (impl_attr);
			
 
				 
			
 
				-  return TREE_VALUE (TREE_CHAIN (args));
			
 
				+  task = TREE_VALUE (TREE_CHAIN (args));
			
 
				+  if (task_implementation_p (task))
			
 
				+    /* TASK is an implicit CPU task implementation, so return its real
			
 
				+       task.  */
			
 
				+    return task_implementation_task (task);
			
 
				+
			
 
				+  return task;
			
 
				 }
			
 
				 
			
 
				 /* Return the FUNCTION_DECL of the wrapper generated for TASK_IMPL.  */
			
@@ -1484,6 +1525,23 @@ task_implementation_wrapper (const_tree task_impl)
 
				   return TREE_VALUE (attr);
			
 
				 }
			
 
				 
			
 
				+/* Return true when FN is an implicit CPU task implementation.  */
			
 
				+
			
 
				+static bool
			
 
				+implicit_cpu_task_implementation_p (const_tree fn)
			
 
				+{
			
 
				+  if (task_implementation_p (fn)
			
 
				+      && task_implementation_where (fn) == STARPU_CPU)
			
 
				+    {
			
 
				+      /* XXX: Hackish heuristic.  */
			
 
				+      const_tree cpu_id;
			
 
				+      cpu_id = build_cpu_codelet_identifier (task_implementation_task (fn));
			
 
				+      return cpu_id == DECL_NAME (fn);
			
 
				+    }
			
 
				+
			
 
				+  return false;
			
 
				+}
			
 
				+
			
 
				 /* Return true when VAR_DECL has the `heap_allocated' attribute.  */
			
 
				 
			
 
				 static bool
			
@@ -2063,6 +2121,27 @@ declare_codelet (tree task_decl)
 
				   return cl_decl;
			
 
				 }
			
 
				 
			
 
				+/* Return the identifier for an automatically-generated CPU codelet of
			
 
				+   TASK.  */
			
 
				+
			
 
				+static tree
			
 
				+build_cpu_codelet_identifier (const_tree task)
			
 
				+{
			
 
				+  static const char suffix[] = ".cpu_implementation";
			
 
				+
			
 
				+  tree id;
			
 
				+  char *cl_name;
			
 
				+  const char *task_name;
			
 
				+
			
 
				+  id = DECL_NAME (task);
			
 
				+  task_name = IDENTIFIER_POINTER (id);
			
 
				+
			
 
				+  cl_name = (char *) alloca (IDENTIFIER_LENGTH (id) + strlen (suffix) + 1);
			
 
				+  memcpy (cl_name, task_name, IDENTIFIER_LENGTH (id));
			
 
				+  strcpy (&cl_name[IDENTIFIER_LENGTH (id)], suffix);
			
 
				+
			
 
				+  return get_identifier (cl_name);
			
 
				+}
			
 
				 
			
 
				 static void
			
 
				 handle_pre_genericize (void *gcc_data, void *user_data)
			
@@ -2072,10 +2151,56 @@ handle_pre_genericize (void *gcc_data, void *user_data)
 
				   gcc_assert (TREE_CODE (fn) == FUNCTION_DECL);
			
 
				 
			
 
				   if (task_p (fn) && TREE_STATIC (fn))
			
 
				-    /* The user defined a body for task FN, which is forbidden.  */
			
 
				-    error_at (DECL_SOURCE_LOCATION (fn),
			
 
				-	      "task %qE must not have a body", DECL_NAME (fn));
			
 
				-  else if (task_implementation_p (fn))
			
 
				+    {
			
 
				+      /* The user defined a body for task FN, which we interpret as being the
			
 
				+	 body of an implicit CPU task implementation for FN.  Thus, rename FN
			
 
				+	 and turn it into the "cpu" implementation of a task that we create
			
 
				+	 under FN's original name (this is easier than moving the body to a
			
 
				+	 different function, which would require traversing the body to
			
 
				+	 rewrite all references to FN to point to the new function.)  Later,
			
 
				+	 `lower_starpu' rewrites calls to FN as calls to the newly created
			
 
				+	 task.  */
			
 
				+
			
 
				+      tree task_name = DECL_NAME (fn);
			
 
				+
			
 
				+      tree cpu_impl = fn;
			
 
				+      DECL_NAME (cpu_impl) = build_cpu_codelet_identifier (fn);
			
 
				+      if (verbose_output_p)
			
 
				+	inform (DECL_SOURCE_LOCATION (fn),
			
 
				+		"implicit CPU implementation renamed from %qE to %qE",
			
 
				+		task_name, DECL_NAME (cpu_impl));
			
 
				+
			
 
				+      tree task = build_decl (DECL_SOURCE_LOCATION (fn), FUNCTION_DECL,
			
 
				+			      task_name, TREE_TYPE (fn));
			
 
				+
			
 
				+      TREE_PUBLIC (task) = TREE_PUBLIC (fn);
			
 
				+      TREE_PUBLIC (cpu_impl) = false;
			
 
				+
			
 
				+      taskify_function (task);
			
 
				+
			
 
				+      /* Inherit the task implementation list from FN.  */
			
 
				+      tree impls = lookup_attribute (task_implementation_list_attribute_name,
			
 
				+				     DECL_ATTRIBUTES (fn));
			
 
				+      gcc_assert (impls != NULL_TREE);
			
 
				+      impls = TREE_VALUE (impls);
			
 
				+
			
 
				+      DECL_ATTRIBUTES (task) =
			
 
				+	tree_cons (get_identifier (task_implementation_list_attribute_name),
			
 
				+		   impls, DECL_ATTRIBUTES (task));
			
 
				+
			
 
				+      /* Make CPU_IMPL an implementation of FN.  */
			
 
				+      DECL_ATTRIBUTES (cpu_impl) =
			
 
				+	tree_cons (get_identifier (task_implementation_attribute_name),
			
 
				+		   tree_cons (NULL_TREE, build_string (3, "cpu"),
			
 
				+			      tree_cons (NULL_TREE, task, NULL_TREE)),
			
 
				+		   NULL_TREE);
			
 
				+
			
 
				+      add_task_implementation (task, cpu_impl, build_string (3, "cpu"));
			
 
				+
			
 
				+      /* And now, process CPU_IMPL.  */
			
 
				+    }
			
 
				+
			
 
				+  if (task_implementation_p (fn))
			
 
				     {
			
 
				       tree task = task_implementation_task (fn);
			
 
				 
			
@@ -2117,6 +2242,7 @@ handle_pre_genericize (void *gcc_data, void *user_data)
 
				 	  rest_of_decl_compilation (task, true, 0);
			
 
				 	  allocate_struct_function (task, false);
			
 
				 	  cgraph_finalize_function (task, false);
			
 
				+	  cgraph_mark_needed_node (cgraph_get_node (task));
			
 
				 	}
			
 
				     }
			
 
				 }
			
@@ -2265,6 +2391,7 @@ define_task (tree task_decl)
 
				   DECL_SAVED_TREE (task_decl) = bind;
			
 
				   TREE_STATIC (task_decl) = true;
			
 
				   DECL_EXTERNAL (task_decl) = false;
			
 
				+  DECL_ARTIFICIAL (task_decl) = true;
			
 
				   DECL_INITIAL (task_decl) =
			
 
				     build_block (error_var, NULL_TREE, task_decl, NULL_TREE);
			
 
				   DECL_RESULT (task_decl) =
			
@@ -2398,12 +2525,28 @@ lower_starpu (void)
 
				     {
			
 
				       gcc_assert (callee->callee != NULL);
			
 
				 
			
 
				-      tree callee_decl;
			
 
				+      tree callee_decl, caller_decl;
			
 
				 
			
 
				       callee_decl = callee->callee->decl;
			
 
				+      caller_decl = callee->caller->decl;
			
 
				+
			
 
				+      if (implicit_cpu_task_implementation_p (callee_decl)
			
 
				+	  && !DECL_ARTIFICIAL (caller_decl))
			
 
				+	{
			
 
				+	  /* Rewrite the call to point to the actual task beneath
			
 
				+	     CALLEE_DECL.  */
			
 
				+	  callee_decl = task_implementation_task (callee_decl);
			
 
				+	  if (verbose_output_p)
			
 
				+	    inform (gimple_location (callee->call_stmt),
			
 
				+		    "call to %qE rewritten as a call to task %qE",
			
 
				+		    DECL_NAME (callee->callee->decl),
			
 
				+		    DECL_NAME (callee_decl));
			
 
				+
			
 
				+	  gimple_call_set_fn (callee->call_stmt,
			
 
				+			      build_addr (callee_decl, callee->caller->decl));
			
 
				+	}
			
 
				 
			
 
				-      if (lookup_attribute (task_attribute_name,
			
 
				-			    DECL_ATTRIBUTES (callee_decl)))
			
 
				+      if (task_p (callee_decl))
			
 
				 	{
			
 
				 	  if (verbose_output_p)
			
 
				 	    inform (gimple_location (callee->call_stmt),
			
--- a/gcc-plugin/tests/base.c
+++ b/gcc-plugin/tests/base.c
@@ -72,7 +72,26 @@ my_other_task_opencl (int x)
 
				   printf ("opencl\n");
			
 
				 }
			
 
				 
			
 
				+
			
 
				+/* Task with a body.  */
			
 
				+
			
 
				+static void my_task_with_body (int x) __attribute__ ((task));
			
 
				+
			
 
				+static void
			
 
				+my_task_with_body (int x)
			
 
				+{
			
 
				+  /* This body is implicitly the "cpu" implementation of the task.  */
			
 
				+  int y = x + 2;
			
 
				+  printf ("body: %i\n", y - 2);
			
 
				+}
			
 
				 
			
 
				+static void my_task_with_body_opencl (int x)
			
 
				+  __attribute__ ((task_implementation ("opencl", my_task_with_body)));
			
 
				+
			
 
				+static void
			
 
				+my_task_with_body_opencl (int x)
			
 
				+{
			
 
				+}
			
 
				 
			
 
				 
			
 
				 int
			
@@ -124,7 +143,9 @@ main (int argc, char *argv[])
 
				 
			
 
				   my_other_task (42);
			
 
				 
			
 
				-  assert (tasks_submitted == 10);
			
 
				+  my_task_with_body (42);
			
 
				+
			
 
				+  assert (tasks_submitted == 11);
			
 
				 
			
 
				 #pragma starpu shutdown
			
 
				   assert (initialized == 0);
			
--- a/gcc-plugin/tests/lib-user.c
+++ b/gcc-plugin/tests/lib-user.c
@@ -1,5 +1,5 @@
 
				 /* GCC-StarPU
			
 
				-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
			
 
				 
			
 
				    GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				    it under the terms of the GNU General Public License as published by
			
@@ -30,7 +30,8 @@ main (int argc, char *argv[])
 
				 
			
 
				   /* Align X so that the assumptions behind `dummy_pointer_to_handle'
			
 
				      hold.  */
			
 
				-  static const char x[] __attribute__ ((aligned (8))) = { 0, 1, 2, 3, 4, 5 };
			
 
				+  static const signed char x[] __attribute__ ((aligned (8))) =
			
 
				+    { 0, 1, 2, 3, 4, 5 };
			
 
				 
			
 
				   float y[sizeof x];
			
 
				 
			
--- a/gcc-plugin/tests/my-lib.c
+++ b/gcc-plugin/tests/my-lib.c
@@ -1,5 +1,5 @@
 
				 /* GCC-StarPU
			
 
				-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
			
 
				 
			
 
				    GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				    it under the terms of the GNU General Public License as published by
			
@@ -24,11 +24,9 @@
 
				 /* Task implementations: one is `static', one is global.  The codelet
			
 
				    wrapper, codelet, and task body should be instantiated in this file.  */
			
 
				 
			
 
				-static void my_task_opencl (char, const char *, float *, int)
			
 
				-  __attribute__ ((task_implementation ("opencl", my_task)));
			
 
				-
			
 
				+/* The implicit CPU implementation of `my_task'.  */
			
 
				 void
			
 
				-my_task_cpu (char a, const char *p, float *q, int b)
			
 
				+my_task (signed char a, const signed char *p, float *q, int b)
			
 
				 {
			
 
				   int i;
			
 
				 
			
@@ -36,8 +34,8 @@ my_task_cpu (char a, const char *p, float *q, int b)
 
				     *q = *p + a;
			
 
				 }
			
 
				 
			
 
				-static void
			
 
				-my_task_opencl (char a, const char *p, float *q, int b)
			
 
				+void
			
 
				+my_task_opencl (signed char a, const signed char *p, float *q, int b)
			
 
				 {
			
 
				   int i;
			
 
				 
			
--- a/gcc-plugin/tests/my-lib.h
+++ b/gcc-plugin/tests/my-lib.h
@@ -1,5 +1,5 @@
 
				 /* GCC-StarPU
			
 
				-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
			
 
				 
			
 
				    GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				    it under the terms of the GNU General Public License as published by
			
@@ -19,12 +19,12 @@
 
				 #ifndef MY_LIB_H
			
 
				 #define MY_LIB_H
			
 
				 
			
 
				-extern void my_task (char, const char *, float *, int)
			
 
				+extern void my_task (signed char, const signed char *, float *, int)
			
 
				   __attribute__ ((task));
			
 
				 
			
 
				 /* One of the implementations of MY_TASK.  Since it's `extern', this should
			
 
				    not trigger generation of the codelet, wrapper, etc.  */
			
 
				-extern void my_task_cpu (char, const char *, float *, int)
			
 
				-  __attribute__ ((task_implementation ("cpu", my_task)));
			
 
				+extern void my_task_opencl (signed char, const signed char *, float *, int)
			
 
				+  __attribute__ ((task_implementation ("opencl", my_task)));
			
 
				 
			
 
				 #endif /* MY_LIB_H */
			
--- a/gcc-plugin/tests/pointer-tasks.c
+++ b/gcc-plugin/tests/pointer-tasks.c
@@ -1,5 +1,5 @@
 
				 /* GCC-StarPU
			
 
				-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
			
 
				 
			
 
				    GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				    it under the terms of the GNU General Public License as published by
			
@@ -28,17 +28,15 @@
 
				 static void my_pointer_task (const int *x, char a, long long *y, int b)
			
 
				   __attribute__ ((task));
			
 
				 
			
 
				-static void my_pointer_task_cpu (const int *x, char a, long long *y, int b)
			
 
				-  __attribute__ ((task_implementation ("cpu", my_pointer_task), noinline));
			
 
				-
			
 
				 static int implementations_called;
			
 
				 
			
 
				 /* The input arguments.  */
			
 
				 static const int pointer_arg1[] = { 42, 1, 2, 3, 4, 5 };
			
 
				 static long long *pointer_arg2;
			
 
				 
			
 
				+/* CPU implementation of `my_pointer_task'.  */
			
 
				 static void
			
 
				-my_pointer_task_cpu (const int *x, char a, long long *y, int b)
			
 
				+my_pointer_task (const int *x, char a, long long *y, int b)
			
 
				 {
			
 
				   implementations_called |= STARPU_CPU;
			
 
				   assert (x == pointer_arg1);
			
--- a/gcc-plugin/tests/scalar-tasks.c
+++ b/gcc-plugin/tests/scalar-tasks.c
@@ -1,5 +1,5 @@
 
				 /* GCC-StarPU
			
 
				-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
			
 
				 
			
 
				    GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				    it under the terms of the GNU General Public License as published by
			
@@ -26,15 +26,14 @@
 
				 
			
 
				 static void my_scalar_task (int x, int y) __attribute__ ((task));
			
 
				 
			
 
				-static void my_scalar_task_cpu (int, int)
			
 
				-  __attribute__ ((task_implementation ("cpu", my_scalar_task), noinline));
			
 
				 static void my_scalar_task_opencl (int, int)
			
 
				   __attribute__ ((task_implementation ("opencl", my_scalar_task), noinline));
			
 
				 
			
 
				 static int implementations_called;
			
 
				 
			
 
				+/* CPU implementation of `my_scalar_task'.  */
			
 
				 static void
			
 
				-my_scalar_task_cpu (int x, int y)
			
 
				+my_scalar_task (int x, int y)
			
 
				 {
			
 
				   implementations_called |= STARPU_CPU;
			
 
				   assert (x == 42);
			
--- a/gcc-plugin/tests/task-errors.c
+++ b/gcc-plugin/tests/task-errors.c
@@ -18,7 +18,8 @@
 
				 
			
 
				 extern void my_external_task (int foo, char *bar) __attribute__ ((task));
			
 
				 
			
 
				-static void my_task (int foo, char *bar) __attribute__ ((task));
			
 
				+void my_task (int foo, char *bar) /* (error "none of the implementations") */
			
 
				+  __attribute__ ((task));
			
 
				 static void my_task_cpu (int foo, float *bar)    /* (error "type differs") */
			
 
				   __attribute__ ((task_implementation ("cpu", my_task)));
			
 
				 
			
@@ -48,9 +49,6 @@ static void my_task_wrong_task_arg (int foo, char *bar)   /* (error "not a funct
 
				 static void my_task_wrong_target_arg (int foo, char *bar) /* (error "string constant expected") */
			
 
				   __attribute__ ((task_implementation (123, my_task)));
			
 
				 
			
 
				-static void my_task_with_a_body (int foo, char *bar)
			
 
				-  __attribute__ ((task, unused));
			
 
				-
			
 
				 extern int my_task_not_void (int foo) /* (error "return type") */
			
 
				   __attribute__ ((task));
			
 
				 
			
@@ -116,11 +114,6 @@ my_task_wrong_target_arg (int foo, char *bar)
 
				 {
			
 
				 }
			
 
				 
			
 
				-static void
			
 
				-my_task_with_a_body (int foo, char *bar)  /* (error "must not have a body") */
			
 
				-{
			
 
				-}
			
 
				-
			
 
				 void
			
 
				 my_task_that_invokes_task_cpu (int x, char *y)
			
 
				 {
			
--- a/include/pthread_win32/pthread.h
+++ b/include/pthread_win32/pthread.h
@@ -0,0 +1,410 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) Université Bordeaux 1, CNRS 2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* This is a minimal pthread implementation based on windows functions.
			
 
				+ * It is *not* intended to be complete - just complete enough to get
			
 
				+ * StarPU running.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_PTHREAD_H__
			
 
				+#define __STARPU_PTHREAD_H__
			
 
				+
			
 
				+/* TODO:
			
 
				+ * pthread_rwlock_*
			
 
				+ * pthread_spinlock_*
			
 
				+ */
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif /* __cplusplus */
			
 
				+
			
 
				+#include <windows.h>
			
 
				+#undef interface
			
 
				+#include <stdio.h>
			
 
				+#include <errno.h>
			
 
				+
			
 
				+#ifdef __CYGWIN32__
			
 
				+#include <sys/cygwin.h>
			
 
				+#define unixErrno() cygwin_internal(CW_GET_ERRNO_FROM_WINERROR, (GetLastError())
			
 
				+#else
			
 
				+#define unixErrno() EIO
			
 
				+#endif
			
 
				+#if 0
			
 
				+#define setSystemErrno() do { fprintf(stderr,"%s:%d: win %d\n", __FILE__, __LINE__, GetLastError()); errno = unixErrno(); } while (0)
			
 
				+#define winPthreadAssertWindows(expr) do { if (!(expr)) { fprintf(stderr,"%s:%d: %d\n", __FILE__, __LINE__, unixErrno()); return unixErrno(); } } while (0)
			
 
				+#define winPthreadAssertPthread(expr) do { int ret = (expr); if (ret) { fprintf(stderr,"%s:%d: %d\n", __FILE__, __LINE__, ret); return ret; } } while (0)
			
 
				+#define winPthreadAssert(expr) do { if (!(expr)) { fprintf(stderr,"%s:%d: %d\n", __FILE__, __LINE__, errno); return EIO; } } while (0)
			
 
				+#else
			
 
				+#define setSystemErrno() errno = unixErrno()
			
 
				+#define winPthreadAssertWindows(expr) do { if (!(expr)) { return unixErrno(); } } while (0)
			
 
				+#define winPthreadAssertPthread(expr) do { int ret = (expr); if (ret) return ret; } while (0)
			
 
				+#define winPthreadAssert(expr) do { if (!(expr)) return EIO; } while (0)
			
 
				+#endif
			
 
				+#if 0
			
 
				+#else
			
 
				+#endif
			
 
				+
			
 
				+/***********
			
 
				+ * threads *
			
 
				+ ***********/
			
 
				+
			
 
				+typedef DWORD pthread_attr_t;
			
 
				+typedef HANDLE pthread_t;
			
 
				+
			
 
				+static inline pthread_t pthread_self(void) {
			
 
				+  return GetCurrentThread();
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_equal(pthread_t t1, pthread_t t2) {
			
 
				+  return t1 == t2;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_attr_init (pthread_attr_t *attr) {
			
 
				+  *attr = 0;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+#define PTHREAD_CREATE_DETACHED 1
			
 
				+static inline int pthread_attr_setdetachstate (pthread_attr_t *attr, int yes) {
			
 
				+  (void)attr;
			
 
				+  (void)yes;
			
 
				+  /* not supported, ignore */
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_attr_setstacksize (pthread_attr_t *attr, size_t stacksize) {
			
 
				+  (void)attr;
			
 
				+  (void)stacksize;
			
 
				+  /* not supported, ignore */
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_attr_destroy (pthread_attr_t *attr) {
			
 
				+  (void)attr;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+/* "real" cleanup handling not yet implemented */
			
 
				+typedef struct {
			
 
				+  void (*routine) (void *);
			
 
				+  void *arg;
			
 
				+} __pthread_cleanup_handler;
			
 
				+
			
 
				+void pthread_cleanup_push (void (*routine) (void *), void *arg);
			
 
				+#define pthread_cleanup_push(routine, arg) do { \
			
 
				+  __pthread_cleanup_handler __cleanup_handler = {routine, arg};
			
 
				+
			
 
				+void pthread_cleanup_pop (int execute);
			
 
				+#define pthread_cleanup_pop(execute) \
			
 
				+  if (execute) __cleanup_handler.routine(__cleanup_handler.arg); \
			
 
				+} while (0);
			
 
				+
			
 
				+static inline int pthread_create (
			
 
				+  pthread_t *thread, const pthread_attr_t *attr,
			
 
				+  void * (*fun) (void *), void *arg
			
 
				+) {
			
 
				+  if (attr && *attr)
			
 
				+    return EINVAL;
			
 
				+  winPthreadAssertWindows(*thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) fun, arg, 0, NULL));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_setcancelstate (int state, int *oldstate) {
			
 
				+  (void)state;
			
 
				+  (void)oldstate;
			
 
				+  /* not yet implemented :( */
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_cancel (pthread_t thread) {
			
 
				+  /* This is quite harsh :( */
			
 
				+  winPthreadAssertWindows(TerminateThread(thread, 0));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline void pthread_exit (void *res) {
			
 
				+  ExitThread((DWORD) res);
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_join (pthread_t thread, void **res) {
			
 
				+again:
			
 
				+  switch (WaitForSingleObject(thread, INFINITE)) {
			
 
				+    default:
			
 
				+    case WAIT_FAILED:
			
 
				+      return unixErrno();
			
 
				+    case WAIT_ABANDONED:
			
 
				+    case WAIT_OBJECT_0:
			
 
				+      break;
			
 
				+    case WAIT_TIMEOUT:
			
 
				+      goto again;
			
 
				+  }
			
 
				+  if (res) {
			
 
				+    DWORD _res;
			
 
				+    if (GetExitCodeThread(thread, &_res))
			
 
				+      *res = (void *)_res;
			
 
				+  }
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+/***********
			
 
				+ * mutexes *
			
 
				+ ***********/
			
 
				+
			
 
				+#define PTHREAD_MUTEX_INITIALIZER NULL
			
 
				+typedef HANDLE pthread_mutex_t;
			
 
				+#define PTHREAD_MUTEX_RECURSIVE 1
			
 
				+#define PTHREAD_MUTEX_ERRORCHECK 2
			
 
				+typedef int pthread_mutexattr_t;
			
 
				+
			
 
				+static inline int pthread_mutexattr_init(pthread_mutexattr_t *attr) {
			
 
				+  *attr = PTHREAD_MUTEX_RECURSIVE;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type) {
			
 
				+  if (type != PTHREAD_MUTEX_RECURSIVE && type != PTHREAD_MUTEX_ERRORCHECK)
			
 
				+    return EINVAL;
			
 
				+  *attr = type;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutex_init (pthread_mutex_t *mutex, pthread_mutexattr_t *attr) {
			
 
				+  if (attr && *attr!=PTHREAD_MUTEX_RECURSIVE)
			
 
				+    return EINVAL;
			
 
				+  winPthreadAssertWindows(*mutex = CreateMutex(NULL, FALSE, NULL));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutex_unlock (pthread_mutex_t *mutex) {
			
 
				+  winPthreadAssertWindows(ReleaseMutex(*mutex));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutex_lock (pthread_mutex_t *mutex);
			
 
				+static inline int __pthread_mutex_alloc_concurrently (pthread_mutex_t *mutex) {
			
 
				+  HANDLE mutex_init_mutex;
			
 
				+  /* Get access to one global named mutex to serialize mutex initialization */
			
 
				+  winPthreadAssertWindows((mutex_init_mutex = CreateMutex(NULL, FALSE, "StarPU mutex init")));
			
 
				+  winPthreadAssertPthread(pthread_mutex_lock(&mutex_init_mutex));
			
 
				+  /* Now we are the one that can initialize it */
			
 
				+  if (!*mutex)
			
 
				+    winPthreadAssertPthread(pthread_mutex_init(mutex,NULL));
			
 
				+  winPthreadAssertPthread(pthread_mutex_unlock(&mutex_init_mutex));
			
 
				+  winPthreadAssertWindows(CloseHandle(mutex_init_mutex));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutex_lock (pthread_mutex_t *mutex) {
			
 
				+  if (!*mutex)
			
 
				+    __pthread_mutex_alloc_concurrently (mutex);
			
 
				+again:
			
 
				+  switch (WaitForSingleObject(*mutex, INFINITE)) {
			
 
				+    default:
			
 
				+    case WAIT_FAILED:
			
 
				+      return unixErrno();;
			
 
				+    case WAIT_ABANDONED:
			
 
				+    case WAIT_OBJECT_0:
			
 
				+      return 0;
			
 
				+    case WAIT_TIMEOUT:
			
 
				+      goto again;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutex_trylock (pthread_mutex_t *mutex) {
			
 
				+  if (!*mutex)
			
 
				+    __pthread_mutex_alloc_concurrently (mutex);
			
 
				+  switch (WaitForSingleObject(*mutex, 0)) {
			
 
				+    default:
			
 
				+    case WAIT_FAILED:
			
 
				+      return unixErrno();
			
 
				+    case WAIT_ABANDONED:
			
 
				+    case WAIT_OBJECT_0:
			
 
				+      return 0;
			
 
				+    case WAIT_TIMEOUT:
			
 
				+      return EBUSY;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_mutex_destroy (pthread_mutex_t *mutex) {
			
 
				+  winPthreadAssertWindows(CloseHandle(*mutex));
			
 
				+  *mutex = INVALID_HANDLE_VALUE;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+/********************************************
			
 
				+ * rwlock                                   *
			
 
				+ * VERY LAZY, don't even look at it please! *
			
 
				+ * Should be fine unoptimized for now.      *
			
 
				+ * TODO: FIXME, using conds for instance?   *
			
 
				+ ********************************************/
			
 
				+
			
 
				+#define PTHREAD_RWLOCK_INITIALIZER NULL
			
 
				+typedef pthread_mutex_t pthread_rwlock_t;
			
 
				+#define pthread_rwlock_init(lock, attr) pthread_mutex_init(lock, NULL)
			
 
				+#define pthread_rwlock_wrlock(lock) pthread_mutex_lock(lock)
			
 
				+#define pthread_rwlock_rdlock(lock) pthread_mutex_lock(lock)
			
 
				+#define pthread_rwlock_unlock(lock) pthread_mutex_unlock(lock)
			
 
				+#define pthread_rwlock_destroy(lock) pthread_mutex_destroy(lock)
			
 
				+
			
 
				+/**************
			
 
				+ * conditions *
			
 
				+ **************/
			
 
				+
			
 
				+typedef struct {
			
 
				+  HANDLE sem;
			
 
				+  volatile unsigned nbwait;
			
 
				+} pthread_cond_t;
			
 
				+#define PTHREAD_COND_INITIALIZER { NULL, 0}
			
 
				+
			
 
				+#ifndef STARPU_TIMESPEC_DEFINED
			
 
				+#define STARPU_TIMESPEC_DEFINED 1
			
 
				+struct timespec {
			
 
				+  time_t  tv_sec;  /* Seconds */
			
 
				+  long    tv_nsec; /* Nanoseconds */
			
 
				+};
			
 
				+#endif /* STARPU_TIMESPEC_DEFINED */
			
 
				+
			
 
				+typedef unsigned pthread_condattr_t;
			
 
				+
			
 
				+static inline int pthread_cond_init (pthread_cond_t *cond, const pthread_condattr_t *attr) {
			
 
				+  if (attr)
			
 
				+    return EINVAL;
			
 
				+  winPthreadAssertWindows(cond->sem = CreateSemaphore(NULL, 0, MAXLONG, NULL));
			
 
				+  cond->nbwait = 0;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *time) {
			
 
				+  if (!cond->sem)
			
 
				+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
			
 
				+  cond->nbwait++;
			
 
				+  winPthreadAssertPthread(pthread_mutex_unlock(mutex));
			
 
				+again:
			
 
				+  switch (WaitForSingleObject(cond->sem, time->tv_sec*1000+time->tv_nsec/1000)) {
			
 
				+    default:
			
 
				+    case WAIT_FAILED:
			
 
				+    {
			
 
				+      int error = unixErrno();
			
 
				+      winPthreadAssertPthread(pthread_mutex_lock(mutex));
			
 
				+      return error;
			
 
				+    }
			
 
				+    case WAIT_TIMEOUT:
			
 
				+      goto again;
			
 
				+    case WAIT_ABANDONED:
			
 
				+    case WAIT_OBJECT_0:
			
 
				+      break;
			
 
				+  }
			
 
				+  winPthreadAssertPthread(pthread_mutex_lock(mutex));
			
 
				+  cond->nbwait--;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex) {
			
 
				+  if (!cond->sem)
			
 
				+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
			
 
				+  cond->nbwait++;
			
 
				+  winPthreadAssertPthread(pthread_mutex_unlock(mutex));
			
 
				+again:
			
 
				+  switch (WaitForSingleObject(cond->sem, INFINITE)) {
			
 
				+    case WAIT_FAILED:
			
 
				+    {
			
 
				+      int error;
			
 
				+      error = unixErrno();
			
 
				+      winPthreadAssertPthread(pthread_mutex_lock(mutex));
			
 
				+      return error;
			
 
				+    }
			
 
				+    case WAIT_TIMEOUT:
			
 
				+      goto again;
			
 
				+    case WAIT_ABANDONED:
			
 
				+    case WAIT_OBJECT_0:
			
 
				+      break;
			
 
				+  }
			
 
				+  winPthreadAssertPthread(pthread_mutex_lock(mutex));
			
 
				+  cond->nbwait--;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_cond_signal (pthread_cond_t *cond) {
			
 
				+  if (!cond->sem)
			
 
				+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
			
 
				+  if (cond->nbwait)
			
 
				+    ReleaseSemaphore(cond->sem, 1, NULL);
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_cond_broadcast (pthread_cond_t *cond) {
			
 
				+  if (!cond->sem)
			
 
				+    winPthreadAssertPthread(pthread_cond_init(cond,NULL));
			
 
				+  ReleaseSemaphore(cond->sem, cond->nbwait, NULL);
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_cond_destroy (pthread_cond_t *cond) {
			
 
				+  if (cond->sem) {
			
 
				+    winPthreadAssertWindows(CloseHandle(cond->sem));
			
 
				+    cond->sem = NULL;
			
 
				+  }
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+/*******
			
 
				+ * TLS *
			
 
				+ *******/
			
 
				+
			
 
				+typedef DWORD pthread_key_t;
			
 
				+#define PTHREAD_ONCE_INIT {PTHREAD_MUTEX_INITIALIZER, 0}
			
 
				+typedef struct {
			
 
				+  pthread_mutex_t mutex;
			
 
				+  unsigned done;
			
 
				+} pthread_once_t;
			
 
				+
			
 
				+static inline int pthread_once (pthread_once_t *once, void (*oncefun)(void)) {
			
 
				+  winPthreadAssertPthread(pthread_mutex_lock(&once->mutex));
			
 
				+  if (!once->done) {
			
 
				+    oncefun();
			
 
				+    once->done = 1;
			
 
				+  }
			
 
				+  winPthreadAssertPthread(pthread_mutex_unlock(&once->mutex));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_key_create (pthread_key_t *key, void (*freefun)(void *)) {
			
 
				+  (void)freefun;
			
 
				+  DWORD res;
			
 
				+  winPthreadAssertWindows((res = TlsAlloc()) != 0xFFFFFFFF);
			
 
				+  *key = res;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_key_delete (pthread_key_t key) {
			
 
				+  winPthreadAssertWindows(TlsFree(key));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline void *pthread_getspecific (pthread_key_t key) {
			
 
				+  return TlsGetValue(key);
			
 
				+}
			
 
				+
			
 
				+static inline int pthread_setspecific (pthread_key_t key, const void *data) {
			
 
				+  winPthreadAssertWindows(TlsSetValue(key, (LPVOID) data));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif /* __cplusplus */
			
 
				+
			
 
				+#endif /* __STARPU_PTHREAD_H__ */
			
--- a/include/pthread_win32/semaphore.h
+++ b/include/pthread_win32/semaphore.h
@@ -0,0 +1,67 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) Université Bordeaux 1, CNRS 2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* This is a minimal pthread implementation based on windows functions.
			
 
				+ * It is *not* intended to be complete - just complete enough to get
			
 
				+ * StarPU running.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_SEMAPHORE_H__
			
 
				+#define __STARPU_SEMAPHORE_H__
			
 
				+
			
 
				+#include "pthread.h"
			
 
				+
			
 
				+/**************
			
 
				+ * semaphores *
			
 
				+ **************/
			
 
				+
			
 
				+typedef HANDLE sem_t;
			
 
				+
			
 
				+static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
			
 
				+  (void)pshared;
			
 
				+  winPthreadAssertWindows(*sem = CreateSemaphore(NULL, value, MAXLONG, NULL));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int do_sem_wait(sem_t *sem, DWORD timeout) {
			
 
				+  switch (WaitForSingleObject(*sem, timeout)) {
			
 
				+    default:
			
 
				+    case WAIT_FAILED:
			
 
				+      setSystemErrno();
			
 
				+      return -1;
			
 
				+    case WAIT_TIMEOUT:
			
 
				+      errno = EAGAIN;
			
 
				+      return -1;
			
 
				+    case WAIT_ABANDONED:
			
 
				+    case WAIT_OBJECT_0:
			
 
				+      return 0;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+#define sem_wait(sem) do_sem_wait(sem, INFINITE)
			
 
				+#define sem_trywait(sem) do_sem_wait(sem, 0)
			
 
				+
			
 
				+static inline int sem_post(sem_t *sem) {
			
 
				+  winPthreadAssertWindows(ReleaseSemaphore(*sem, 1, NULL));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int sem_destroy(sem_t *sem) {
			
 
				+  winPthreadAssertWindows(CloseHandle(*sem));
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+#endif /* __STARPU_SEMAPHORE_H__ */
			
--- a/include/starpu_deprecated_api.h
+++ b/include/starpu_deprecated_api.h
@@ -23,7 +23,7 @@ extern "C"
 
				 {
			
 
				 #endif
			
 
				 
			
 
				-#warning "Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh"
			
 
				+#warning Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh.
			
 
				 
			
 
				 typedef starpu_data_handle_t starpu_data_handle;
			
 
				 typedef struct starpu_block_interface starpu_block_interface_t;
			
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -94,13 +94,9 @@ int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
				 
			
 
				 cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
			
 
				 
			
 
				-cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event);
			
 
				+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
			
 
				 
			
 
				-cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event);
			
 
				-
			
 
				-cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
			
 
				-
			
 
				-cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
			
 
				+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/socl/src/gc.c
+++ b/socl/src/gc.c
@@ -61,7 +61,8 @@ static void * gc_thread_routine(void *UNUSED(arg)) {
 
				 
			
 
				       /* Release entity */
			
 
				       entity next = r->next;
			
 
				-      free(r);
			
 
				+#warning FIXME: free memory
			
 
				+//      free(r);
			
 
				 
			
 
				       r = next;
			
 
				     }
			
--- a/socl/src/task.c
+++ b/socl/src/task.c
@@ -120,10 +120,13 @@ static void cputask_task(__attribute__((unused)) void *descr[], void *args) {
 
				 
			
 
				   arg->callback(arg->arg);
			
 
				 
			
 
				+#warning FIXME: free memory
			
 
				+/*
			
 
				   if (arg->free_arg)
			
 
				     free(arg->arg);
			
 
				 
			
 
				   free(arg);
			
 
				+*/
			
 
				 }
			
 
				 
			
 
				 static struct starpu_codelet cputask_codelet = {
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -339,8 +339,6 @@ void _starpu_task_check_deprecated_fields(struct starpu_task *task)
 
				 				task->handles[i] = task->buffers[i].handle;
			
 
				 				task->cl->modes[i] = task->buffers[i].mode;
			
 
				 			}
			
 
				-			task->buffers[i].handle = NULL;
			
 
				-			task->buffers[i].mode = STARPU_NONE;
			
 
				 		}
			
 
				 	}
			
 
				 }
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -300,13 +300,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config)
 
				 		{
			
 
				 			/* Nothing was specified, so let's choose ! */
			
 
				 			ncuda = nb_devices;
			
 
				-			if (ncuda > STARPU_MAXCUDADEVS)
			
 
				-			{
			
 
				-				fprintf(stderr,
			
 
				-					"# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
			
 
				-					nb_devices, STARPU_MAXCUDADEVS);
			
 
				-				ncuda = STARPU_MAXCUDADEVS;
			
 
				-			}
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -318,14 +311,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config)
 
				 					ncuda, nb_devices);
			
 
				 				ncuda = nb_devices;
			
 
				 			}
			
 
				-			/* Let's make sure this value is OK. */
			
 
				-			if (ncuda > STARPU_MAXCUDADEVS)
			
 
				-			{
			
 
				-				fprintf(stderr,
			
 
				-					"# Warning: %d CUDA devices requested. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
			
 
				-					ncuda, STARPU_MAXCUDADEVS);
			
 
				-				ncuda = STARPU_MAXCUDADEVS;
			
 
				-			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -683,6 +683,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 
				 	return 0;
			
 
				 
			
 
				 enomem:
			
 
				+	_STARPU_TRACE_END_FETCH_INPUT(NULL);
			
 
				 	/* try to unreference all the input that were successfully taken */
			
 
				 	/* XXX broken ... */
			
 
				 	_STARPU_DISP("something went wrong with buffer %u\n", index);
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -350,6 +350,7 @@ fail_rowptr:
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			free((void *)addr_colind);
			
 
				+			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 			cudaFree((void*)addr_colind);
			
@@ -369,6 +370,7 @@ fail_colind:
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			free((void *)addr_nzval);
			
 
				+			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 			cudaFree((void*)addr_nzval);
			
@@ -483,15 +485,15 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
				 
			
 
				         int err;
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -514,15 +516,15 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
				 
			
 
				         int err;
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -621,9 +621,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 		/* Is that a single contiguous buffer ? */
			
 
				 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
			
 
				 		{
			
 
				-                        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
			
 
				-                                                                           src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				-                                                                           dst_block->offset, (cl_event*)_event, &ret);
			
 
				+                        err = starpu_opencl_copy_ram_to_opencl((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
			
 
				+							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				+							       dst_block->offset, (cl_event*)_event, &ret);
			
 
				                         if (STARPU_UNLIKELY(err))
			
 
				                                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 }
			
@@ -644,9 +644,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 			{
			
 
				                                 void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
			
 
				                                 err = starpu_opencl_copy_ram_to_opencl(ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
			
 
				-                                                                        src_block->nx*src_block->elemsize,
			
 
				-                                                                        layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
			
 
				-                                                                        + dst_block->offset, NULL);
			
 
				+								       src_block->nx*src_block->elemsize,
			
 
				+								       layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
			
 
				+								       + dst_block->offset, NULL, NULL);
			
 
				                                 if (STARPU_UNLIKELY(err))
			
 
				                                         STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                         }
			
@@ -687,9 +687,9 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 		/* Is that a single contiguous buffer ? */
			
 
				 		if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
			
 
				 		{
			
 
				-                        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
			
 
				-                                                                           src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				-                                                                           src_block->offset, (cl_event*)_event, &ret);
			
 
				+                        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
			
 
				+							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				+							       src_block->offset, (cl_event*)_event, &ret);
			
 
				                         if (STARPU_UNLIKELY(err))
			
 
				                                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 }
			
@@ -713,7 +713,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				                                 err = starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, src_node, ptr, dst_node,
			
 
				                                                                         src_block->nx*src_block->elemsize,
			
 
				                                                                         layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
			
 
				-                                                                        src_block->offset, NULL);
			
 
				+								       src_block->offset, NULL, NULL);
			
 
				 				if (STARPU_UNLIKELY(err))
			
 
				 					STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                         }
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -324,6 +324,7 @@ fail_rowptr:
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			free((void *)addr_colind);
			
 
				+			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 		{
			
@@ -348,6 +349,7 @@ fail_colind:
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			free((void *)addr_nzval);
			
 
				+			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 		{
			
@@ -666,15 +668,15 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
				 
			
 
				         int err;
			
 
				 
			
 
				-        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
			
 
				+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
			
 
				+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -694,15 +696,15 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
				 
			
 
				         int err;
			
 
				 
			
 
				-        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
			
 
				+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL, NULL);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
			
 
				+        err = starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -607,9 +607,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 
			
 
				 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
			
 
				-                                                           src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
 
				-                                                           dst_matrix->offset, (cl_event*)_event, &ret);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
			
 
				+					       src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
 
				+					       dst_matrix->offset, (cl_event*)_event, &ret);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -626,9 +626,9 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 
			
 
				 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				 
			
 
				-        err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
			
 
				-                                                           src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
 
				-                                                           src_matrix->offset, (cl_event*)_event, &ret);
			
 
				+        err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
			
 
				+					       src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
 
				+					       src_matrix->offset, (cl_event*)_event, &ret);
			
 
				 
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -637,14 +637,14 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 
				 	size = src_multiformat->nx * src_multiformat->ops->opencl_elemsize;
			
 
				 
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_multiformat->cpu_ptr,
			
 
				-							   src_node,
			
 
				-							   (cl_mem) dst_multiformat->cpu_ptr,
			
 
				-							   dst_node,
			
 
				-							   size,
			
 
				-							   0,
			
 
				-							   (cl_event *) _event,
			
 
				-							   &ret);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl(src_multiformat->cpu_ptr,
			
 
				+					       src_node,
			
 
				+					       (cl_mem) dst_multiformat->cpu_ptr,
			
 
				+					       dst_node,
			
 
				+					       size,
			
 
				+					       0,
			
 
				+					       (cl_event *) _event,
			
 
				+					       &ret);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -675,14 +675,14 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 
				 		/* XXX : it is weird that we might have to allocate memory here... */
			
 
				 		dst_multiformat->opencl_ptr = malloc(dst_multiformat->nx * dst_multiformat->ops->opencl_elemsize);
			
 
				 	}
			
 
				-	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_multiformat->opencl_ptr,
			
 
				-							   src_node,
			
 
				-							   dst_multiformat->opencl_ptr,
			
 
				-							   dst_node,
			
 
				-							   size,
			
 
				-                                                           0,
			
 
				-							   (cl_event *)_event,
			
 
				-							   &ret);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_multiformat->opencl_ptr,
			
 
				+					       src_node,
			
 
				+					       dst_multiformat->opencl_ptr,
			
 
				+					       dst_node,
			
 
				+					       size,
			
 
				+					       0,
			
 
				+					       (cl_event *)_event,
			
 
				+					       &ret);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -440,8 +440,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 	struct starpu_variable_interface *dst_variable = dst_interface;
			
 
				         int err,ret;
			
 
				 
			
 
				-        err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
			
 
				-                                                           0, (cl_event*)_event, &ret);
			
 
				+        err = starpu_opencl_copy_ram_to_opencl((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
			
 
				+					       0, (cl_event*)_event, &ret);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -456,8 +456,8 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 	struct starpu_variable_interface *dst_variable = dst_interface;
			
 
				         int err, ret;
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
			
 
				-                                                           0, (cl_event*)_event, &ret);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
			
 
				+					       0, (cl_event*)_event, &ret);
			
 
				 
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -483,9 +483,9 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 	struct starpu_vector_interface *dst_vector = dst_interface;
			
 
				         int err, ret;
			
 
				 
			
 
				-	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
			
 
				-                                                           src_vector->nx*src_vector->elemsize,
			
 
				-                                                           dst_vector->offset, (cl_event*)_event, &ret);
			
 
				+	err = starpu_opencl_copy_ram_to_opencl((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
			
 
				+					       src_vector->nx*src_vector->elemsize,
			
 
				+					       dst_vector->offset, (cl_event*)_event, &ret);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -501,8 +501,8 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 	struct starpu_vector_interface *dst_vector = dst_interface;
			
 
				         int err, ret;
			
 
				 
			
 
				-	err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
			
 
				-                                                           src_vector->offset, (cl_event *)_event, &ret);
			
 
				+	err = starpu_opencl_copy_opencl_to_ram((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
			
 
				+					       src_vector->offset, (cl_event *)_event, &ret);
			
 
				         if (STARPU_UNLIKELY(err))
			
 
				                 STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -165,6 +165,10 @@ static void deinit_context(int workerid, int devid)
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 }
			
 
				 
			
 
				+
			
 
				+/* Return the number of devices usable in the system.
			
 
				+ * The value returned cannot be greater than MAXCUDADEVS */
			
 
				+
			
 
				 unsigned _starpu_get_cuda_device_count(void)
			
 
				 {
			
 
				 	int cnt;
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -203,8 +203,9 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flag
 
				 	 */
			
 
				 	char dummy = 0;
			
 
				 	err = clEnqueueWriteBuffer(queues[worker->devid], memory, CL_TRUE,
			
 
				-				0, sizeof(dummy), &dummy,
			
 
				-				0, NULL, NULL);
			
 
				+				   0, sizeof(dummy), &dummy,
			
 
				+				   0, NULL, NULL);
			
 
				+	clFinish(queues[worker->devid]);
			
 
				 	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
			
 
				 		return err;
			
 
				 	if (err != CL_SUCCESS)
			
@@ -214,111 +215,54 @@ cl_int starpu_opencl_allocate_memory(cl_mem *mem, size_t size, cl_mem_flags flag
 
				         return CL_SUCCESS;
			
 
				 }
			
 
				 
			
 
				-cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
			
 
				+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
			
 
				 {
			
 
				         cl_int err;
			
 
				         struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				-        cl_bool blocking;
			
 
				-
			
 
				-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				 
			
 
				         if (event)
			
 
				                 _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
			
 
				         if (event)
			
 
				                 _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				         if (STARPU_LIKELY(err == CL_SUCCESS))
			
 
				 	{
			
 
				-                *ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				-                return CL_SUCCESS;
			
 
				-        }
			
 
				-        else
			
 
				-	{
			
 
				-                if (event != NULL)
			
 
				-		{
			
 
				-                        /* The asynchronous copy has failed, try to copy synchronously */
			
 
				-                        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				-                }
			
 
				-                if (STARPU_LIKELY(err == CL_SUCCESS))
			
 
				+		if (event == NULL)
			
 
				 		{
			
 
				-                        *ret = 0;
			
 
				-                        return CL_SUCCESS;
			
 
				-                }
			
 
				-                else
			
 
				+			/* We want a synchronous copy, let's synchronise the queue */
			
 
				+			clFinish(transfer_queues[worker->devid]);
			
 
				+		}
			
 
				+		if (ret)
			
 
				 		{
			
 
				-                        STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                        return err;
			
 
				-                }
			
 
				-        }
			
 
				-}
			
 
				-
			
 
				-cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
			
 
				-{
			
 
				-        cl_int err;
			
 
				-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				-        cl_bool blocking;
			
 
				-
			
 
				-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        if (event)
			
 
				-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				-        if (event)
			
 
				-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-        return CL_SUCCESS;
			
 
				+			*ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				+		}
			
 
				+	}
			
 
				+	return err;
			
 
				 }
			
 
				 
			
 
				-cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
			
 
				+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
			
 
				 {
			
 
				         cl_int err;
			
 
				         struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				-        cl_bool blocking;
			
 
				 
			
 
				-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				         if (event)
			
 
				                 _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, event);
			
 
				         if (event)
			
 
				                 _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				         if (STARPU_LIKELY(err == CL_SUCCESS))
			
 
				 	{
			
 
				-                *ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				-                return CL_SUCCESS;
			
 
				-        }
			
 
				-        else
			
 
				-	{
			
 
				-                if (event != NULL)
			
 
				-                        /* The asynchronous copy has failed, try to copy synchronously */
			
 
				-                        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				-                if (STARPU_LIKELY(err == CL_SUCCESS))
			
 
				+		if (event == NULL)
			
 
				 		{
			
 
				-                        *ret = 0;
			
 
				-                        return CL_SUCCESS;
			
 
				-                }
			
 
				-                else
			
 
				+			/* We want a synchronous copy, let's synchronise the queue */
			
 
				+			clFinish(transfer_queues[worker->devid]);
			
 
				+		}
			
 
				+		if (ret)
			
 
				 		{
			
 
				-                        STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                        return err;
			
 
				-                }
			
 
				-        }
			
 
				-}
			
 
				-
			
 
				-cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
			
 
				-{
			
 
				-        cl_int err;
			
 
				-        struct _starpu_worker *worker = _starpu_get_local_worker_key();
			
 
				-        cl_bool blocking;
			
 
				-
			
 
				-        blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        if (event)
			
 
				-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				-        if (event)
			
 
				-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-        return CL_SUCCESS;
			
 
				+			*ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				+		}
			
 
				+	}
			
 
				+	return err;
			
 
				 }
			
 
				 
			
 
				 #if 0
			
--- a/src/top/starpu_top_core.h
+++ b/src/top/starpu_top_core.h
@@ -15,6 +15,8 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <time.h>
			
 
				+
			
 
				 struct starpu_task;
			
 
				 
			
 
				 /*
			
--- a/tests/datawizard/acquire_release_opencl.c
+++ b/tests/datawizard/acquire_release_opencl.c
@@ -42,17 +42,7 @@ void increment_opencl(void *buffers[], void *args)
 
				 
			
 
				 	{
			
 
				 		size_t global=1;
			
 
				-		size_t local;
			
 
				-                size_t s;
			
 
				-                cl_device_id device;
			
 
				-
			
 
				-                starpu_opencl_get_device(devid, &device);
			
 
				-
			
 
				-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                if (local > global)
			
 
				-			local=global;
			
 
				+		size_t local=1;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/tests/datawizard/data_invalidation.c
+++ b/tests/datawizard/data_invalidation.c
@@ -70,15 +70,15 @@ static void opencl_memset_codelet(void *buffers[], void *args)
 
				 	memset(v, 42, length);
			
 
				 
			
 
				 	clEnqueueWriteBuffer(queue,
			
 
				-			buffer,
			
 
				-			CL_TRUE,
			
 
				-			0,      /* offset */
			
 
				-			length, /* sizeof (char) */
			
 
				-			v,
			
 
				-			0,      /* num_events_in_wait_list */
			
 
				-			NULL,   /* event_wait_list */
			
 
				-			NULL    /* event */);
			
 
				-			
			
 
				+			     buffer,
			
 
				+			     CL_FALSE,
			
 
				+			     0,      /* offset */
			
 
				+			     length, /* sizeof (char) */
			
 
				+			     v,
			
 
				+			     0,      /* num_events_in_wait_list */
			
 
				+			     NULL,   /* event_wait_list */
			
 
				+			     NULL    /* event */);
			
 
				+	clFinish(queue);
			
 
				 }
			
 
				 #endif /* !STARPU_USE_OPENCL */
			
 
				 
			
@@ -121,7 +121,7 @@ static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) vo
 
				 	{
			
 
				 		if (buf[i] != 42)
			
 
				 		{
			
 
				-			FPRINTF(stderr, "buf[%u] is %c while it should be %c\n", i, buf[i], 42);
			
 
				+			FPRINTF(stderr, "buf[%u] is '%c' while it should be '%c'\n", i, buf[i], 42);
			
 
				 			exit(-1);
			
 
				 		}
			
 
				 	}
			
--- a/tests/datawizard/gpu_register.c
+++ b/tests/datawizard/gpu_register.c
@@ -186,19 +186,18 @@ test_opencl(void)
 
				 		foo[i] = i;
			
 
				 
			
 
				 	err = clEnqueueWriteBuffer(queue,
			
 
				-				foo_gpu,
			
 
				-				CL_TRUE,
			
 
				-				0,
			
 
				-				size*sizeof(int),
			
 
				-				foo,
			
 
				-				0,
			
 
				-				NULL,
			
 
				-				NULL);
			
 
				+				   foo_gpu,
			
 
				+				   CL_FALSE,
			
 
				+				   0,
			
 
				+				   size*sizeof(int),
			
 
				+				   foo,
			
 
				+				   0,
			
 
				+				   NULL,
			
 
				+				   NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	clFinish(queue);
			
 
				 
			
 
				-
			
 
				-	
			
 
				 	starpu_vector_data_register(&handle,
			
 
				 				    starpu_worker_get_memory_node(chosen),
			
 
				 				    (uintptr_t)foo_gpu,
			
@@ -222,7 +221,7 @@ test_opencl(void)
 
				 	};
			
 
				 
			
 
				 	starpu_data_partition(handle, &f);
			
 
				-	
			
 
				+
			
 
				 	ret = submit_tasks(handle, pieces, n);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return -ENODEV;
			
@@ -235,16 +234,17 @@ test_opencl(void)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				 
			
 
				 	err = clEnqueueReadBuffer(queue,
			
 
				-		foo_gpu,
			
 
				-		CL_TRUE,
			
 
				-		0,
			
 
				-		size*sizeof(*foo),
			
 
				-		foo,
			
 
				-		0,
			
 
				-		NULL,
			
 
				-		NULL);
			
 
				+				  foo_gpu,
			
 
				+				  CL_FALSE,
			
 
				+				  0,
			
 
				+				  size*sizeof(*foo),
			
 
				+				  foo,
			
 
				+				  0,
			
 
				+				  NULL,
			
 
				+				  NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+	clFinish(queue);
			
 
				 	return check_result(foo, size);
			
 
				 }
			
 
				 #endif /* !STARPU_USE_OPENCL */
			
@@ -258,6 +258,11 @@ int main(int argc, char **argv)
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				+#endif
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
			
 
				 	ret = test_cuda();
			
--- a/tests/datawizard/interfaces/bcsr/bcsr_interface.c
+++ b/tests/datawizard/interfaces/bcsr/bcsr_interface.c
@@ -178,7 +178,7 @@ main(void)
 
				 		.nopencl = 1
			
 
				 	};
			
 
				 
			
 
				-	if (starpu_init(&conf) == -ENODEV)
			
 
				+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/interfaces/block/block_interface.c
+++ b/tests/datawizard/interfaces/block/block_interface.c
@@ -140,7 +140,7 @@ main(void)
 
				 		.nopencl = 1
			
 
				 	};
			
 
				 
			
 
				-	if (starpu_init(&conf) == -ENODEV)
			
 
				+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				 		goto enodev;
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/interfaces/csr/csr_interface.c
+++ b/tests/datawizard/interfaces/csr/csr_interface.c
@@ -148,7 +148,7 @@ main(void)
 
				 		.nopencl = 1
			
 
				 	};
			
 
				 
			
 
				-	if (starpu_init(&conf) == -ENODEV)
			
 
				+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				 		goto enodev;
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/interfaces/matrix/matrix_interface.c
+++ b/tests/datawizard/interfaces/matrix/matrix_interface.c
@@ -123,7 +123,7 @@ main(void)
 
				 		.nopencl = 1
			
 
				 	};
			
 
				 
			
 
				-	if (starpu_init(&conf) == -ENODEV)
			
 
				+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				 		goto enodev;
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release.c
+++ b/tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release.c
@@ -131,7 +131,7 @@ main(void)
 
				 	};
			
 
				 	memset(&global_stats, 0, sizeof(global_stats));
			
 
				 	ret = starpu_init(&conf);
			
 
				-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	register_handle();
			
--- a/tests/datawizard/interfaces/multiformat/multiformat_interface.c
+++ b/tests/datawizard/interfaces/multiformat/multiformat_interface.c
@@ -144,7 +144,8 @@ main(void)
 
				 	};
			
 
				 
			
 
				 	ret = starpu_init(&conf);
			
 
				-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/interfaces/variable/variable_interface.c
+++ b/tests/datawizard/interfaces/variable/variable_interface.c
@@ -101,7 +101,8 @@ main(void)
 
				 	};
			
 
				 
			
 
				 	ret = starpu_init(&conf);
			
 
				-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				 
			
 
				 	register_data();
			
 
				 
			
--- a/tests/datawizard/interfaces/vector/test_vector_interface.c
+++ b/tests/datawizard/interfaces/vector/test_vector_interface.c
@@ -110,7 +110,7 @@ main(void)
 
				 		.nopencl = 1
			
 
				 	};
			
 
				 
			
 
				-	if (starpu_init(&conf) == -ENODEV)
			
 
				+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				 		goto enodev;
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/interfaces/void/void_interface.c
+++ b/tests/datawizard/interfaces/void/void_interface.c
@@ -66,7 +66,7 @@ main(void)
 
				 		.nopencl = 1
			
 
				 	};
			
 
				 
			
 
				-	if (starpu_init(&conf) == -ENODEV)
			
 
				+	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
			
 
				 		goto enodev;
			
 
				 
			
 
				 	register_data();
			
--- a/tests/datawizard/lazy_allocation.c
+++ b/tests/datawizard/lazy_allocation.c
@@ -65,15 +65,15 @@ static void opencl_memset_codelet(void *buffers[], void *args)
 
				 	memset(v, 42, length);
			
 
				 
			
 
				 	clEnqueueWriteBuffer(queue,
			
 
				-			buffer,
			
 
				-			CL_TRUE,
			
 
				-			0,      /* offset */
			
 
				-			length, /* sizeof (char) */
			
 
				-			v,
			
 
				-			0,      /* num_events_in_wait_list */
			
 
				-			NULL,   /* event_wait_list */
			
 
				-			NULL    /* event */);
			
 
				-			
			
 
				+			     buffer,
			
 
				+			     CL_FALSE,
			
 
				+			     0,      /* offset */
			
 
				+			     length, /* sizeof (char) */
			
 
				+			     v,
			
 
				+			     0,      /* num_events_in_wait_list */
			
 
				+			     NULL,   /* event_wait_list */
			
 
				+			     NULL    /* event */);
			
 
				+	clFinish(queue);
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -160,19 +160,19 @@ static void opencl_check_content_codelet(void *buffers[], void *args)
 
				 	for (i = 0; i < length; i++)
			
 
				 	{
			
 
				 		char dst;
			
 
				-		clEnqueueReadBuffer(
			
 
				-			queue,
			
 
				-			buf,
			
 
				-			CL_TRUE,
			
 
				-			i * sizeof(dst),
			
 
				-			sizeof(dst),
			
 
				-			&dst,
			
 
				-			0,      /* num_events_in_wait_list */
			
 
				-			NULL,   /* event_wait_list */
			
 
				-			NULL    /* event */);
			
 
				+		clEnqueueReadBuffer(queue,
			
 
				+				    buf,
			
 
				+				    CL_FALSE,
			
 
				+				    i * sizeof(dst),
			
 
				+				    sizeof(dst),
			
 
				+				    &dst,
			
 
				+				    0,      /* num_events_in_wait_list */
			
 
				+				    NULL,   /* event_wait_list */
			
 
				+				    NULL    /* event */);
			
 
				+		clFinish(queue);
			
 
				 		if (dst != 42)
			
 
				 		{
			
 
				-			FPRINTF(stderr, "buf[%u] is %c while it should be %c\n", i, dst, 42);
			
 
				+			FPRINTF(stderr, "buf[%u] is '%c' while it should be '%c'\n", i, dst, 42);
			
 
				 			exit(-1);
			
 
				 		}
			
 
				 	}
			
--- a/tests/datawizard/manual_reduction.c
+++ b/tests/datawizard/manual_reduction.c
@@ -57,7 +57,8 @@ static void initialize_per_worker_handle(void *arg __attribute__((unused)))
 
				 			cl_mem ptr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(variable), NULL, NULL);
			
 
				 			/* Poor's man memset */
			
 
				 			unsigned zero = 0;
			
 
				-			clEnqueueWriteBuffer(queue, ptr, CL_TRUE, 0, sizeof(variable), (void *)&zero, 0, NULL, NULL);
			
 
				+			clEnqueueWriteBuffer(queue, ptr, CL_FALSE, 0, sizeof(variable), (void *)&zero, 0, NULL, NULL);
			
 
				+			clFinish(queue);
			
 
				 			per_worker[workerid] = (uintptr_t)ptr;
			
 
				 			}
			
 
				 
			
@@ -142,9 +143,11 @@ static void opencl_func_incr(void *descr[], void *cl_arg __attribute__((unused))
 
				 	cl_command_queue queue;
			
 
				 	starpu_opencl_get_current_queue(&queue);
			
 
				 
			
 
				-	clEnqueueReadBuffer(queue, d_val, CL_TRUE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
			
 
				+	clEnqueueReadBuffer(queue, d_val, CL_FALSE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
			
 
				+	clFinish(queue);
			
 
				 	h_val++;
			
 
				-	clEnqueueWriteBuffer(queue, d_val, CL_TRUE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
			
 
				+	clEnqueueWriteBuffer(queue, d_val, CL_FALSE, 0, sizeof(unsigned), (void *)&h_val, 0, NULL, NULL);
			
 
				+	clFinish(queue);
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/mpi_like.c
+++ b/tests/datawizard/mpi_like.c
@@ -138,7 +138,7 @@ static void send_handle(struct thread_data *thread_data)
 
				 	/* wait until it's received (ie. neighbour's recv_flag is set back to 0) */
			
 
				 	while (neighbour_data->recv_flag)
			
 
				 		_STARPU_PTHREAD_COND_WAIT(&neighbour_data->recv_cond, &neighbour_data->recv_mutex);
			
 
				-	
			
 
				+
			
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&neighbour_data->recv_mutex);
			
 
				 
			
 
				 	starpu_data_release(thread_data->handle);
			
@@ -159,7 +159,7 @@ static void *thread_func(void *arg)
 
				 		{
			
 
				 			recv_handle(thread_data);
			
 
				 		}
			
 
				-		
			
 
				+
			
 
				 		increment_handle(thread_data);
			
 
				 
			
 
				 		if (!((index == (NTHREADS - 1)) && (iter == (NITER - 1))))
			
--- a/tests/datawizard/opencl_codelet_unsigned_inc.c
+++ b/tests/datawizard/opencl_codelet_unsigned_inc.c
@@ -44,17 +44,7 @@ void opencl_codelet_unsigned_inc(void *buffers[], void *args)
 
				 
			
 
				 	{
			
 
				 		size_t global=1;
			
 
				-		size_t local;
			
 
				-                size_t s;
			
 
				-                cl_device_id device;
			
 
				-
			
 
				-                starpu_opencl_get_device(devid, &device);
			
 
				-
			
 
				-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                if (local > global)
			
 
				-			local=global;
			
 
				+		size_t local=1;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/tests/datawizard/opencl_codelet_unsigned_inc_kernel.cl
+++ b/tests/datawizard/opencl_codelet_unsigned_inc_kernel.cl
@@ -14,7 +14,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-__kernel void _opencl_unsigned_inc(__global unsigned int *val)
			
 
				+__kernel void _opencl_unsigned_inc(__global unsigned *val)
			
 
				 {
			
 
				 	val[0]++;
			
 
				 }
			
--- a/tests/datawizard/reclaim.c
+++ b/tests/datawizard/reclaim.c
@@ -30,11 +30,12 @@
 
				 
			
 
				 #ifdef STARPU_SLOW_MACHINE
			
 
				 #  define BLOCK_SIZE (64*1024)
			
 
				+static unsigned ntasks = 250;
			
 
				 #else
			
 
				 #  define BLOCK_SIZE (64*1024*1024)
			
 
				+static unsigned ntasks = 1000;
			
 
				 #endif
			
 
				 
			
 
				-static unsigned ntasks = 1000;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 static uint64_t get_total_memory_size(void)
			
--- a/tests/errorcheck/starpu_init_noworker.c
+++ b/tests/errorcheck/starpu_init_noworker.c
@@ -33,7 +33,7 @@ static void unset_env_variables(void)
 
				 {
			
 
				 	(void) unsetenv("STARPU_NCPUS");
			
 
				 	(void) unsetenv("STARPU_NCUDA");
			
 
				-	(void) unsetenv("STARPU_NNOPENCL");
			
 
				+	(void) unsetenv("STARPU_NOPENCL");
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
@@ -58,9 +58,20 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* starpu_init should return -ENODEV */
			
 
				 	ret = starpu_init(&conf);
			
 
				-	if (ret != -ENODEV)
			
 
				+	if (ret == -ENODEV)
			
 
				+	     return EXIT_SUCCESS;
			
 
				+	else
			
 
				+	{
			
 
				+	     	unsigned ncpu = starpu_cpu_worker_get_count();
			
 
				+		unsigned ncuda = starpu_cuda_worker_get_count();
			
 
				+		unsigned nopencl = starpu_opencl_worker_get_count();
			
 
				+		FPRINTF(stderr, "StarPU has found :\n");
			
 
				+		FPRINTF(stderr, "\t%d CPU cores\n", ncpu);
			
 
				+		FPRINTF(stderr, "\t%d CUDA devices\n", ncuda);
			
 
				+		FPRINTF(stderr, "\t%d OpenCL devices\n", nopencl);
			
 
				 		return EXIT_FAILURE;
			
 
				+	}
			
 
				+
			
 
				 
			
 
				-	return EXIT_SUCCESS;
			
 
				 }
			
 
				 #endif
			
--- a/tests/helper/starpu_data_cpy.c
+++ b/tests/helper/starpu_data_cpy.c
@@ -17,19 +17,18 @@
 
				 #include <starpu.h>
			
 
				 #include "../helper.h"
			
 
				 
			
 
				-int var1, var2;
			
 
				-starpu_data_handle_t var1_handle, var2_handle;
			
 
				-
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret;
			
 
				+	int var1, var2;
			
 
				+	starpu_data_handle_t var1_handle, var2_handle;
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	var1 = 42;
			
 
				-	var2 = 0;
			
 
				+	var2 = 12;
			
 
				 
			
 
				 	starpu_variable_data_register(&var1_handle, 0, (uintptr_t)&var1, sizeof(var1));
			
 
				 	starpu_variable_data_register(&var2_handle, 0, (uintptr_t)&var2, sizeof(var2));
			
@@ -38,12 +37,17 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_cpy");
			
 
				 
			
 
				 	starpu_data_acquire(var2_handle, STARPU_R);
			
 
				-	STARPU_ASSERT(var2 == 42);
			
 
				+	ret = EXIT_SUCCESS;
			
 
				+	if (var2 != var1)
			
 
				+	{
			
 
				+	     FPRINTF(stderr, "var2 is %d but it should be %d\n", var2, var1);
			
 
				+	     ret = EXIT_FAILURE;
			
 
				+	}
			
 
				 	starpu_data_release(var2_handle);
			
 
				 
			
 
				 	starpu_data_unregister(var1_handle);
			
 
				 	starpu_data_unregister(var2_handle);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	STARPU_RETURN(EXIT_SUCCESS);
			
 
				+	STARPU_RETURN(ret);
			
 
				 }
			
--- a/tests/main/task_wait_api.c
+++ b/tests/main/task_wait_api.c
@@ -120,8 +120,9 @@ int main(int argc, char **argv)
 
				 	ret = starpu_task_wait(taskK); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
			
 
				 	ret = starpu_task_wait(taskL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
			
 
				 
			
 
				-	/* Destroy all the tasks that were not detached and that we did not
			
 
				-	 * wait for previously */
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	/* Destroy all the tasks that were not detached */
			
 
				 	starpu_task_destroy(taskA);
			
 
				 	starpu_task_destroy(taskC);
			
 
				 	starpu_task_destroy(taskD);
			
--- a/tools/dev/internal/rename_internal.sed
+++ b/tools/dev/internal/rename_internal.sed
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2010  Université de Bordeaux 1
			
 
				-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -14,6 +14,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				+s/\bstruct starpu_priority_taskq_s\b/struct _starpu_priority_taskq/g
			
 
				 s/\bSTARPU_FUT_APPS_KEY\b/_STARPU_FUT_APPS_KEY/g
			
 
				 s/\bSTARPU_FUT_CPU_KEY\b/_STARPU_FUT_CPU_KEY/g
			
 
				 s/\bSTARPU_FUT_CUDA_KEY\b/_STARPU_FUT_CUDA_KEY/g