14 years ago · 64ee8754cc
--- a/.gitignore
+++ b/.gitignore
@@ -185,3 +185,4 @@ starpu.log
 
				 /gcc-plugin/tests/unregister
			
 
				 /gcc-plugin/tests/lib-user
			
 
				 /gcc-plugin/examples/matrix-mult
			
 
				+/gcc-plugin/src/c-expr.c
			
--- a/AUTHORS
+++ b/AUTHORS
@@ -10,3 +10,5 @@ William Braik <wbraik@gmail.com>
 
				 Yann Courtois <yann.courtois33@gmail.com>
			
 
				 Jean-Marie Couteyen <jm.couteyen@gmail.com>
			
 
				 Anthony Roy <theanthony33@gmail.com>
			
 
				+David Gómez <david_gomez1380@yahoo.com.mx>
			
 
				+Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,6 +18,8 @@ ACLOCAL_AMFLAGS=-I m4
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				 
			
 
				 SUBDIRS = src
			
 
				+SUBDIRS += tools tests doc
			
 
				+
			
 
				 if USE_MPI
			
 
				 SUBDIRS += mpi
			
 
				 endif
			
@@ -26,7 +28,7 @@ if BUILD_SOCL
 
				 SUBDIRS += socl
			
 
				 endif
			
 
				 
			
 
				-SUBDIRS += tools examples tests doc
			
 
				+SUBDIRS += examples
			
 
				 
			
 
				 if COND_OPT
			
 
				 SUBDIRS += tests/opt examples/opt
			
@@ -68,6 +70,8 @@ clean-local:
 
				 install-exec-local:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(bindir)
			
 
				 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
			
 
				+uninstall-local:
			
 
				+	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
			
 
				 endif
			
 
				 
			
 
				 if STARPU_HAVE_WINDOWS
			
@@ -77,3 +81,5 @@ txtdir = ${docdir}
 
				 endif
			
 
				 txt_DATA = AUTHORS COPYING.LGPL README
			
 
				 EXTRA_DIST = AUTHORS COPYING.LGPL README
			
 
				+
			
 
				+include starpu-top/extradist
			
--- a/configure.ac
+++ b/configure.ac
@@ -2,6 +2,7 @@
 
				 #
			
 
				 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2011  Télécom-SudParis
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -33,8 +34,8 @@ AC_PROG_SED
 
				 AC_PROG_LN_S
			
 
				 AC_PROG_F77
			
 
				 
			
 
				-AC_LIBTOOL_WIN32_DLL
			
 
				-AC_PROG_LIBTOOL
			
 
				+LT_PREREQ([2.2])
			
 
				+LT_INIT([win32-dll])
			
 
				 
			
 
				 AC_PROG_INSTALL
			
 
				 AC_PROG_MKDIR_P
			
@@ -158,18 +159,6 @@ if test x$enable_libnuma = xyes; then
 
				 fi
			
 
				 
			
 
				 ###############################################################################
			
 
				-#									      #
			
 
				-#				SCED_CTX settings			      #
			
 
				-#									      #
			
 
				-###############################################################################
			
 
				-AC_MSG_CHECKING(maximum number of sched_ctxs)
			
 
				-AC_ARG_ENABLE(max_sched_ctxs, [AS_HELP_STRING([--enable-max-sched-ctxs=<number>],
			
 
				-			[maximum number of sched_ctxs])],
			
 
				-			max_sched_ctxs=$enableval, max_sched_ctxs=10)
			
 
				-AC_MSG_RESULT($max_sched_ctxs)
			
 
				-AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of sched_ctxs supported])
			
 
				-
			
 
				-###############################################################################
			
 
				 #                                                                             #
			
 
				 #                                 CPUs settings                               #
			
 
				 #                                                                             #
			
@@ -282,6 +271,7 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 
				 
			
 
				     if test "$have_valid_cuda" = "no" ; then
			
 
				         LDFLAGS="${SAVED_LDFLAGS}"
			
 
				+	unset STARPU_CUDA_LDFLAGS
			
 
				     fi
			
 
				 ])
			
 
				 
			
@@ -460,8 +450,10 @@ if test x$have_curand = xyes; then
 
				 fi
			
 
				 
			
 
				 # Peer transfers are only supported since CUDA 4.0
			
 
				+# Disable them if user explicity wants to disable them
			
 
				+AC_ARG_ENABLE(cuda_memcpy_peer, [AS_HELP_STRING([--disable-cuda-memcpy-peer], [do not allow peer transfers when using CUDA 4.0])],, [enable_cuda_memcpy_peer=yes])
			
 
				 have_cuda_memcpy_peer=no
			
 
				-if test x$enable_cuda = xyes; then
			
 
				+if test x$enable_cuda_memcpy_peer = xyes -a x$enable_cuda = xyes ; then
			
 
				     SAVED_LDFLAGS="${LDFLAGS}"
			
 
				     LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
			
 
				     AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
			
@@ -898,6 +890,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 
				 AC_MSG_RESULT($nmaxworkers)
			
 
				 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
			
 
				 
			
 
				+# Computes the maximum number of implementations per arch
			
 
				+AC_MSG_CHECKING(maximum number of implementations)
			
 
				+AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
			
 
				+		[maximum number of implementations])],
			
 
				+		maximplementations=$enableval, maximplementations=1)
			
 
				+AC_MSG_RESULT($maximplementations)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
			
 
				+		[maximum number of implementations])
			
 
				+
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				 #                                    MPI                                      #
			
@@ -1116,6 +1117,10 @@ else
 
				    run_gcc_plugin_test_suite="no"
			
 
				 fi
			
 
				 
			
 
				+# Bison is used to generate the C expression parser.  The generated
			
 
				+# parser is part of the distribution, though.
			
 
				+AC_PROG_YACC
			
 
				+
			
 
				 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
			
 
				 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
			
 
				 
			
@@ -1285,6 +1290,10 @@ AC_MSG_CHECKING(which BLAS lib should be used)
 
				 AC_MSG_RESULT($blas_lib)
			
 
				 AC_SUBST(BLAS_LIB,$blas_lib)
			
 
				 
			
 
				+##########################################
			
 
				+# FFT                                    #
			
 
				+##########################################
			
 
				+
			
 
				 have_fftw=no
			
 
				 have_fftwf=no
			
 
				 have_fftwl=no
			
@@ -1295,7 +1304,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
 
				   have_fftw=yes
			
 
				 ], [:])
			
 
				 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
			
 
				-
			
 
				+ 
			
 
				 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
			
 
				   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
			
 
				   AC_SUBST([STARPU_HAVE_FFTWF], [1])
			
@@ -1310,6 +1319,10 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 
				 ], [:])
			
 
				 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
			
 
				 
			
 
				+##########################################
			
 
				+# hwloc                                  #
			
 
				+##########################################
			
 
				+
			
 
				 AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
			
 
				 SAVED_LDFLAGS="${LDFLAGS}"
			
 
				 SAVED_CPPFLAGS="${CPPFLAGS}"
			
@@ -1364,23 +1377,29 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 
				 # File configuration
			
 
				 AC_CONFIG_COMMANDS([executable-scripts], [
			
 
				   chmod +x tests/regression/regression.sh
			
 
				-  chmod +x gcc-plugin/tests/run-test
			
 
				 ])
			
 
				 
			
 
				 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
			
 
				 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
			
 
				 
			
 
				-AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
			
 
				+if test $build_gcc_plugin == "yes" ; then
			
 
				+    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
			
 
				+    AC_OUTPUT([
			
 
				+	    gcc-plugin/Makefile
			
 
				+	    gcc-plugin/src/Makefile
			
 
				+	    gcc-plugin/tests/Makefile
			
 
				+	    gcc-plugin/tests/run-test
			
 
				+	    gcc-plugin/examples/Makefile
			
 
				+            ])
			
 
				+    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
			
 
				+            chmod +x gcc-plugin/tests/run-test
			
 
				+            ])
			
 
				+fi 
			
 
				 
			
 
				 AC_OUTPUT([
			
 
				 	Makefile
			
 
				 	src/Makefile
			
 
				 	tools/Makefile
			
 
				-	gcc-plugin/Makefile
			
 
				-	gcc-plugin/src/Makefile
			
 
				-	gcc-plugin/tests/Makefile
			
 
				-	gcc-plugin/tests/run-test
			
 
				-	gcc-plugin/examples/Makefile
			
 
				 	socl/Makefile
			
 
				 	socl/src/Makefile
			
 
				 	libstarpu.pc
			
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2009  Université de Bordeaux 1
			
 
				-# Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,6 +16,8 @@
 
				 
			
 
				 info_TEXINFOS = starpu.texi
			
 
				 
			
 
				+starpu_TEXINFOS = c-extensions.texi
			
 
				+
			
 
				 MAINTAINERCLEANFILES = starpu.pdf
			
 
				 
			
 
				 EXTRA_DIST = starpu.pdf \
			
@@ -28,6 +30,9 @@ EXTRA_DIST = starpu.pdf \
 
				 
			
 
				 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
			
 
				 
			
 
				+uninstall-local:
			
 
				+	$(RM) $(DESTDIR)$(infodir)/dir
			
 
				+
			
 
				 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
			
 
				 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
			
 
				 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
			
--- a/doc/c-extensions.texi
+++ b/doc/c-extensions.texi
@@ -0,0 +1,160 @@
 
				+@c This is part of the StarPU Handbook.
			
 
				+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+
			
 
				+@node C Extensions
			
 
				+@chapter C Extensions
			
 
				+
			
 
				+@cindex C extensions
			
 
				+@cindex GCC plug-in
			
 
				+
			
 
				+When configured with @code{--enable-gcc-extensions}, StarPU builds a
			
 
				+plug-in for the GNU Compiler Collection (GCC), which defines extensions
			
 
				+to the C language that make it easier to write StarPU code@footnote{This
			
 
				+feature is only available for GCC 4.5 and later.}.  Those extensions
			
 
				+include syntactic sugar for defining tasks and their implementations,
			
 
				+invoking a task, and manipulating data buffers.
			
 
				+
			
 
				+This section does not require detailed knowledge of the StarPU library.
			
 
				+
			
 
				+Note: as of StarPU @value{VERSION}, this is still an area under
			
 
				+development and subject to change.
			
 
				+
			
 
				+@menu
			
 
				+* Defining Tasks::              Defining StarPU tasks
			
 
				+* Registered Data Buffers::     Manipulating data buffers
			
 
				+@end menu
			
 
				+
			
 
				+@node Defining Tasks
			
 
				+@section Defining Tasks
			
 
				+
			
 
				+@cindex task
			
 
				+@cindex task implementation
			
 
				+
			
 
				+The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
			
 
				+
			
 
				+@enumerate
			
 
				+@item
			
 
				+tasks may have several implementations---e.g., one for CPUs, one written
			
 
				+in OpenCL, one written in CUDA;
			
 
				+@item
			
 
				+when a task is invoked, it may run in parallel, and StarPU is free to
			
 
				+choose any of its implementations.
			
 
				+@end enumerate
			
 
				+
			
 
				+Tasks and their implementations must be @emph{declared}.  These
			
 
				+declarations are annotated with @dfn{attributes} (@pxref{Attribute
			
 
				+Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
			
 
				+(GCC)}): the declaration of a task is a regular C function declaration
			
 
				+with an additional @code{task} attribute, and task implementations are
			
 
				+declared with a @code{task_implementation} attribute.
			
 
				+
			
 
				+The following function attributes are provided:
			
 
				+
			
 
				+@table @code
			
 
				+
			
 
				+@item task
			
 
				+@cindex @code{task} attribute
			
 
				+Declare the given function as a StarPU task.  Its return type must be
			
 
				+@code{void}, and it must not be defined---instead, a definition will
			
 
				+automatically be provided by the compiler.
			
 
				+
			
 
				+Under the hood, declaring a task leads to the declaration of the
			
 
				+corresponding @code{codelet} (@pxref{Codelet and Tasks}).  If one or
			
 
				+more task implementations are declared in the same compilation unit,
			
 
				+then the codelet and the function itself are also defined; they inherit
			
 
				+the scope of the task.
			
 
				+
			
 
				+Scalar arguments to the task are passed by value and copied to the
			
 
				+target device if need be---technically, they are passed as the
			
 
				+@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
			
 
				+
			
 
				+Pointer arguments are assumed to be registered data buffers---the
			
 
				+@code{buffers} argument of a task (@pxref{Codelets and Tasks,
			
 
				+@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
			
 
				+read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
			
 
				+buffers are assumed to be used read-write (@code{STARPU_RW}).
			
 
				+
			
 
				+@item task_implementation (@var{target}, @var{task})
			
 
				+@cindex @code{task_implementation} attribute
			
 
				+Declare the given function as an implementation of @var{task} to run on
			
 
				+@var{target}.  @var{target} must be a string, currently one of
			
 
				+@code{"cpu"} or @code{"cuda"}.
			
 
				+@c FIXME: Update when OpenCL support is ready.
			
 
				+
			
 
				+@end table
			
 
				+
			
 
				+Here is an example:
			
 
				+
			
 
				+@example
			
 
				+static void matmul (const float *A, const float *B, float *C,
			
 
				+		    size_t nx, size_t ny, size_t nz)
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				+static void matmul_cpu (const float *A, const float *B, float *C,
			
 
				+			size_t nx, size_t ny, size_t nz)
			
 
				+  __attribute__ ((task_implementation ("cpu", matmul)));
			
 
				+
			
 
				+
			
 
				+static void
			
 
				+matmul_cpu (const float *A, const float *B, float *C,
			
 
				+	    size_t nx, size_t ny, size_t nz)
			
 
				+@{
			
 
				+  size_t i, j, k;
			
 
				+
			
 
				+  for (j = 0; j < ny; j++)
			
 
				+    for (i = 0; i < nx; i++)
			
 
				+      @{
			
 
				+	for (k = 0; k < nz; k++)
			
 
				+	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
			
 
				+      @}
			
 
				+@}
			
 
				+@end example
			
 
				+
			
 
				+@noindent
			
 
				+A @code{matmult} task is defined; it has only one implementation,
			
 
				+@code{matmult_cpu}, which runs on the CPU.  Variables @var{A} and
			
 
				+@var{B} are input buffers, whereas @var{C} is considered an input/output
			
 
				+buffer.  The task can be invoked like a regular C function:
			
 
				+
			
 
				+@example
			
 
				+matmul (&A[i * zdim * bydim + k * bzdim * bydim],
			
 
				+        &B[k * xdim * bzdim + j * bxdim * bzdim],
			
 
				+        &C[i * xdim * bydim + j * bxdim * bydim],
			
 
				+        bxdim, bydim, bzdim);
			
 
				+@end example
			
 
				+
			
 
				+@noindent
			
 
				+This leads to an @dfn{asynchronous invocation}, whereby @code{matmult}'s
			
 
				+implementation may run in parallel with the continuation of the caller.
			
 
				+
			
 
				+The next section describes how memory buffers must be handled in
			
 
				+StarPU-GCC code.
			
 
				+
			
 
				+
			
 
				+@node Registered Data Buffers
			
 
				+@section Registered Data Buffers
			
 
				+
			
 
				+Data buffers such as matrices and vectors that are to be passed to tasks
			
 
				+must be @dfn{registered}.  Registration allows StarPU to handle data
			
 
				+transfers among devices---e.g., transferring an input buffer from the
			
 
				+CPU's main memory to a task scheduled to run a GPU (@pxref{StarPU Data
			
 
				+Management Library}).
			
 
				+
			
 
				+The following pragmas are provided:
			
 
				+
			
 
				+@table @code
			
 
				+
			
 
				+@item #pragma starpu register @var{ptr} [@var{size}]
			
 
				+Register @var{ptr} as a @var{size}-element buffer.
			
 
				+
			
 
				+@item #pragma starpu unregister @var{ptr}
			
 
				+@item #pragma starpu acquire @var{ptr}
			
 
				+
			
 
				+@end table
			
 
				+
			
 
				+FIXME: finish
			
 
				+
			
 
				+@c Local Variables:
			
 
				+@c TeX-master: "guile.texi"
			
 
				+@c ispell-local-dictionary: "american"
			
 
				+@c End:
			
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -49,8 +49,9 @@ was last updated on @value{UPDATED}.
 
				 * Configuring StarPU::          How to configure StarPU
			
 
				 * StarPU API::                  The API to use StarPU
			
 
				 * Advanced Topics::             Advanced use of StarPU
			
 
				-* Full source code for the 'Scaling a Vector' example::  
			
 
				+* C Extensions::                Easier StarPU programming with GCC
			
 
				 
			
 
				+* Full source code for the 'Scaling a Vector' example::
			
 
				 * Function Index::              Index of C functions.
			
 
				 @end menu
			
 
				 
			
@@ -1227,16 +1228,16 @@ Partitioning can be applied several times, see
 
				 @section Performance model example
			
 
				 
			
 
				 To achieve good scheduling, StarPU scheduling policies need to be able to
			
 
				-estimate in advance the duration of a task. This is done by giving to codelets a
			
 
				-performance model. There are several kinds of performance models.
			
 
				+estimate in advance the duration of a task. This is done by giving to codelets
			
 
				+a performance model, by defining a @code{starpu_perfmodel_t} structure and
			
 
				+providing its address in the @code{model} field of the @code{starpu_codelet}
			
 
				+structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel_t}
			
 
				+are mandatory, to give a name to the model, and the type of the model, since
			
 
				+there are several kinds of performance models.
			
 
				 
			
 
				 @itemize
			
 
				 @item
			
 
				-Providing an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
			
 
				-see for instance
			
 
				-@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
			
 
				-@item
			
 
				-Measured at runtime (STARPU_HISTORY_BASED model type). This assumes that for a
			
 
				+Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
			
 
				 given set of data input/output sizes, the performance will always be about the
			
 
				 same. This is very true for regular kernels on GPUs for instance (<0.1% error),
			
 
				 and just a bit less true on CPUs (~=1% error). This also assumes that there are
			
@@ -1276,7 +1277,7 @@ starpu_codelet cl = @{
 
				 @end cartouche
			
 
				 
			
 
				 @item
			
 
				-Measured at runtime and refined by regression (STARPU_REGRESSION_*_BASED
			
 
				+Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
			
 
				 model type). This still assumes performance regularity, but can work
			
 
				 with various data input sizes, by applying regression over observed
			
 
				 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
			
@@ -1286,7 +1287,12 @@ STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
 
				 model for the @code{memset} operation.
			
 
				 
			
 
				 @item
			
 
				-Provided explicitly by the application (STARPU_PER_ARCH model type): the
			
 
				+Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
			
 
				+see for instance
			
 
				+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
			
 
				+
			
 
				+@item
			
 
				+Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
			
 
				 @code{.per_arch[i].cost_model} fields have to be filled with pointers to
			
 
				 functions which return the expected duration of the task in micro-seconds, one
			
 
				 per architecture.
			
@@ -1877,6 +1883,22 @@ TODO
 
				 
			
 
				 @c what kind of information do we get ?
			
 
				 
			
 
				+The bus speed measured by StarPU can be displayed by using the
			
 
				+@code{starpu_machine_display} tool, for instance:
			
 
				+
			
 
				+@example
			
 
				+StarPU has found :
			
 
				+        3 CUDA devices
			
 
				+                CUDA 0 (Tesla C2050 02:00.0)
			
 
				+                CUDA 1 (Tesla C2050 03:00.0)
			
 
				+                CUDA 2 (Tesla C2050 84:00.0)
			
 
				+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
			
 
				+RAM     0.000000        5176.530428     5176.492994     5191.710722
			
 
				+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
			
 
				+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
			
 
				+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
			
 
				+@end example
			
 
				+
			
 
				 @node StarPU-Top
			
 
				 @subsection StarPU-Top interface
			
 
				 
			
@@ -2057,7 +2079,7 @@ This will create an @code{activity.data} file in the current
 
				 directory. A profile of the application showing the activity of StarPU
			
 
				 during the execution of the program can be generated:
			
 
				 @example
			
 
				-$ starpu_top.sh activity.data
			
 
				+$ starpu_top activity.data
			
 
				 @end example
			
 
				 
			
 
				 This will create a file named @code{activity.eps} in the current directory.
			
@@ -2558,6 +2580,7 @@ Enable flags for the @code{gcov} coverage tool.
 
				 * --with-cuda-dir::             
			
 
				 * --with-cuda-include-dir::             
			
 
				 * --with-cuda-lib-dir::             
			
 
				+* --disable-cuda-memcpy-peer::
			
 
				 * --enable-maxopencldev::       
			
 
				 * --disable-opencl::            
			
 
				 * --with-opencl-dir::           
			
@@ -2624,6 +2647,13 @@ notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
 
				 
			
 
				 @end table
			
 
				 
			
 
				+@node --disable-cuda-memcpy-peer
			
 
				+@subsubsection @code{--disable-cuda-memcpy-peer}
			
 
				+@table @asis
			
 
				+@item @emph{Description}
			
 
				+Explicitely disables peer transfers when using CUDA 4.0
			
 
				+@end table
			
 
				+
			
 
				 @node --enable-maxopencldev
			
 
				 @subsubsection @code{--enable-maxopencldev=<number>}
			
 
				 @table @asis
			
@@ -4893,6 +4923,13 @@ static struct starpu_sched_policy_s dummy_sched_policy = @{
 
				 
			
 
				 
			
 
				 @c ---------------------------------------------------------------------
			
 
				+@c C Extensions
			
 
				+@c ---------------------------------------------------------------------
			
 
				+
			
 
				+@include c-extensions.texi
			
 
				+
			
 
				+
			
 
				+@c ---------------------------------------------------------------------
			
 
				 @c Appendices
			
 
				 @c ---------------------------------------------------------------------
			
 
				 
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -2,6 +2,7 @@
 
				 #
			
 
				 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2011  Télécom-SudParis
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -152,6 +153,7 @@ examplebin_PROGRAMS +=				\
 
				 	basic_examples/mult			\
			
 
				 	basic_examples/block			\
			
 
				 	basic_examples/variable			\
			
 
				+	basic_examples/mult_impl                \
			
 
				 	filters/fvector				\
			
 
				 	filters/fblock				\
			
 
				 	filters/fmatrix				\
			
--- a/examples/basic_examples/hello_world.c
+++ b/examples/basic_examples/hello_world.c
@@ -52,12 +52,12 @@ struct params {
 
				 };
			
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				-	struct params *params = cl_arg;
			
 
				+	struct params *params = (struct params *) cl_arg;
			
 
				 
			
 
				 	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
			
 
				 }
			
 
				 
			
 
				-starpu_codelet cl;
			
 
				+starpu_codelet cl = {};
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
--- a/examples/basic_examples/mult.c
+++ b/examples/basic_examples/mult.c
@@ -127,9 +127,9 @@ static void init_problem_data(void)
 
				 
			
 
				 	/* we initialize matrices A, B and C in the usual way */
			
 
				 
			
 
				-	A = malloc(zdim*ydim*sizeof(float));
			
 
				-	B = malloc(xdim*zdim*sizeof(float));
			
 
				-	C = malloc(xdim*ydim*sizeof(float));
			
 
				+	A = (float *) malloc(zdim*ydim*sizeof(float));
			
 
				+	B = (float *) malloc(xdim*zdim*sizeof(float));
			
 
				+	C = (float *) malloc(xdim*ydim*sizeof(float));
			
 
				 
			
 
				 	/* fill the A and B matrices */
			
 
				 	srand(2009);
			
--- a/examples/basic_examples/mult_impl.c
+++ b/examples/basic_examples/mult_impl.c
@@ -0,0 +1,384 @@
 
				+/*/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Télécom-SudParis
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/types.h>
			
 
				+#include <sys/time.h>
			
 
				+#include <pthread.h>
			
 
				+#include <signal.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+static float *A, *B, *C;
			
 
				+static starpu_data_handle A_handle, B_handle, C_handle;
			
 
				+
			
 
				+static unsigned nslicesx = 4;
			
 
				+static unsigned nslicesy = 4;
			
 
				+static unsigned xdim = 1024;
			
 
				+static unsigned ydim = 1024;
			
 
				+static unsigned zdim = 512;
			
 
				+
			
 
				+
			
 
				+double mult_gemm_cost(starpu_buffer_descr *descr)
			
 
				+{
			
 
				+	/* C = A * B */
			
 
				+	uint32_t nxC, nyC, nxA;
			
 
				+
			
 
				+
			
 
				+	nxC = starpu_matrix_get_nx(descr[2].handle);
			
 
				+	nyC = starpu_matrix_get_ny(descr[2].handle);
			
 
				+	nxA = starpu_matrix_get_nx(descr[0].handle);
			
 
				+
			
 
				+	//printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
			
 
				+
			
 
				+	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
			
 
				+
			
 
				+	printf("cost %e \n", cost);
			
 
				+
			
 
				+	return cost;
			
 
				+}
			
 
				+
			
 
				+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
			
 
				+{
			
 
				+	float *subA, *subB, *subC;
			
 
				+	uint32_t nxC, nyC, nyA;
			
 
				+	uint32_t ldA, ldB, ldC;
			
 
				+	printf("On application: Hello, this is kernel cpu_mult\n\n");
			
 
				+	/* .blas.ptr gives a pointer to the first element of the local copy */
			
 
				+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
			
 
				+	 * is the number of lines that are separated by .blas.ld elements (ld
			
 
				+	 * stands for leading dimension).
			
 
				+	 * NB: in case some filters were used, the leading dimension is not
			
 
				+	 * guaranteed to be the same in main memory (on the original matrix)
			
 
				+	 * and on the accelerator! */
			
 
				+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+	/* we assume a FORTRAN-ordering! */
			
 
				+	unsigned i,j,k;
			
 
				+	for (i = 0; i < nyC; i++)
			
 
				+	{
			
 
				+		for (j = 0; j < nxC; j++)
			
 
				+		{
			
 
				+			float sum = 0.0;
			
 
				+
			
 
				+			for (k = 0; k < nyA; k++)
			
 
				+			{
			
 
				+				sum += subA[j+k*ldA]*subB[k+i*ldB];
			
 
				+			}
			
 
				+
			
 
				+			subC[j + i*ldC] = sum;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
			
 
				+{
			
 
				+	float *subA, *subB, *subC;
			
 
				+	uint32_t nxC, nyC, nyA;
			
 
				+	uint32_t ldA, ldB, ldC;
			
 
				+	printf("On application: this is kernel cpu_mult_2\n\n");
			
 
				+	/* .blas.ptr gives a pointer to the first element of the local copy */
			
 
				+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+	/* we assume a FORTRAN-ordering! */
			
 
				+	unsigned i,j,k;
			
 
				+	for (j = 0; j < nxC; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nyC; i++)
			
 
				+		{
			
 
				+			float sum = 0.0;
			
 
				+
			
 
				+			for (k = 0; k < nyA; k++)
			
 
				+			{
			
 
				+				sum += subA[j+k*ldA]*subB[k+i*ldB];
			
 
				+			}
			
 
				+
			
 
				+			subC[j + i*ldC] = sum;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+static void init_problem_data(void)
			
 
				+{
			
 
				+	unsigned i,j;
			
 
				+
			
 
				+	/* we initialize matrices A, B and C in the usual way */
			
 
				+
			
 
				+	A = malloc(zdim*ydim*sizeof(float));
			
 
				+	B = malloc(xdim*zdim*sizeof(float));
			
 
				+	C = malloc(xdim*ydim*sizeof(float));
			
 
				+
			
 
				+	/* fill the A and B matrices */
			
 
				+	srand(2009);
			
 
				+	for (j=0; j < ydim; j++) {
			
 
				+		for (i=0; i < zdim; i++) {
			
 
				+			A[j+i*ydim] = (float)(starpu_drand48());
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for (j=0; j < zdim; j++) {
			
 
				+		for (i=0; i < xdim; i++) {
			
 
				+			B[j+i*zdim] = (float)(starpu_drand48());
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for (j=0; j < ydim; j++) {
			
 
				+		for (i=0; i < xdim; i++) {
			
 
				+			C[j+i*ydim] = (float)(0);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void partition_mult_data(void)
			
 
				+{
			
 
				+	/* note that we assume a FORTRAN ordering here! */
			
 
				+
			
 
				+	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
			
 
				+		ydim, ydim, zdim, sizeof(float));
			
 
				+	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
			
 
				+		zdim, zdim, xdim, sizeof(float));
			
 
				+	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
			
 
				+		ydim, ydim, xdim, sizeof(float));
			
 
				+
			
 
				+	/* A filter is a method to partition a data into disjoint chunks, it is
			
 
				+	 * described by the means of the "struct starpu_data_filter" structure that
			
 
				+	 * contains a function that is applied on a data handle to partition it
			
 
				+	 * into smaller chunks, and an argument that is passed to the function
			
 
				+	 * (eg. the number of blocks to create here).
			
 
				+	 */
			
 
				+
			
 
				+	struct starpu_data_filter vert = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nslicesx,
			
 
				+		.get_nchildren = NULL,
			
 
				+		.get_child_ops = NULL
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter horiz = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nslicesy,
			
 
				+		.get_nchildren = NULL,
			
 
				+		.get_child_ops = NULL
			
 
				+	};
			
 
				+
			
 
				+/*
			
 
				+ *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
			
 
				+ *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
			
 
				+ *	the number of filters to apply, and the indexes for each filters, for
			
 
				+ *	instance:
			
 
				+ *
			
 
				+ *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
			
 
				+ *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
			
 
				+ *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
			
 
				+ *
			
 
				+ *	Note that here we applied 2 filters recursively onto C.
			
 
				+ *
			
 
				+ *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
			
 
				+ *	of blocked matrix C for example.
			
 
				+ *
			
 
				+ *		              |---|---|---|---|
			
 
				+ *		              |   |   | B'|   | B
			
 
				+ *		              |---|---|---|---|
			
 
				+ *		                0   1   2   3
			
 
				+ *		     |----|   |---|---|---|---|
			
 
				+ *		     |    |   |   |   |   |   |
			
 
				+ *		     |    | 0 |   |   |   |   |
			
 
				+ *		     |----|   |---|---|---|---|
			
 
				+ *		     | A' |   |   |   | C'|   |
			
 
				+ *		     |    |   |   |   |   |   |
			
 
				+ *		     |----|   |---|---|---|---|
			
 
				+ *		       A              C
			
 
				+ *
			
 
				+ *	IMPORTANT: applying filters is equivalent to partitionning a piece of
			
 
				+ *	data in a hierarchical manner, so that memory consistency is enforced
			
 
				+ *	for each of the elements independantly. The tasks should therefore NOT
			
 
				+ *	access inner nodes (eg. one column of C or the whole C) but only the
			
 
				+ *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
			
 
				+ *	possible by disapplying the filters (using starpu_data_unpartition), to
			
 
				+ *	enforce memory consistency.
			
 
				+ */
			
 
				+
			
 
				+	starpu_data_partition(B_handle, &vert);
			
 
				+	starpu_data_partition(A_handle, &horiz);
			
 
				+
			
 
				+	/* starpu_data_map_filters is a variable-arity function, the first argument
			
 
				+	 * is the handle of the data to partition, the second argument is the
			
 
				+	 * number of filters to apply recursively. Filters are applied in the
			
 
				+	 * same order as the arguments.
			
 
				+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
			
 
				+	 * then applying horiz on each sub-data (ie. each column of C)
			
 
				+	 */
			
 
				+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
			
 
				+	.cost_model = mult_gemm_cost,
			
 
				+	.type = STARPU_HISTORY_BASED,//STARPU_COMMON, //STARPU_PER_ARCH,
			
 
				+	.symbol = "mult_perf_model"
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+static struct starpu_perfmodel_t mult_perf_model = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "mult_perf_model"
			
 
				+};
			
 
				+*/
			
 
				+
			
 
				+struct starpu_conf conf = {
			
 
				+		.sched_policy_name = "heft",
			
 
				+		.calibrate = 1,
			
 
				+		.ncpus = 4
			
 
				+};
			
 
				+
			
 
				+
			
 
				+static starpu_codelet cl = {
			
 
				+        /* we can only execute that kernel on a CPU yet */
			
 
				+        .where = STARPU_CPU,
			
 
				+        //.starpu_impl_multiple = 1,
			
 
				+        /* CPU implementation of the codelet */
			
 
				+        .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
			
 
				+        .cpu_funcs = {cpu_mult,cpu_mult_2},
			
 
				+        /* the codelet manipulates 3 buffers that are managed by the
			
 
				+         * DSM */
			
 
				+        .nbuffers = 3,
			
 
				+        /* in case the scheduling policy may use performance models */
			
 
				+        .model = &starpu_dgemm_model_common
			
 
				+};
			
 
				+
			
 
				+static void launch_tasks(void)
			
 
				+{
			
 
				+	/* partition the work into slices */
			
 
				+	unsigned taskx, tasky;
			
 
				+
			
 
				+	for (taskx = 0; taskx < nslicesx; taskx++)
			
 
				+	{
			
 
				+		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				+		{
			
 
				+			/* C[taskx, tasky] = A[tasky] B[taskx] */
			
 
				+
			
 
				+			/* by default, starpu_task_create() returns an
			
 
				+ 			 * asynchronous task (ie. task->synchronous = 0) */
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+			/* this task implements codelet "cl" */
			
 
				+			task->cl = &cl;
			
 
				+
			
 
				+			/*
			
 
				+			 *              |---|---|---|---|
			
 
				+			 *              |   | * |   |   | B
			
 
				+			 *              |---|---|---|---|
			
 
				+			 *                    X
			
 
				+			 *     |----|   |---|---|---|---|
			
 
				+			 *     |****| Y |   |***|   |   |
			
 
				+			 *     |****|   |   |***|   |   |
			
 
				+			 *     |----|   |---|---|---|---|
			
 
				+			 *     |    |   |   |   |   |   |
			
 
				+			 *     |    |   |   |   |   |   |
			
 
				+			 *     |----|   |---|---|---|---|
			
 
				+			 *       A              C
			
 
				+			 */
			
 
				+
			
 
				+			/* there was a single filter applied to matrices A
			
 
				+			 * (respectively B) so we grab the handle to the chunk
			
 
				+			 * identified by "tasky" (respectively "taskx). The "1"
			
 
				+			 * tells StarPU that there is a single argument to the
			
 
				+			 * variable-arity function starpu_data_get_sub_data */
			
 
				+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
			
 
				+			task->buffers[0].mode = STARPU_R;
			
 
				+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
			
 
				+			task->buffers[1].mode = STARPU_R;
			
 
				+
			
 
				+			/* 2 filters were applied on matrix C, so we give
			
 
				+			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
			
 
				+			 * must match the order in which the filters were
			
 
				+			 * applied.
			
 
				+			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
			
 
				+			 * a handle to the column number k of matrix C.
			
 
				+			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
			
 
				+			 * equivalent to
			
 
				+			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
			
 
				+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
			
 
				+			task->buffers[2].mode = STARPU_W;
			
 
				+
			
 
				+			/* this is not a blocking call since task->synchronous = 0 */
			
 
				+			int summit_task;
			
 
				+			summit_task = starpu_task_submit(task);
			
 
				+			printf("task is submmited or not %d\n",summit_task);
			
 
				+
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int main(void)
			
 
				+{
			
 
				+	/* start the runtime */
			
 
				+	starpu_init(&conf);
			
 
				+
			
 
				+	/* initialize matrices A, B and C and register them to StarPU */
			
 
				+	init_problem_data();
			
 
				+
			
 
				+	/* partition matrices into blocks that can be manipulated by the
			
 
				+ 	 * codelets */
			
 
				+	partition_mult_data();
			
 
				+
			
 
				+	/* submit all tasks in an asynchronous fashion */
			
 
				+	launch_tasks();
			
 
				+
			
 
				+	/* wait for termination */
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	/* remove the filters applied by the means of starpu_data_map_filters; now
			
 
				+ 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
			
 
				+	 * starpu_data_map_filters is called again on C_handle.
			
 
				+	 * The second argument is the memory node where the different subsets
			
 
				+	 * should be reassembled, 0 = main memory (RAM) */
			
 
				+	starpu_data_unpartition(C_handle, 0);
			
 
				+
			
 
				+	/* stop monitoring matrix C : after this, it is not possible to pass C
			
 
				+	 * (or any subset of C) as a codelet input/output. This also implements
			
 
				+	 * a barrier so that the piece of data is put back into main memory in
			
 
				+	 * case it was only available on a GPU for instance. */
			
 
				+	starpu_data_unregister(C_handle);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/basic_examples/vector_scal_cpu.c
+++ b/examples/basic_examples/vector_scal_cpu.c
@@ -24,7 +24,7 @@
 
				 void scal_cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				 	unsigned i;
			
 
				-	float *factor = cl_arg;
			
 
				+	float *factor = (float *) cl_arg;
			
 
				 
			
 
				 	/*
			
 
				 	 * The "buffers" array matches the task->buffers array: for instance
			
@@ -37,7 +37,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 
				 	 * migrated/replicated), and elemsize gives the size of each elements.
			
 
				 	 */
			
 
				 
			
 
				-	starpu_vector_interface_t *vector = buffers[0];
			
 
				+	starpu_vector_interface_t *vector = (starpu_vector_interface_t *) buffers[0];
			
 
				 
			
 
				 	/* length of the vector */
			
 
				 	unsigned n = STARPU_VECTOR_GET_NX(vector);
			
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -125,27 +126,27 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 
				 }
			
 
				 
			
 
				 struct starpu_perfmodel_t chol_model_11 = {
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_11"
			
 
				 };
			
 
				 
			
 
				 struct starpu_perfmodel_t chol_model_21 = {
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_21"
			
 
				 };
			
 
				 
			
 
				 struct starpu_perfmodel_t chol_model_22 = {
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "chol_model_22"
			
--- a/examples/filters/fblock_cpu.c
+++ b/examples/filters/fblock_cpu.c
@@ -19,7 +19,7 @@
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				         unsigned i, j, k;
			
 
				-        int *factor = cl_arg;
			
 
				+        int *factor = (int *) cl_arg;
			
 
				 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
			
 
				 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
			
 
				 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
			
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -25,7 +25,7 @@
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				         unsigned i, j;
			
 
				-        int *factor = cl_arg;
			
 
				+        int *factor = (int *) cl_arg;
			
 
				 
			
 
				         /* length of the matrix */
			
 
				         unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
			
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -24,7 +24,7 @@
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				         unsigned i;
			
 
				-        int *factor = cl_arg;
			
 
				+        int *factor = (int *) cl_arg;
			
 
				 
			
 
				         /* length of the vector */
			
 
				         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -215,9 +216,9 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
				 
			
 
				 struct starpu_perfmodel_t model_11 = {
			
 
				 	.cost_model = task_11_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_11_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_11_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_11_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
@@ -231,9 +232,9 @@ struct starpu_perfmodel_t model_11 = {
 
				 
			
 
				 struct starpu_perfmodel_t model_12 = {
			
 
				 	.cost_model = task_12_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_12_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_12_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_12_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
@@ -247,9 +248,9 @@ struct starpu_perfmodel_t model_12 = {
 
				 
			
 
				 struct starpu_perfmodel_t model_21 = {
			
 
				 	.cost_model = task_21_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_21_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_21_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_21_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
@@ -263,9 +264,9 @@ struct starpu_perfmodel_t model_21 = {
 
				 
			
 
				 struct starpu_perfmodel_t model_22 = {
			
 
				 	.cost_model = task_22_cost,
			
 
				-	.per_arch = { 
			
 
				-		[STARPU_CPU_DEFAULT] = { .cost_model = task_22_cost_cpu },
			
 
				-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
			
 
				+	.per_arch = {
			
 
				+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_22_cost_cpu },
			
 
				+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_22_cost_cuda }
			
 
				 	},
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -333,7 +333,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
				 
			
 
				 	while (1)
			
 
				 	{
			
 
				-		local_iy = STARPU_ATOMIC_ADD(pcnt, 1) - 1;
			
 
				+		local_iy = STARPU_ATOMIC_ADD((unsigned int *)pcnt, 1) - 1;
			
 
				 		if (local_iy >= block_size)
			
 
				 			break;
			
 
				 
			
--- a/examples/ppm_downscaler/ppm_downscaler.c
+++ b/examples/ppm_downscaler/ppm_downscaler.c
@@ -28,7 +28,7 @@
 
				 
			
 
				 struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
			
 
				 {
			
 
				-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
			
 
				+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
			
 
				 	assert(ppm);
			
 
				 
			
 
				 	ppm->ncols = ncols;
			
@@ -36,9 +36,9 @@ struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 
				 	ppm->coldepth = coldepth;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_MEMALIGN
			
 
				-	ppm->data = memalign(16384, ncols*nlines*sizeof(struct ppm_color));
			
 
				+	ppm->data = (struct ppm_color *) memalign(16384, ncols*nlines*sizeof(struct ppm_color));
			
 
				 #else
			
 
				-	ppm->data = malloc(ncols*nlines*sizeof(struct ppm_color));
			
 
				+	ppm->data = (struct ppm_color *) malloc(ncols*nlines*sizeof(struct ppm_color));
			
 
				 #endif
			
 
				 	assert(ppm->data);
			
 
				 
			
@@ -49,7 +49,7 @@ struct ppm_image *file_to_ppm(char *filename)
 
				 {
			
 
				 	int ret;
			
 
				 
			
 
				-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
			
 
				+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
			
 
				 	assert(ppm);
			
 
				 	
			
 
				 	FILE *file = fopen(filename, "r");
			
@@ -64,9 +64,9 @@ struct ppm_image *file_to_ppm(char *filename)
 
				 	
			
 
				 	/* allocate a buffer for the image */
			
 
				 #ifdef STARPU_HAVE_MEMALIGN
			
 
				-	ppm->data = memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
			
 
				+	ppm->data = (struct ppm_color *) memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
			
 
				 #else
			
 
				-	ppm->data = malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
			
 
				+	ppm->data = (struct ppm_color *) malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
			
 
				 #endif
			
 
				 	assert(ppm->data);
			
 
				 
			
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -123,7 +123,7 @@ int main(int argc, char **argv)
 
				 	FILE *f_in = fopen(filename_in, "r");
			
 
				 	assert(f_in);
			
 
				 
			
 
				-	struct yuv_frame *yuv_in_buffer = malloc(nframes*FRAMESIZE);
			
 
				+	struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE);
			
 
				 	fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
			
 
				 
			
 
				 	/* allocate room for an output buffer */
			
@@ -131,16 +131,16 @@ int main(int argc, char **argv)
 
				 	assert(f_out);
			
 
				 
			
 
				 /*	fprintf(stderr, "Alloc output file ...\n"); */
			
 
				-	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
			
 
				+	struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE);
			
 
				 	assert(yuv_out_buffer);
			
 
				 
			
 
				-	starpu_data_handle *frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
			
 
				-	starpu_data_handle *frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
			
 
				-	starpu_data_handle *frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
			
 
				+	starpu_data_handle *frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
			
 
				+	starpu_data_handle *frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
			
 
				+	starpu_data_handle *frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
			
 
				 
			
 
				-	starpu_data_handle *new_frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
			
 
				-	starpu_data_handle *new_frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
			
 
				-	starpu_data_handle *new_frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
			
 
				+	starpu_data_handle *new_frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
			
 
				+	starpu_data_handle *new_frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
			
 
				+	starpu_data_handle *new_frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
			
 
				 
			
 
				 	starpu_init(NULL);
			
 
				 
			
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -53,7 +53,7 @@ int main(int argc, char **argv)
 
				 		.nbuffers = 0
			
 
				 	};
			
 
				 
			
 
				-	struct starpu_task **tasks = malloc(niter*sizeof(struct starpu_task *));
			
 
				+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
			
 
				 	assert(tasks);
			
 
				 
			
 
				 	unsigned i;
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -164,11 +164,11 @@ int main(int argc, char **argv)
 
				 	unsigned long nelems = nblocks*entries_per_block;
			
 
				 	size_t size = nelems*sizeof(float);
			
 
				 
			
 
				-	x = malloc(size);
			
 
				-	y = malloc(size);
			
 
				+	x = (float *) malloc(size);
			
 
				+	y = (float *) malloc(size);
			
 
				 
			
 
				-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
			
 
				-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
			
 
				+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
			
 
				+	y_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
			
 
				 
			
 
				 	assert(x && y);
			
 
				 
			
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -129,8 +129,8 @@ int main(int argc, char **argv)
 
				 	unsigned long nelems = nblocks*entries_per_bock;
			
 
				 	size_t size = nelems*sizeof(TYPE);
			
 
				 
			
 
				-	x = malloc(size);
			
 
				-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
			
 
				+	x = (TYPE *) malloc(size);
			
 
				+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
			
 
				 	
			
 
				 	assert(x && x_handles);
			
 
				 
			
--- a/examples/socl/mandelbrot/mandelbrot.c
+++ b/examples/socl/mandelbrot/mandelbrot.c
@@ -21,7 +21,7 @@
 
				 #include <unistd.h>
			
 
				 
			
 
				 /* Uncomment this to activate X11 display */
			
 
				-#define USE_X11
			
 
				+//#define USE_X11
			
 
				 
			
 
				 #define SHORT_LOG 1
			
 
				 #define ROUND_ROBIN
			
--- a/examples/spmv/spmv.c
+++ b/examples/spmv/spmv.c
@@ -44,8 +44,8 @@ static void parse_args(int argc, char **argv)
 
				  * same number of non-zero entries. */
			
 
				 static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
			
 
				 {
			
 
				-	starpu_csr_interface_t *csr_father = father_interface;
			
 
				-	starpu_csr_interface_t *csr_child = child_interface;
			
 
				+	starpu_csr_interface_t *csr_father = (starpu_csr_interface_t *) father_interface;
			
 
				+	starpu_csr_interface_t *csr_child = (starpu_csr_interface_t *) child_interface;
			
 
				 
			
 
				 	uint32_t nrow = csr_father->nrow;
			
 
				 	size_t elemsize = csr_father->elemsize;
			
--- a/examples/starpufft/.gitignore
+++ b/examples/starpufft/.gitignore
@@ -0,0 +1 @@
 
				+/.deps
			
--- a/examples/stencil/.gitignore
+++ b/examples/stencil/.gitignore
@@ -0,0 +1 @@
 
				+/.deps
			
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -90,7 +90,7 @@ int MPI_TAG1(int z, int iter, int dir)
 
				 /* Compute the size of the different blocks */
			
 
				 static void compute_block_sizes(void)
			
 
				 {
			
 
				-	block_sizes_z = malloc(nbz*sizeof(unsigned));
			
 
				+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
			
 
				 	STARPU_ASSERT(block_sizes_z);
			
 
				 
			
 
				 	/* Perhaps the last chunk is smaller */
			
@@ -136,7 +136,7 @@ void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsi
 
				 	sizez = _sizez;
			
 
				 
			
 
				 	/* Create a grid of block descriptors */
			
 
				-	blocks = calloc(nbz, sizeof(struct block_description));
			
 
				+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
			
 
				 	STARPU_ASSERT(blocks);
			
 
				 
			
 
				 	/* What is the size of the different blocks ? */
			
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -165,10 +165,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 
				 /*
			
 
				  * Load a neighbour's boundary into block, CPU version
			
 
				  */
			
 
				-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
			
 
				-					starpu_block_interface_t *boundary,
			
 
				+static void load_subblock_from_buffer_cpu(void *_block,
			
 
				+					void *_boundary,
			
 
				 					unsigned firstz)
			
 
				 {
			
 
				+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
			
 
				+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
			
 
				 	check_load(block, boundary);
			
 
				 
			
 
				 	/* We do a contiguous memory transfer */
			
@@ -184,10 +186,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
 
				  * Load a neighbour's boundary into block, CUDA version
			
 
				  */
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
			
 
				-					starpu_block_interface_t *boundary,
			
 
				+static void load_subblock_from_buffer_cuda(void *_block,
			
 
				+					void *_boundary,
			
 
				 					unsigned firstz)
			
 
				 {
			
 
				+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
			
 
				+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
			
 
				 	check_load(block, boundary);
			
 
				 
			
 
				 	/* We do a contiguous memory transfer */
			
@@ -245,16 +249,16 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
				 	for (i=1; i<=K; i++)
			
 
				 	{
			
 
				 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
			
 
				-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
			
 
				+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
			
 
				 
			
 
				 		/* Shadow data */
			
 
				 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				 
			
 
				 		/* And perform actual computation */
			
 
				 #ifdef LIFE
			
 
				-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				 #else
			
 
				-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				 #endif /* LIFE */
			
 
				 	}
			
 
				 
			
@@ -338,16 +342,16 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
				 	for (i=1; i<=K; i++)
			
 
				 	{
			
 
				 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
			
 
				-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
			
 
				+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
			
 
				 
			
 
				 		/* Shadow data */
			
 
				 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				 
			
 
				 		/* And perform actual computation */
			
 
				 #ifdef LIFE
			
 
				-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				 #else
			
 
				-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
			
 
				+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, NULL);
			
 
				 #endif /* LIFE */
			
 
				 	}
			
 
				 
			
@@ -365,7 +369,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
				  */
			
 
				 static void update_func_cpu(void *descr[], void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
			
 
				 	if (block->bz == 0)
			
@@ -405,8 +409,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
				 
			
 
				 	for (i=1; i<=K; i++)
			
 
				 	{
			
 
				-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
			
 
				-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
			
 
				+		starpu_block_interface_t *oldb = (starpu_block_interface_t *) descr[i%2], *newb = (starpu_block_interface_t *) descr[(i+1)%2];
			
 
				+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
			
 
				 
			
 
				 		/* Shadow data */
			
 
				 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
			
@@ -424,9 +428,9 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
				 
			
 
				 		/* And perform actual computation */
			
 
				 #ifdef LIFE
			
 
				-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				 #else
			
 
				-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
			
 
				+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
			
 
				 #endif /* LIFE */
			
 
				 	}
			
 
				 
			
@@ -465,10 +469,12 @@ starpu_codelet cl_update = {
 
				  */
			
 
				 
			
 
				 /* CPU version */
			
 
				-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
			
 
				-					starpu_block_interface_t *boundary,
			
 
				+static void load_subblock_into_buffer_cpu(void *_block,
			
 
				+					void *_boundary,
			
 
				 					unsigned firstz)
			
 
				 {
			
 
				+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
			
 
				+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
			
 
				 	check_load(block, boundary);
			
 
				 
			
 
				 	/* We do a contiguous memory transfer */
			
@@ -482,10 +488,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
				 
			
 
				 /* CUDA version */
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
			
 
				-					starpu_block_interface_t *boundary,
			
 
				+static void load_subblock_into_buffer_cuda(void *_block,
			
 
				+					void *_boundary,
			
 
				 					unsigned firstz)
			
 
				 {
			
 
				+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
			
 
				+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
			
 
				 	check_load(block, boundary);
			
 
				 
			
 
				 	/* We do a contiguous memory transfer */
			
@@ -527,7 +535,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 
				 /* top save, CPU version */
			
 
				 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	top_per_worker[workerid]++;
			
 
				 
			
@@ -543,7 +551,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 
				 /* bottom save, CPU version */
			
 
				 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	bottom_per_worker[workerid]++;
			
 
				 
			
@@ -557,7 +565,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	top_per_worker[workerid]++;
			
 
				 
			
@@ -574,7 +582,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
				 /* bottom save, CUDA version */
			
 
				 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	bottom_per_worker[workerid]++;
			
 
				 
			
@@ -590,7 +598,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	top_per_worker[workerid]++;
			
 
				 
			
@@ -610,7 +618,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 
				 /* bottom save, OPENCL version */
			
 
				 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
			
 
				 {
			
 
				-	struct block_description *block = arg;
			
 
				+	struct block_description *block = (struct block_description *) arg;
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 	bottom_per_worker[workerid]++;
			
 
				 
			
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -132,9 +132,9 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
 
				 	display_memory_consumption(rank);
			
 
				 
			
 
				 	who_runs_what_len = 2*niter;
			
 
				-	who_runs_what = calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
			
 
				-	who_runs_what_index = calloc(nbz, sizeof(*who_runs_what_index));
			
 
				-	last_tick = calloc(nbz, sizeof(*last_tick));
			
 
				+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
			
 
				+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
			
 
				+	last_tick = (struct timeval *) calloc(nbz, sizeof(*last_tick));
			
 
				 }
			
 
				 
			
 
				 /*
			
--- a/examples/tag_example/tag_example.c
+++ b/examples/tag_example/tag_example.c
@@ -30,7 +30,7 @@
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 #define TAG(i, j, iter)	((starpu_tag_t) ( ((uint64_t)(iter)<<48) |  ((uint64_t)(j)<<24) | (i)) )
			
 
				 
			
 
				-starpu_codelet cl;
			
 
				+starpu_codelet cl = {};
			
 
				 
			
 
				 #define Ni	64
			
 
				 #define Nj	32
			
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -30,7 +30,7 @@
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
			
 
				 
			
 
				-starpu_codelet cl;
			
 
				+starpu_codelet cl = {};
			
 
				 
			
 
				 #define Ni	64
			
 
				 #define Nk	256
			
--- a/examples/tag_example/tag_example3.c
+++ b/examples/tag_example/tag_example3.c
@@ -30,7 +30,7 @@
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
			
 
				 
			
 
				-starpu_codelet cl;
			
 
				+starpu_codelet cl = {};
			
 
				 
			
 
				 #define Ni	64
			
 
				 #define Nk	256
			
--- a/examples/tag_example/tag_restartable.c
+++ b/examples/tag_example/tag_restartable.c
@@ -35,7 +35,7 @@
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )
			
 
				 
			
 
				-starpu_codelet cl;
			
 
				+starpu_codelet cl = {};
			
 
				 
			
 
				 #define Ni	64
			
 
				 #define Nk	256
			
@@ -134,7 +134,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
				 	FPRINTF(stderr, "ITER : %u\n", nk);
			
 
				 
			
 
				 	for (i = 0; i < Nrolls; i++) {
			
 
				-		tasks[i] = malloc(ni * sizeof(*tasks[i]));
			
 
				+		tasks[i] = (struct starpu_task **) malloc(ni * sizeof(*tasks[i]));
			
 
				 
			
 
				 		create_task_grid(i);
			
 
				 	}
			
--- a/examples/top/hello_world_top.c
+++ b/examples/top/hello_world_top.c
@@ -62,7 +62,7 @@ struct params {
 
				 };
			
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				-	struct params *params = cl_arg;
			
 
				+	struct params *params = (struct params *) cl_arg;
			
 
				 
			
 
				 	//loosing time for top example...
			
 
				 	int sum = 0;
			
@@ -82,14 +82,14 @@ void cpu_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 void callback_name_changed(starputop_param* param)
			
 
				 {
			
 
				-	char* message = malloc(256);
			
 
				+	char* message = (char *) malloc(256);
			
 
				 	sprintf(message, "Name have been changed to %s", names[name_selected]);
			
 
				 	starputop_debug_log(message);
			
 
				 }
			
 
				 
			
 
				 void callback_number_addition_changed(starputop_param* param)
			
 
				 {
			
 
				-	char* message = malloc(256);
			
 
				+	char* message = (char *) malloc(256);
			
 
				 	sprintf(message, "Number of addition is now %d", number_of_addition);
			
 
				 
			
 
				 	starputop_debug_log(message);
			
--- a/gcc-plugin/README
+++ b/gcc-plugin/README
@@ -8,3 +8,5 @@ tasks.
 
				 Plug-ins are supported starting from GCC 4.5.
			
 
				 
			
 
				 To run the test suite, GNU Guile 1.8.x or 2.0.x is needed.
			
 
				+
			
 
				+When building from SVN, GNU Bison 2.5+ is required.
			
--- a/gcc-plugin/examples/Makefile.am
+++ b/gcc-plugin/examples/Makefile.am
@@ -17,4 +17,9 @@ noinst_PROGRAMS =				\
 
				   matrix-mult
			
 
				 
			
 
				 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
			
 
				+
			
 
				+AM_CPPFLAGS =						\
			
 
				+  -I$(top_srcdir)/include				\
			
 
				+  $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
			
 
				+
			
 
				 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
			
--- a/gcc-plugin/examples/matrix-mult.c
+++ b/gcc-plugin/examples/matrix-mult.c
@@ -187,34 +187,15 @@ main (int argc, char **argv)
 
				   gettimeofday (&start_register, NULL);
			
 
				   for (i = 0; i < nslicesy; i++)
			
 
				     for (j = 0; j < nslicesz; j++)
			
 
				-      {
			
 
				-	/* TODO: Get rid of the `ptr' and `size' variables when the pragma
			
 
				-	   parser supports arbitrary C expressions.  */
			
 
				-
			
 
				-	typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
			
 
				-	size_t size = (bzdim * bydim);
			
 
				-#pragma starpu register ptr size
			
 
				-      }
			
 
				+#pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)
			
 
				 
			
 
				   for (i = 0; i < nslicesz; i++)
			
 
				-    {
			
 
				-      for (j = 0; j < nslicesx; j++)
			
 
				-	{
			
 
				-	  typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
			
 
				-	  size_t size = (bxdim * bzdim);
			
 
				-#pragma starpu register ptr size
			
 
				-	}
			
 
				-    }
			
 
				+    for (j = 0; j < nslicesx; j++)
			
 
				+#pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)
			
 
				 
			
 
				   for (i = 0; i < nslicesy; i++)
			
 
				-    {
			
 
				-      for (j = 0; j < nslicesx; j++)
			
 
				-	{
			
 
				-	  typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
			
 
				-	  size_t size = (bxdim * bydim);
			
 
				-#pragma starpu register ptr size
			
 
				-	}
			
 
				-    }
			
 
				+    for (j = 0; j < nslicesx; j++)
			
 
				+#pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)
			
 
				 
			
 
				 
			
 
				   gettimeofday (&end_register, NULL);
			
@@ -256,25 +237,16 @@ main (int argc, char **argv)
 
				   gettimeofday (&start_unregister, NULL);
			
 
				   for (i = 0; i < nslicesy; i++)
			
 
				     for (j = 0; j < nslicesz; j++)
			
 
				-      {
			
 
				-	typeof (A) ptr =  &A[i*zdim*bydim + j*bzdim*bydim];
			
 
				-#pragma starpu unregister ptr
			
 
				-      }
			
 
				+#pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]
			
 
				 
			
 
				 
			
 
				   for (i = 0; i < nslicesz; i++)
			
 
				     for (j = 0; j < nslicesx; j++)
			
 
				-      {
			
 
				-	typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
			
 
				-#pragma starpu unregister ptr
			
 
				-      }
			
 
				+#pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]
			
 
				 
			
 
				   for (i = 0; i < nslicesy; i++)
			
 
				     for (j = 0; j < nslicesx; j++)
			
 
				-      {
			
 
				-	typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
			
 
				-#pragma starpu unregister ptr
			
 
				-      }
			
 
				+#pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]
			
 
				 
			
 
				   gettimeofday (&end_unregister, NULL);
			
 
				   gettimeofday (&end_all, NULL);
			
--- a/gcc-plugin/src/Makefile.am
+++ b/gcc-plugin/src/Makefile.am
@@ -17,7 +17,10 @@
 
				 # requires a name prefixed by `lib'.
			
 
				 pkglib_LTLIBRARIES = starpu.la
			
 
				 
			
 
				-starpu_la_SOURCES = starpu.c
			
 
				+starpu_la_SOURCES = starpu.c c-expr.y
			
 
				+
			
 
				+AM_CPPFLAGS =						\
			
 
				+  -I$(top_srcdir)/include				\
			
 
				+  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1
			
 
				 
			
 
				-AM_CPPFLAGS = -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall
			
 
				 AM_LDFLAGS = -module
			
--- a/gcc-plugin/src/c-expr.y
+++ b/gcc-plugin/src/c-expr.y
@@ -0,0 +1,234 @@
 
				+/* GCC-StarPU
			
 
				+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+
			
 
				+   GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				+   it under the terms of the GNU General Public License as published by
			
 
				+   the Free Software Foundation, either version 3 of the License, or
			
 
				+   (at your option) any later version.
			
 
				+
			
 
				+   GCC-StarPU is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+   GNU General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU General Public License
			
 
				+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
			
 
				+
			
 
				+/* Parser for simple C expressions in pragmas.  */
			
 
				+
			
 
				+%define api.pure
			
 
				+%parse-param { location_t loc }
			
 
				+%parse-param { const char *pragma }
			
 
				+%parse-param { tree *seq }
			
 
				+%debug
			
 
				+
			
 
				+%{
			
 
				+  #include <starpu-gcc-config.h>
			
 
				+
			
 
				+  #include <gcc-plugin.h>
			
 
				+  #include <plugin.h>
			
 
				+  #include <tree.h>
			
 
				+  #include <cpplib.h>
			
 
				+
			
 
				+  #ifdef HAVE_C_FAMILY_C_COMMON_H
			
 
				+  # include <c-family/c-common.h>
			
 
				+  #elif HAVE_C_COMMON_H
			
 
				+  # include <c-common.h>
			
 
				+  #endif
			
 
				+
			
 
				+  #ifdef HAVE_C_FAMILY_C_PRAGMA_H
			
 
				+  # include <c-family/c-pragma.h>
			
 
				+  #elif HAVE_C_PRAGMA_H
			
 
				+  # include <c-pragma.h>
			
 
				+  #endif
			
 
				+
			
 
				+  #if !HAVE_DECL_BUILD_ARRAY_REF
			
 
				+  /* This declaration is missing in GCC 4.6.1.  */
			
 
				+  extern tree build_array_ref (location_t loc, tree array, tree index);
			
 
				+  #endif
			
 
				+
			
 
				+
			
 
				+  #define YYSTYPE tree
			
 
				+  #define YYLTYPE location_t
			
 
				+
			
 
				+  static void
			
 
				+  yyerror (location_t loc, const char *pragma, tree *seq,
			
 
				+	   char const *message)
			
 
				+  {
			
 
				+    error_at (loc, "parse error in pragma %qs: %s", pragma, message);
			
 
				+  }
			
 
				+
			
 
				+  /* Return SOMETHING if it's a VAR_DECL, an identifier bound to a VAR_DECL,
			
 
				+     or another object; raise an error otherwise.  */
			
 
				+
			
 
				+  static tree
			
 
				+  ensure_bound (location_t loc, tree something)
			
 
				+  {
			
 
				+    gcc_assert (something != NULL_TREE);
			
 
				+
			
 
				+    if (DECL_P (something))
			
 
				+      return something;
			
 
				+    else if (TREE_CODE (something) == IDENTIFIER_NODE)
			
 
				+      {
			
 
				+	tree var = lookup_name (something);
			
 
				+	if (var == NULL_TREE)
			
 
				+	  {
			
 
				+	    error_at (loc, "unbound variable %qE", something);
			
 
				+	    return error_mark_node;
			
 
				+	  }
			
 
				+	else
			
 
				+	  return var;
			
 
				+      }
			
 
				+
			
 
				+    return something;
			
 
				+  }
			
 
				+
			
 
				+  static tree
			
 
				+  build_component_ref (location_t loc, tree what, tree field)
			
 
				+  {
			
 
				+    sorry ("struct field access not implemented yet"); /* XXX */
			
 
				+    return error_mark_node;
			
 
				+  }
			
 
				+%}
			
 
				+
			
 
				+%code {
			
 
				+  /* Mapping of libcpp token names to Bison-generated token names.  This is
			
 
				+     not ideal but Bison cannot be told to use the `enum cpp_ttype'
			
 
				+     values.  */
			
 
				+  static const int cpplib_bison_token_map[] =
			
 
				+    {
			
 
				+      [CPP_NAME] = YCPP_NAME,
			
 
				+      [CPP_NUMBER] = YCPP_NUM,
			
 
				+      [CPP_AND] = YCPP_AND,
			
 
				+      [CPP_OPEN_SQUARE] = YCPP_OPEN_SQUARE,
			
 
				+      [CPP_CLOSE_SQUARE] = YCPP_CLOSE_SQUARE,
			
 
				+      [CPP_OPEN_PAREN] = YCPP_OPEN_PAREN,
			
 
				+      [CPP_CLOSE_PAREN] = YCPP_CLOSE_PAREN,
			
 
				+      [CPP_PLUS] = YCPP_PLUS,
			
 
				+      [CPP_MINUS] = YCPP_MINUS,
			
 
				+      [CPP_MULT] = YCPP_MULT,
			
 
				+      [CPP_DIV] = YCPP_DIV,
			
 
				+      [CPP_DOT] = YCPP_DOT,
			
 
				+      [CPP_DEREF] = YCPP_DEREF
			
 
				+    };
			
 
				+
			
 
				+  static int
			
 
				+  yylex (YYSTYPE *lvalp)
			
 
				+  {
			
 
				+    int ret;
			
 
				+
			
 
				+    ret = pragma_lex (lvalp);
			
 
				+    if (ret < sizeof cpplib_bison_token_map / sizeof cpplib_bison_token_map[0])
			
 
				+      ret = cpplib_bison_token_map[ret];
			
 
				+    else
			
 
				+      ret = -1;
			
 
				+
			
 
				+    return ret;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+%token YCPP_NAME "identifier"
			
 
				+%token YCPP_NUM "integer"
			
 
				+%token YCPP_AND "&"
			
 
				+%token YCPP_OPEN_SQUARE "["
			
 
				+%token YCPP_CLOSE_SQUARE "]"
			
 
				+%token YCPP_OPEN_PAREN "("
			
 
				+%token YCPP_CLOSE_PAREN ")"
			
 
				+%token YCPP_PLUS "+"
			
 
				+%token YCPP_MINUS "-"
			
 
				+%token YCPP_MULT "*"
			
 
				+%token YCPP_DIV "/"
			
 
				+%token YCPP_DOT "."
			
 
				+%token YCPP_DEREF "->"
			
 
				+
			
 
				+%% /* Grammar rules.  */
			
 
				+
			
 
				+ /* Always return a TREE_LIST rather than a raw chain, because the elements
			
 
				+    of that list may be already chained for other purposes---e.g., PARM_DECLs
			
 
				+    of a function are chained together.  */
			
 
				+
			
 
				+sequence: expression {
			
 
				+          gcc_assert (*seq == NULL_TREE);
			
 
				+	  *seq = tree_cons (NULL_TREE, $1, NULL_TREE);
			
 
				+	  $$ = *seq;
			
 
				+      }
			
 
				+      | expression sequence {
			
 
				+	  gcc_assert ($2 == *seq);
			
 
				+	  *seq = tree_cons (NULL_TREE, $1, $2);
			
 
				+	  $$ = *seq;
			
 
				+      }
			
 
				+;
			
 
				+
			
 
				+expression: identifier | binary_expression | unary_expression;
			
 
				+
			
 
				+/* XXX: `ensure_bound' below leads to errors raised even for non-significant
			
 
				+   arguments---e.g., junk after pragma.  */
			
 
				+identifier: YCPP_NAME  { $$ = ensure_bound (loc, $1); }
			
 
				+;
			
 
				+
			
 
				+binary_expression: multiplicative_expression
			
 
				+     | additive_expression
			
 
				+;
			
 
				+
			
 
				+multiplicative_expression: multiplicative_expression YCPP_MULT cast_expression {
			
 
				+       $$ = build_binary_op (UNKNOWN_LOCATION, MULT_EXPR, $1, $3, 0);
			
 
				+     }
			
 
				+     | multiplicative_expression YCPP_DIV cast_expression {
			
 
				+       $$ = build_binary_op (UNKNOWN_LOCATION, TRUNC_DIV_EXPR, $1, $3, 0);
			
 
				+     }
			
 
				+     | cast_expression
			
 
				+;
			
 
				+
			
 
				+additive_expression: multiplicative_expression
			
 
				+     | additive_expression YCPP_PLUS multiplicative_expression {
			
 
				+       $$ = build_binary_op (UNKNOWN_LOCATION, PLUS_EXPR, $1, $3, 0);
			
 
				+     }
			
 
				+     | additive_expression YCPP_MINUS multiplicative_expression {
			
 
				+       $$ = build_binary_op (UNKNOWN_LOCATION, MINUS_EXPR, $1, $3, 0);
			
 
				+     }
			
 
				+;
			
 
				+
			
 
				+cast_expression: unary_expression
			
 
				+		 /* XXX: No support for '(' TYPE-NAME ')' UNARY-EXPRESSION.  */
			
 
				+;
			
 
				+
			
 
				+unary_expression:
			
 
				+       primary_expression
			
 
				+     | postfix_expression
			
 
				+     | YCPP_AND cast_expression {
			
 
				+       $$ = build_addr (ensure_bound (loc, $2), current_function_decl);
			
 
				+     }
			
 
				+;
			
 
				+
			
 
				+postfix_expression:
			
 
				+       primary_expression
			
 
				+     | postfix_expression YCPP_OPEN_SQUARE expression YCPP_CLOSE_SQUARE {
			
 
				+#if 1
			
 
				+	 /* Build the array ref with proper error checking.  */
			
 
				+	 $$ = build_array_ref (loc, ensure_bound (loc, $1),
			
 
				+			       ensure_bound (loc, $3));
			
 
				+#else /* TIMTOWTDI */
			
 
				+	 $$ = build_indirect_ref (loc,
			
 
				+	       build_binary_op (loc, PLUS_EXPR, ensure_bound (loc, $1), ensure_bound (loc, $3), 0),
			
 
				+		RO_ARRAY_INDEXING);
			
 
				+#endif
			
 
				+     }
			
 
				+     | postfix_expression YCPP_DOT identifier {
			
 
				+        $$ = build_component_ref (loc, ensure_bound (loc, $1), $2);
			
 
				+     }
			
 
				+     | postfix_expression YCPP_DEREF identifier {
			
 
				+        $$ = build_component_ref (loc,
			
 
				+               build_indirect_ref (loc, ensure_bound (loc, $1), RO_ARRAY_INDEXING),
			
 
				+               $2);
			
 
				+     }
			
 
				+;
			
 
				+
			
 
				+primary_expression: identifier
			
 
				+     | constant
			
 
				+     | YCPP_OPEN_PAREN expression YCPP_CLOSE_PAREN { $$ = $2; }
			
 
				+;
			
 
				+
			
 
				+constant: YCPP_NUM { $$ = $1; }
			
 
				+;
			
 
				+
			
 
				+%%
			
--- a/gcc-plugin/src/starpu-gcc-config.h.in
+++ b/gcc-plugin/src/starpu-gcc-config.h.in
@@ -21,3 +21,11 @@
 
				 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_ARRAY
			
 
				 
			
 
				 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_VEC
			
 
				+
			
 
				+#undef HAVE_DECL_BUILD_ARRAY_REF
			
 
				+
			
 
				+#undef HAVE_C_FAMILY_C_COMMON_H
			
 
				+#undef HAVE_C_COMMON_H
			
 
				+
			
 
				+#undef HAVE_C_FAMILY_C_PRAGMA_H
			
 
				+#undef HAVE_C_PRAGMA_H
			
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
@@ -30,8 +30,19 @@ int plugin_is_GPL_compatible;
 
				 #include <cpplib.h>
			
 
				 #include <tree.h>
			
 
				 #include <tree-iterator.h>
			
 
				-#include <c-common.h>
			
 
				-#include <c-pragma.h>
			
 
				+
			
 
				+#ifdef HAVE_C_FAMILY_C_COMMON_H
			
 
				+# include <c-family/c-common.h>
			
 
				+#elif HAVE_C_COMMON_H
			
 
				+# include <c-common.h>
			
 
				+#endif
			
 
				+
			
 
				+#ifdef HAVE_C_FAMILY_C_PRAGMA_H
			
 
				+# include <c-family/c-pragma.h>
			
 
				+#elif HAVE_C_PRAGMA_H
			
 
				+# include <c-pragma.h>
			
 
				+#endif
			
 
				+
			
 
				 #include <tm.h>
			
 
				 #include <gimple.h>
			
 
				 #include <tree-pass.h>
			
@@ -75,6 +86,9 @@ static tree build_codelet_declaration (tree task_decl);
 
				 static tree build_task_body (const_tree task_decl);
			
 
				 static tree build_pointer_lookup (tree pointer);
			
 
				 
			
 
				+static bool task_p (const_tree decl);
			
 
				+static bool task_implementation_p (const_tree decl);
			
 
				+
			
 
				 
			
 
				 /* Lookup the StarPU function NAME in the global scope and store the result
			
 
				    in VAR (this can't be done from `lower_starpu'.)  */
			
@@ -119,7 +133,9 @@ build_call_expr_loc_vec (location_t loc, tree fndecl, VEC(tree,gc) *vec)
 
				 
			
 
				 
			
 
				 /* Build a reference to the INDEXth element of ARRAY.  `build_array_ref' is
			
 
				-   not exported, so we roll our own.  */
			
 
				+   not exported, so we roll our own.
			
 
				+   FIXME: This version may not work for array types and doesn't do as much
			
 
				+   type-checking as `build_array_ref'.  */
			
 
				 
			
 
				 static tree
			
 
				 array_ref (tree array, size_t index)
			
@@ -347,48 +363,45 @@ handle_pragma_shutdown (struct cpp_reader *reader)
 
				 static void
			
 
				 handle_pragma_wait (struct cpp_reader *reader)
			
 
				 {
			
 
				-  tree fndecl;
			
 
				+  if (task_implementation_p (current_function_decl))
			
 
				+    {
			
 
				+      location_t loc;
			
 
				 
			
 
				-  fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
			
 
				-  gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
			
 
				+      loc = cpp_peek_token (reader, 0)->src_loc;
			
 
				+
			
 
				+      /* TODO: In the future we could generate a task for the continuation
			
 
				+	 and have it depend on what's before here.  */
			
 
				+      error_at (loc, "task implementation is not allowed to wait");
			
 
				+    }
			
 
				+  else
			
 
				+    {
			
 
				+      tree fndecl;
			
 
				+
			
 
				+      fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
			
 
				+      gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
			
 
				 
			
 
				-  add_stmt (build_call_expr (fndecl, 0));
			
 
				+      add_stmt (build_call_expr (fndecl, 0));
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				-/* Parse a pointer variable for PRAGMA, raising the appropriate error if
			
 
				-   needed.  Return the pointer variable on success, NULL_TREE otherwise.  */
			
 
				+/* The minimal C expression parser.  */
			
 
				+
			
 
				+extern int yyparse (location_t, const char *, tree *);
			
 
				+extern int yydebug;
			
 
				+
			
 
				+/* Parse expressions from the CPP reader for PRAGMA, which is located at LOC.
			
 
				+   Return a TREE_LIST of C expressions.  */
			
 
				 
			
 
				 static tree
			
 
				-read_pragma_pointer_variable (const char *pragma, location_t loc)
			
 
				+read_pragma_expressions (const char *pragma, location_t loc)
			
 
				 {
			
 
				-  tree token, var = NULL_TREE;
			
 
				-  enum cpp_ttype type;
			
 
				-
			
 
				-  type = pragma_lex (&token);
			
 
				-  if (type == CPP_EOF)
			
 
				-    error_at (loc, "unterminated %<starpu %s%> pragma", pragma);
			
 
				-  else if (type != CPP_NAME)
			
 
				-    error_at (loc, "identifier expected");
			
 
				-  else
			
 
				-    {
			
 
				-      /* Get the variable name.  */
			
 
				-      tree var_name = token;
			
 
				-      tree decl = lookup_name (var_name);
			
 
				-
			
 
				-      if (decl == NULL_TREE || !DECL_P (decl))
			
 
				-	error_at (loc, "unbound variable %qE", var_name);
			
 
				-      else if (!POINTER_TYPE_P (TREE_TYPE (decl))
			
 
				-	       && TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
			
 
				-	error_at (loc, "%qE is neither a pointer nor an array", var_name);
			
 
				-      else
			
 
				-	{
			
 
				-	  var = decl;
			
 
				-	  TREE_USED (var) = true;
			
 
				-	  DECL_READ_P (var) = true;
			
 
				-	}
			
 
				-    }
			
 
				+  tree expr = NULL_TREE;
			
 
				+
			
 
				+  if (yyparse (loc, pragma, &expr))
			
 
				+    /* Parse error or memory exhaustion.  */
			
 
				+    expr = NULL_TREE;
			
 
				 
			
 
				-  return var;
			
 
				+  return expr;
			
 
				 }
			
 
				 
			
 
				 /* Process `#pragma starpu register VAR [COUNT]' and emit the corresponding
			
@@ -397,19 +410,37 @@ read_pragma_pointer_variable (const char *pragma, location_t loc)
 
				 static void
			
 
				 handle_pragma_register (struct cpp_reader *reader)
			
 
				 {
			
 
				-  tree token, var;
			
 
				+  tree args, ptr, count_arg;
			
 
				   location_t loc;
			
 
				-  enum cpp_ttype type;
			
 
				 
			
 
				   loc = cpp_peek_token (reader, 0)->src_loc;
			
 
				 
			
 
				-  var = read_pragma_pointer_variable ("register", loc);
			
 
				-  if (var == NULL_TREE)
			
 
				+  args = read_pragma_expressions ("register", loc);
			
 
				+  if (args == NULL_TREE)
			
 
				+    /* Parse error, presumably already handled by the parser.  */
			
 
				     return;
			
 
				 
			
 
				-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE
			
 
				-      && !DECL_EXTERNAL (var)
			
 
				-      && !TREE_STATIC (var)
			
 
				+  /* First argument should be a pointer expression.  */
			
 
				+  ptr = TREE_VALUE (args);
			
 
				+  args = TREE_CHAIN (args);
			
 
				+
			
 
				+  if (ptr == error_mark_node)
			
 
				+    return;
			
 
				+
			
 
				+  if (!POINTER_TYPE_P (TREE_TYPE (ptr))
			
 
				+      && TREE_CODE (TREE_TYPE (ptr)) != ARRAY_TYPE)
			
 
				+    {
			
 
				+      error_at (loc, "%qE is neither a pointer nor an array", ptr);
			
 
				+      return;
			
 
				+    }
			
 
				+
			
 
				+  TREE_USED (ptr) = true;
			
 
				+  if (DECL_P (ptr))
			
 
				+    DECL_READ_P (ptr) = true;
			
 
				+
			
 
				+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE
			
 
				+      && !DECL_EXTERNAL (ptr)
			
 
				+      && !TREE_STATIC (ptr)
			
 
				       && !MAIN_NAME_P (DECL_NAME (current_function_decl)))
			
 
				     warning_at (loc, 0, "using an on-stack array as a task input "
			
 
				 		"considered unsafe");
			
@@ -417,9 +448,9 @@ handle_pragma_register (struct cpp_reader *reader)
 
				   /* Determine the number of elements in the vector.  */
			
 
				   tree count = NULL_TREE;
			
 
				 
			
 
				-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE)
			
 
				+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE)
			
 
				     {
			
 
				-      tree domain = TYPE_DOMAIN (TREE_TYPE (var));
			
 
				+      tree domain = TYPE_DOMAIN (TREE_TYPE (ptr));
			
 
				 
			
 
				       if (domain != NULL_TREE)
			
 
				 	{
			
@@ -435,52 +466,44 @@ handle_pragma_register (struct cpp_reader *reader)
 
				 	}
			
 
				     }
			
 
				 
			
 
				+  /* Second argument is optional but should be an integer.  */
			
 
				+  count_arg = (args == NULL_TREE) ? NULL_TREE : TREE_VALUE (args);
			
 
				+  if (args != NULL_TREE)
			
 
				+    {
			
 
				+      args = TREE_CHAIN (args);
			
 
				+      TREE_CHAIN (count_arg) = NULL_TREE;
			
 
				+    }
			
 
				 
			
 
				-  type = pragma_lex (&token);
			
 
				-  if (type == CPP_EOF)
			
 
				+  if (count_arg == NULL_TREE)
			
 
				     {
			
 
				-      /* End of line reached: don't consume TOKEN and check whether the array
			
 
				-	 size was determined.  */
			
 
				+      /* End of line reached: check whether the array size was
			
 
				+	 determined.  */
			
 
				       if (count == NULL_TREE)
			
 
				 	{
			
 
				-	  error_at (loc, "cannot determine size of array %qE", DECL_NAME (var));
			
 
				+	  error_at (loc, "cannot determine size of array %qE", ptr);
			
 
				 	  return;
			
 
				 	}
			
 
				     }
			
 
				+  else if (count_arg == error_mark_node)
			
 
				+    /* COUNT_ARG could not be parsed and an error was already reported.  */
			
 
				+    return;
			
 
				+  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
			
 
				+    {
			
 
				+      error_at (loc, "%qE is not an integer", count_arg);
			
 
				+      return;
			
 
				+    }
			
 
				   else
			
 
				     {
			
 
				-      /* TOKEN may be a number or a integer variable.  */
			
 
				-
			
 
				-      tree count_arg;
			
 
				-
			
 
				-      if (TREE_CODE (token) == IDENTIFIER_NODE)
			
 
				-	{
			
 
				-	  count_arg = lookup_name (token);
			
 
				-	  if (count_arg == NULL_TREE)
			
 
				-	    {
			
 
				-	      error_at (loc, "unbound variable %qE", token);
			
 
				-	      return;
			
 
				-	    }
			
 
				-	  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
			
 
				-	    {
			
 
				-	      error_at (loc, "integer expected");
			
 
				-	      return;
			
 
				-	    }
			
 
				-
			
 
				-	  TREE_USED (count_arg) = true;
			
 
				-	  DECL_READ_P (count_arg) = true;
			
 
				-	}
			
 
				-      else if (TREE_CODE (token) != INTEGER_CST)
			
 
				-	error_at (loc, "integer expected");
			
 
				-      else
			
 
				-	count_arg = token;
			
 
				+      TREE_USED (count_arg) = true;
			
 
				+      if (DECL_P (count_arg))
			
 
				+	DECL_READ_P (count_arg) = true;
			
 
				 
			
 
				       if (count != NULL_TREE)
			
 
				 	{
			
 
				 	  /* The number of elements of this array was already determined.  */
			
 
				 	  inform (loc,
			
 
				 		  "element count can be omitted for bounded array %qE",
			
 
				-		  DECL_NAME (var));
			
 
				+		  ptr);
			
 
				 
			
 
				 	  if (count_arg != NULL_TREE)
			
 
				 	    {
			
@@ -489,7 +512,7 @@ handle_pragma_register (struct cpp_reader *reader)
 
				 		  if (!tree_int_cst_equal (count, count_arg))
			
 
				 		    error_at (loc, "specified element count differs "
			
 
				 			      "from actual size of array %qE",
			
 
				-			      DECL_NAME (var));
			
 
				+			      ptr);
			
 
				 		}
			
 
				 	      else
			
 
				 		/* Using a variable to determine the array size whereas the
			
@@ -501,19 +524,24 @@ handle_pragma_register (struct cpp_reader *reader)
 
				 	}
			
 
				       else
			
 
				 	count = count_arg;
			
 
				-
			
 
				-      if (pragma_lex (&token) != CPP_EOF)
			
 
				-	error_at (loc, "junk after %<starpu register%> pragma");
			
 
				     }
			
 
				 
			
 
				-  /* If VAR is an array, take its address.  */
			
 
				+  /* Any remaining args?  */
			
 
				+  if (args != NULL_TREE)
			
 
				+    error_at (loc, "junk after %<starpu register%> pragma");
			
 
				+
			
 
				+  /* If PTR is an array, take its address.  */
			
 
				   tree pointer =
			
 
				-    POINTER_TYPE_P (TREE_TYPE (var))
			
 
				-    ? var
			
 
				-    : build_addr (var, current_function_decl);
			
 
				+    POINTER_TYPE_P (TREE_TYPE (ptr))
			
 
				+    ? ptr
			
 
				+    : build_addr (ptr, current_function_decl);
			
 
				 
			
 
				   /* Introduce a local variable to hold the handle.  */
			
 
				-  tree handle_var = create_tmp_var (ptr_type_node, ".handle");
			
 
				+  tree handle_var = build_decl (loc, VAR_DECL, create_tmp_var_name (".handle"),
			
 
				+				ptr_type_node);
			
 
				+  DECL_CONTEXT (handle_var) = current_function_decl;
			
 
				+  DECL_ARTIFICIAL (handle_var) = true;
			
 
				+  DECL_INITIAL (handle_var) = NULL_TREE;
			
 
				 
			
 
				   tree register_fn =
			
 
				     lookup_name (get_identifier ("starpu_vector_data_register"));
			
@@ -525,9 +553,13 @@ handle_pragma_register (struct cpp_reader *reader)
 
				 		     build_addr (handle_var, current_function_decl),
			
 
				 		     build_zero_cst (uintptr_type_node), /* home node */
			
 
				 		     pointer, count,
			
 
				-		     size_in_bytes (TREE_TYPE (TREE_TYPE (var))));
			
 
				+		     size_in_bytes (TREE_TYPE (TREE_TYPE (ptr))));
			
 
				 
			
 
				-  add_stmt (call);
			
 
				+  tree bind;
			
 
				+  bind = build3 (BIND_EXPR, void_type_node, handle_var, call,
			
 
				+		 NULL_TREE);
			
 
				+
			
 
				+  add_stmt (bind);
			
 
				 }
			
 
				 
			
 
				 /* Process `#pragma starpu acquire VAR' and emit the corresponding
			
@@ -539,16 +571,26 @@ handle_pragma_acquire (struct cpp_reader *reader)
 
				   static tree acquire_fn;
			
 
				   LOOKUP_STARPU_FUNCTION (acquire_fn, "starpu_data_acquire");
			
 
				 
			
 
				-  tree token, var;
			
 
				+  tree args, var;
			
 
				   location_t loc;
			
 
				 
			
 
				   loc = cpp_peek_token (reader, 0)->src_loc;
			
 
				 
			
 
				-  var = read_pragma_pointer_variable ("acquire", loc);
			
 
				-  if (var == NULL_TREE)
			
 
				+  args = read_pragma_expressions ("acquire", loc);
			
 
				+  if (args == NULL_TREE)
			
 
				     return;
			
 
				 
			
 
				-  if (pragma_lex (&token) != CPP_EOF)
			
 
				+  var = TREE_VALUE (args);
			
 
				+
			
 
				+  if (var == error_mark_node)
			
 
				+    return;
			
 
				+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
			
 
				+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
			
 
				+    {
			
 
				+      error_at (loc, "%qE is neither a pointer nor an array", var);
			
 
				+      return;
			
 
				+    }
			
 
				+  else if (TREE_CHAIN (var) != NULL_TREE)
			
 
				     error_at (loc, "junk after %<starpu acquire%> pragma");
			
 
				 
			
 
				   /* If VAR is an array, take its address.  */
			
@@ -573,16 +615,26 @@ handle_pragma_unregister (struct cpp_reader *reader)
 
				   static tree unregister_fn;
			
 
				   LOOKUP_STARPU_FUNCTION (unregister_fn, "starpu_data_unregister");
			
 
				 
			
 
				-  tree token, var;
			
 
				+  tree args, var;
			
 
				   location_t loc;
			
 
				 
			
 
				   loc = cpp_peek_token (reader, 0)->src_loc;
			
 
				 
			
 
				-  var = read_pragma_pointer_variable ("unregister", loc);
			
 
				-  if (var == NULL_TREE)
			
 
				+  args = read_pragma_expressions ("unregister", loc);
			
 
				+  if (args == NULL_TREE)
			
 
				     return;
			
 
				 
			
 
				-  if (pragma_lex (&token) != CPP_EOF)
			
 
				+  var = TREE_VALUE (args);
			
 
				+
			
 
				+  if (var == error_mark_node)
			
 
				+    return;
			
 
				+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
			
 
				+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
			
 
				+    {
			
 
				+      error_at (loc, "%qE is neither a pointer nor an array", var);
			
 
				+      return;
			
 
				+    }
			
 
				+  else if (TREE_CHAIN (args) != NULL_TREE)
			
 
				     error_at (loc, "junk after %<starpu unregister%> pragma");
			
 
				 
			
 
				   /* If VAR is an array, take its address.  */
			
--- a/gcc-plugin/tests/Makefile.am
+++ b/gcc-plugin/tests/Makefile.am
@@ -28,6 +28,7 @@ gcc_tests =					\
 
				   pointer-tasks.c				\
			
 
				   no-initialize.c				\
			
 
				   lib-user.c					\
			
 
				+  wait-errors.c					\
			
 
				   shutdown-errors.c
			
 
				 
			
 
				 dist_noinst_HEADERS = mocks.h
			
--- a/gcc-plugin/tests/acquire-errors.c
+++ b/gcc-plugin/tests/acquire-errors.c
@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
				 
			
 
				 #pragma starpu register x
			
 
				 
			
 
				-#pragma starpu acquire /* (error "unterminated") */
			
 
				-#pragma starpu acquire 123 /* (error "identifier expected") */
			
 
				+#pragma starpu acquire /* (error "parse error") */
			
 
				+#pragma starpu acquire 123 /* (error "neither a pointer nor an array") */
			
 
				 #pragma starpu acquire does_not_exit /* (error "unbound variable") */
			
 
				 
			
 
				 #pragma starpu acquire argc /* (error "neither a pointer nor an array") */
			
--- a/gcc-plugin/tests/lib-user.c
+++ b/gcc-plugin/tests/lib-user.c
@@ -28,7 +28,10 @@ main (int argc, char *argv[])
 
				 {
			
 
				 #pragma starpu initialize
			
 
				 
			
 
				-  static const char x[] = { 0, 1, 2, 3, 4, 5 };
			
 
				+  /* Align X so that the assumptions behind `dummy_pointer_to_handle'
			
 
				+     hold.  */
			
 
				+  static const char x[] __attribute__ ((aligned (8))) = { 0, 1, 2, 3, 4, 5 };
			
 
				+
			
 
				   float y[sizeof x];
			
 
				 
			
 
				   static const char forty_two = 42;
			
--- a/gcc-plugin/tests/register-errors.c
+++ b/gcc-plugin/tests/register-errors.c
@@ -23,9 +23,9 @@ main (int argc, char *argv[])
 
				 {
			
 
				 #pragma starpu initialize
			
 
				 
			
 
				-#pragma starpu register /* (error "unterminated") */
			
 
				+#pragma starpu register /* (error "parse error") */
			
 
				 
			
 
				-#pragma starpu register argv 234 junk right here /* (error "junk after") */
			
 
				+#pragma starpu register argv 234 junk here /* (error "junk after") *//* (error "unbound") *//* (error "unbound") */
			
 
				 
			
 
				   static int x[123] __attribute__ ((unused));
			
 
				 #pragma starpu register x 234 /* (note "can be omitted") *//* (error "differs from actual size") */
			
@@ -37,9 +37,15 @@ main (int argc, char *argv[])
 
				 #pragma starpu register argv does_not_exit /* (error "unbound variable") */
			
 
				 
			
 
				 #pragma starpu register argv /* (error "cannot determine size") */
			
 
				+#pragma starpu register &argv[2] /* (error "cannot determine size") */
			
 
				+#pragma starpu register &x[2] /* (error "cannot determine size") */
			
 
				 
			
 
				 #pragma starpu register argc /* (error "neither a pointer nor an array") */
			
 
				 
			
 
				+#pragma starpu register argv[2][3] 3 /* (error "neither a pointer nor an array") */
			
 
				+
			
 
				+#pragma starpu register argv[does_not_exist] 3 /* (error "unbound variable") */
			
 
				+
			
 
				   char **p = argv;
			
 
				   size_t ps = argc;
			
 
				 #pragma starpu register p ps  /* No unused variable warning, please! */
			
--- a/gcc-plugin/tests/register.c
+++ b/gcc-plugin/tests/register.c
@@ -31,6 +31,15 @@ foo (void)
 
				 #pragma starpu register x /* (warning "considered unsafe") */
			
 
				 }
			
 
				 
			
 
				+static void
			
 
				+bar (float *p, int s)
			
 
				+{
			
 
				+  expected_register_arguments.pointer = p;
			
 
				+  expected_register_arguments.elements = s;
			
 
				+  expected_register_arguments.element_size = sizeof *p;
			
 
				+#pragma starpu register p s
			
 
				+}
			
 
				+
			
 
				 int
			
 
				 main (int argc, char *argv[])
			
 
				 {
			
@@ -39,6 +48,7 @@ main (int argc, char *argv[])
 
				   int x[123];
			
 
				   double *y;
			
 
				   static char z[345];
			
 
				+  static float m[7][42];
			
 
				   short w[] = { 1, 2, 3 };
			
 
				   size_t y_size = 234;
			
 
				 
			
@@ -84,8 +94,36 @@ main (int argc, char *argv[])
 
				 #undef N
			
 
				 
			
 
				   foo ();
			
 
				+  bar ((float *) argv, argc);
			
 
				+
			
 
				+  expected_register_arguments.pointer = argv;
			
 
				+  expected_register_arguments.elements = argc;
			
 
				+  expected_register_arguments.element_size = sizeof argv[0];
			
 
				+
			
 
				+  int chbouib = argc;
			
 
				+#pragma starpu register argv chbouib
			
 
				+
			
 
				+  expected_register_arguments.pointer = &argv[2];
			
 
				+  expected_register_arguments.elements = 3;
			
 
				+  expected_register_arguments.element_size = sizeof argv[0];
			
 
				+#pragma starpu register &argv[2] 3
			
 
				+
			
 
				+  expected_register_arguments.pointer = &argv[argc + 3 / 2];
			
 
				+  expected_register_arguments.elements = argc * 4;
			
 
				+  expected_register_arguments.element_size = sizeof argv[0];
			
 
				+#pragma starpu register &argv[argc + 3 / 2] (argc * 4)
			
 
				+
			
 
				+  expected_register_arguments.pointer = &y[y_size / 2];
			
 
				+  expected_register_arguments.elements = (y_size / 2 - 7);
			
 
				+  expected_register_arguments.element_size = sizeof y[0];
			
 
				+#pragma starpu register &y[y_size / 2] (y_size / 2 - 7)
			
 
				+
			
 
				+  expected_register_arguments.pointer = m[6];
			
 
				+  expected_register_arguments.elements = 42;
			
 
				+  expected_register_arguments.element_size = sizeof m[0][0];
			
 
				+#pragma starpu register m[6]
			
 
				 
			
 
				-  assert (data_register_calls == 8);
			
 
				+  assert (data_register_calls == 14);
			
 
				 
			
 
				   free (y);
			
 
				 
			
--- a/gcc-plugin/tests/run-test.in
+++ b/gcc-plugin/tests/run-test.in
@@ -85,11 +85,11 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
 
				     ;; the real file name.
			
 
				     ,(string-append "-fplugin=" %builddir "/../src/.libs/starpu.so")
			
 
				 
			
 
				-    "-g" "-O2"
			
 
				+    "-g"
			
 
				     "-fdump-tree-gimple" "-Wall"))
			
 
				 
			
 
				 (define %default-ldflags
			
 
				-  `(,(string-append "-L" %srcdir "/../../src")))
			
 
				+  `(,(string-append "-L" %builddir "/../../src")))
			
 
				 
			
 
				 (define %libtool
			
 
				   (string-append %builddir "/../../libtool"))
			
@@ -105,7 +105,7 @@ compiler status and the list of lines printed on stdout/stderr."
 
				          (mode     (if compile?
			
 
				                        "compile"
			
 
				                        "link"))
			
 
				-         (command  (format #f "LANG=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
			
 
				+         (command  (format #f "LC_ALL=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
			
 
				                            %libtool mode cc cflags file ldflags))
			
 
				          (pipe     (begin
			
 
				                      (log "running `~a'" command)
			
@@ -311,7 +311,7 @@ otherwise."
 
				                 file (length dependencies) dependencies))
			
 
				 
			
 
				     (and (every (cut compile/match <> cc cflags ldflags)
			
 
				-                dependencies)
			
 
				+                (map (cut string-append %srcdir "/" <>) dependencies))
			
 
				          (let*-values (((goal)
			
 
				                         (if error-expected?
			
 
				                             'compile
			
@@ -421,6 +421,12 @@ otherwise."
 
				 ;;;
			
 
				 
			
 
				 (define (build/run . file)
			
 
				-  (exit (every (cut compile/match <> %gcc %default-cflags %default-ldflags) file)))
			
 
				+  (exit (every (lambda (file)
			
 
				+                 ;; For each file, check that everything works both with and
			
 
				+                 ;; without optimizations.
			
 
				+                 (every (cut compile/match file %gcc <> %default-ldflags)
			
 
				+                        `((,"-O0" ,@%default-cflags)
			
 
				+                          (,"-O2" ,@%default-cflags))))
			
 
				+               file)))
			
 
				 
			
 
				 ;;; run-test.in ends here
			
--- a/gcc-plugin/tests/unregister-errors.c
+++ b/gcc-plugin/tests/unregister-errors.c
@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
				 
			
 
				 #pragma starpu register x
			
 
				 
			
 
				-#pragma starpu unregister /* (error "unterminated") */
			
 
				-#pragma starpu unregister 123 /* (error "identifier expected") */
			
 
				+#pragma starpu unregister /* (error "parse error") */
			
 
				+#pragma starpu unregister 123 /* (error "neither a pointer nor an array") */
			
 
				 #pragma starpu unregister does_not_exit /* (error "unbound variable") */
			
 
				 
			
 
				 #pragma starpu unregister argc /* (error "neither a pointer nor an array") */
			
--- a/gcc-plugin/tests/wait-errors.c
+++ b/gcc-plugin/tests/wait-errors.c
@@ -0,0 +1,27 @@
 
				+/* GCC-StarPU
			
 
				+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+
			
 
				+   GCC-StarPU is free software: you can redistribute it and/or modify
			
 
				+   it under the terms of the GNU General Public License as published by
			
 
				+   the Free Software Foundation, either version 3 of the License, or
			
 
				+   (at your option) any later version.
			
 
				+
			
 
				+   GCC-StarPU is distributed in the hope that it will be useful,
			
 
				+   but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+   GNU General Public License for more details.
			
 
				+
			
 
				+   You should have received a copy of the GNU General Public License
			
 
				+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
			
 
				+
			
 
				+/* The task under test.  */
			
 
				+
			
 
				+void task (int x, char y, int z) __attribute__ ((task));
			
 
				+static void task_cpu (int x, char y, int z)
			
 
				+  __attribute__ ((task_implementation ("cpu", task)));
			
 
				+
			
 
				+static void
			
 
				+task_cpu (int x, char y, int z)
			
 
				+{
			
 
				+#pragma starpu wait /* (error "not allowed") */
			
 
				+}
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -46,6 +46,7 @@
 
				 #undef STARPU_MAXCUDADEVS
			
 
				 #undef STARPU_MAXOPENCLDEVS
			
 
				 #undef STARPU_NMAXWORKERS
			
 
				+#undef STARPU_MAXIMPLEMENTATIONS
			
 
				 
			
 
				 #undef STARPU_HAVE_LIBNUMA
			
 
				 
			
--- a/include/starpu_data_filters.h
+++ b/include/starpu_data_filters.h
@@ -19,6 +19,8 @@
 
				 #ifndef __STARPU_DATA_FILTERS_H__
			
 
				 #define __STARPU_DATA_FILTERS_H__
			
 
				 
			
 
				+#include <stdarg.h>
			
 
				+
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_config.h>
			
 
				 
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -104,7 +105,7 @@ struct starpu_perfmodel_t {
 
				 	double (*cost_model)(struct starpu_buffer_descr_t *);
			
 
				 
			
 
				 	/* per-architecture model */
			
 
				-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS];
			
 
				+	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
			
 
				 	const char *symbol;
			
@@ -126,12 +127,12 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
 
				  * performance model files */
			
 
				 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
			
 
				 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
			
 
				-		enum starpu_perf_archtype arch, char *path, size_t maxlen);
			
 
				-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,
			
 
				-		char *archname, size_t maxlen);
			
 
				+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
			
 
				+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
			
 
				 int starpu_list_models(void);
			
 
				 
			
 
				 void starpu_force_bus_sampling(void);
			
 
				+void starpu_print_bus_bandwidth(FILE *f);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -127,7 +128,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 
				 #endif
			
 
				 
			
 
				 /* Check if the worker specified by workerid can execute the codelet. */
			
 
				-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
			
 
				+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
			
 
				 
			
 
				 /* The scheduling policy may put tasks directly into a worker's local queue so
			
 
				  * that it is not always necessary to create its own queue when the local queue
			
@@ -147,6 +148,7 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back);
 
				 /* By convention, the default priority level should be 0 so that we can
			
 
				  * statically allocate tasks with a default priority. */
			
 
				 #define STARPU_DEFAULT_PRIO	0
			
 
				+
			
 
				 int starpu_sched_get_min_priority(void);
			
 
				 int starpu_sched_get_max_priority(void);
			
 
				 
			
@@ -164,7 +166,7 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 
				 /* Get the description of a combined worker */
			
 
				 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
			
 
				 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
			
 
				-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
			
 
				+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
			
 
				 
			
 
				 /*
			
 
				  *	Data prefetching
			
@@ -182,7 +184,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
 
				 /* Return the current date */
			
 
				 double starpu_timing_now(void);
			
 
				 /* Returns expected task duration in µs */
			
 
				-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch);
			
 
				+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 /* Returns an estimated speedup factor relative to CPU speed */
			
 
				 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
			
 
				 /* Returns expected data transfer time in µs */
			
@@ -190,7 +192,7 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 
				 /* Predict the transfer time (in µs) to move a handle to a memory node */
			
 
				 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
			
 
				 /* Returns expected power consumption in J */
			
 
				-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch);
			
 
				+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 
			
 
				 /* Waits until all the tasks of a worker, already submitted, have been executed */
			
 
				 int starpu_wait_for_all_tasks_of_worker(int workerid);
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -23,7 +24,7 @@
 
				 #include <starpu_config.h>
			
 
				 
			
 
				 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
			
 
				-#include <cuda.h>
			
 
				+# include <cuda.h>
			
 
				 #endif
			
 
				 
			
 
				 #include <starpu_data.h>
			
@@ -56,6 +57,18 @@ extern "C" {
 
				 
			
 
				 typedef uint64_t starpu_tag_t;
			
 
				 
			
 
				+
			
 
				+typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
			
 
				+typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
			
 
				+typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
			
 
				+typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
			
 
				+
			
 
				+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    (starpu_cpu_func_t) -1
			
 
				+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   (starpu_cuda_func_t) -1
			
 
				+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS (starpu_opencl_func_t) -1
			
 
				+#define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
			
 
				+
			
 
				+
			
 
				 /*
			
 
				  * A codelet describes the various function 
			
 
				  * that may be called from a worker
			
@@ -72,6 +85,11 @@ typedef struct starpu_codelet_t {
 
				 	void (*opencl_func)(void **, void *);
			
 
				 	uint8_t gordon_func;
			
 
				 
			
 
				+	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_gordon_func_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+
			
 
				 	/* how many buffers do the codelet takes as argument ? */
			
 
				 	unsigned nbuffers;
			
 
				 
			
--- a/include/starpu_task_bundle.h
+++ b/include/starpu_task_bundle.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -71,10 +72,10 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 
				 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
			
 
				 
			
 
				 /* Return the expected duration of the entire task bundle in µs. */
			
 
				-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch);
			
 
				+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 /* Return the time (in µs) expected to transfer all data used within the bundle */
			
 
				 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
			
 
				 /* Return the expected power consumption of the entire task bundle in J. */
			
 
				-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch);
			
 
				+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 
			
 
				 #endif // __STARPU_TASK_BUNDLE_H__
			
--- a/m4/.gitignore
+++ b/m4/.gitignore
@@ -0,0 +1,2 @@
 
				+/libtool.m4
			
 
				+/lt*.m4
			
--- a/m4/gcc.m4
+++ b/m4/gcc.m4
@@ -51,8 +51,16 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
 
				     dnl Reason:
			
 
				     dnl   build_call_expr_loc_array -- not in GCC 4.5.x; appears in 4.6
			
 
				     dnl   build_call_expr_loc_vec   -- likewise
			
 
				+    dnl   build_array_ref           -- present but undeclared in 4.6.1
			
 
				     _STARPU_WITH_GCC_PLUGIN_API([
			
 
				-      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec],
			
 
				+      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
			
 
				+                      build_array_ref],
			
 
				+        [], [], [#include <gcc-plugin.h>
			
 
				+	         #include <tree.h>])
			
 
				+
			
 
				+      dnl Work around header naming issues introduced upstream and in Debian
			
 
				+      dnl (see <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=631082>).
			
 
				+      AC_CHECK_HEADERS([c-common.h c-pragma.h c-family/c-common.h c-family/c-pragma.h],
			
 
				         [], [], [#include <gcc-plugin.h>
			
 
				 	         #include <tree.h>])
			
 
				     ])
			
--- a/mpi/.gitignore
+++ b/mpi/.gitignore
@@ -0,0 +1 @@
 
				+/.deps
			
--- a/mpi/tests/.gitignore
+++ b/mpi/tests/.gitignore
@@ -0,0 +1 @@
 
				+/.deps
			
--- a/socl/src/Makefile.am
+++ b/socl/src/Makefile.am
@@ -24,16 +24,18 @@ SUBDIRS =
 
				 lib_LTLIBRARIES = libsocl.la
			
 
				 
			
 
				 libsocl_la_SOURCES = 						\
			
 
				-  socl.c \
			
 
				-  gc.c \
			
 
				-  graph.c \
			
 
				+  command.c \
			
 
				+  command_list.c \
			
 
				+  command_queue.c \
			
 
				+  debug.c \
			
 
				+  devices.c \
			
 
				   event.c \
			
 
				+  gc.c \
			
 
				   init.c \
			
 
				-  task.c \
			
 
				-  command_queue.c \
			
 
				   mem_objects.c \
			
 
				+  socl.c \
			
 
				+  task.c \
			
 
				   util.c \
			
 
				-  devices.c \
			
 
				   cl_getplatformids.c \
			
 
				   cl_getplatforminfo.c \
			
 
				   cl_getdeviceids.c \
			
--- a/socl/src/cl_createcommandqueue.c
+++ b/socl/src/cl_createcommandqueue.c
@@ -30,8 +30,7 @@ static void release_callback_command_queue(void * e) {
 
				   gc_entity_unstore(&cq->context);
			
 
				 
			
 
				   /* Destruct object */
			
 
				-  pthread_spin_destroy(&cq->spin);
			
 
				-  free(cq->events);
			
 
				+  pthread_mutex_destroy(&cq->mutex);
			
 
				 }
			
 
				 
			
 
				 
			
@@ -68,9 +67,9 @@ soclCreateCommandQueue(cl_context                   context,
 
				       profiling_queue_count += 1;
			
 
				    }
			
 
				 
			
 
				-   cq->events = NULL;
			
 
				+   cq->commands = NULL;
			
 
				    cq->barrier = NULL;
			
 
				-   pthread_spin_init(&cq->spin, 0);
			
 
				+   pthread_mutex_init(&cq->mutex, NULL);
			
 
				 
			
 
				    if (errcode_ret != NULL)
			
 
				       *errcode_ret = CL_SUCCESS;
			
--- a/socl/src/cl_createkernel.c
+++ b/socl/src/cl_createkernel.c
@@ -32,23 +32,23 @@ static void soclCreateKernel_task(void *data) {
 
				    }
			
 
				 
			
 
				    /* One worker creates argument structures */
			
 
				-   if (__sync_bool_compare_and_swap(&k->arg_count, 0, 666)) {
			
 
				+   if (__sync_bool_compare_and_swap(&k->num_args, 0, 666)) {
			
 
				       unsigned int i;
			
 
				-      cl_uint arg_count;
			
 
				+      cl_uint num_args;
			
 
				 
			
 
				-      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(arg_count), &arg_count, NULL);
			
 
				+      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
			
 
				       if (err != CL_SUCCESS) {
			
 
				          DEBUG_CL("clGetKernelInfo", err);
			
 
				          ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
			
 
				       }
			
 
				-      k->arg_count = arg_count;
			
 
				-      DEBUG_MSG("Kernel has %d arguments\n", arg_count);
			
 
				+      k->num_args = num_args;
			
 
				+      DEBUG_MSG("Kernel has %d arguments\n", num_args);
			
 
				 
			
 
				-      k->arg_size = (size_t*)malloc(sizeof(size_t) * arg_count);
			
 
				-      k->arg_value = (void**)malloc(sizeof(void*) * arg_count);
			
 
				-      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * arg_count);
			
 
				+      k->arg_size = (size_t*)malloc(sizeof(size_t) * num_args);
			
 
				+      k->arg_value = (void**)malloc(sizeof(void*) * num_args);
			
 
				+      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * num_args);
			
 
				       /* Settings default type to NULL */
			
 
				-      for (i=0; i<arg_count; i++) {
			
 
				+      for (i=0; i<num_args; i++) {
			
 
				          k->arg_value[i] = NULL;
			
 
				          k->arg_type[i] = Null;
			
 
				       }
			
@@ -70,7 +70,7 @@ static void release_callback_kernel(void * e) {
 
				 
			
 
				   //Free args
			
 
				   unsigned int j;
			
 
				-  for (j=0; j<kernel->arg_count; j++) {
			
 
				+  for (j=0; j<kernel->num_args; j++) {
			
 
				     switch (kernel->arg_type[j]) {
			
 
				       case Null:
			
 
				         break;
			
@@ -125,7 +125,7 @@ soclCreateKernel(cl_program    program,
 
				    
			
 
				    gc_entity_store(&k->program, program);
			
 
				    k->kernel_name = strdup(kernel_name);
			
 
				-   k->arg_count = 0;
			
 
				+   k->num_args = 0;
			
 
				    k->arg_value = NULL;
			
 
				    k->arg_size = NULL;
			
 
				 
			
--- a/socl/src/cl_createprogramwithsource.c
+++ b/socl/src/cl_createprogramwithsource.c
@@ -40,11 +40,11 @@ static void soclCreateProgramWithSource_task(void *data) {
 
				 }
			
 
				 
			
 
				 static void rp_task(void *data) {
			
 
				-   struct _cl_program *d = (struct _cl_program*)data;
			
 
				+   cl_program program = (cl_program)data;
			
 
				 
			
 
				    int range = starpu_worker_get_range();
			
 
				 
			
 
				-   cl_int err = clReleaseProgram(d->cl_programs[range]);
			
 
				+   cl_int err = clReleaseProgram(program->cl_programs[range]);
			
 
				    if (err != CL_SUCCESS)
			
 
				       DEBUG_CL("clReleaseProgram", err);
			
 
				 }
			
--- a/socl/src/cl_enqueuebarrier.c
+++ b/socl/src/cl_enqueuebarrier.c
@@ -19,8 +19,9 @@
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueBarrier(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				-   cl_event ev = enqueueBarrier(cq);   
			
 
				-   gc_entity_release(ev);
			
 
				+	command_marker cmd = command_barrier_create();
			
 
				 
			
 
				-   return CL_SUCCESS;
			
 
				+	command_queue_enqueue(cq, cmd, 0, NULL);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueuecopybuffer.c
+++ b/socl/src/cl_enqueuecopybuffer.c
@@ -66,6 +66,42 @@ static starpu_codelet codelet_copybuffer = {
 
				    .nbuffers = 2
			
 
				 };
			
 
				 
			
 
				+cl_int command_copy_buffer_submit(command_copy_buffer cmd) {
			
 
				+	/* Aliases */
			
 
				+	cl_mem src_buffer = cmd->src_buffer;
			
 
				+	cl_mem dst_buffer = cmd->dst_buffer;
			
 
				+	size_t src_offset = cmd->src_offset;
			
 
				+	size_t dst_offset = cmd->dst_offset;
			
 
				+	size_t cb = cmd->cb;
			
 
				+
			
 
				+	struct starpu_task *task;
			
 
				+	struct arg_copybuffer *arg;
			
 
				+
			
 
				+	task = task_create(CL_COMMAND_COPY_BUFFER);
			
 
				+
			
 
				+	task->buffers[0].handle = src_buffer->handle;
			
 
				+	task->buffers[0].mode = STARPU_R;
			
 
				+	task->buffers[1].handle = dst_buffer->handle;
			
 
				+	task->buffers[1].mode = STARPU_RW;
			
 
				+	task->cl = &codelet_copybuffer;
			
 
				+
			
 
				+	arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
			
 
				+	arg->src_offset = src_offset;
			
 
				+	arg->dst_offset = dst_offset;
			
 
				+	arg->cb = cb;
			
 
				+	gc_entity_store(&arg->src_buffer, src_buffer);
			
 
				+	gc_entity_store(&arg->dst_buffer, dst_buffer);
			
 
				+	task->cl_arg = arg;
			
 
				+	task->cl_arg_size = sizeof(struct arg_copybuffer);
			
 
				+
			
 
				+	dst_buffer->scratch = 0;
			
 
				+
			
 
				+	task_submit(task, cmd);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueCopyBuffer(cl_command_queue  cq, 
			
 
				                     cl_mem              src_buffer,
			
@@ -77,35 +113,11 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
 
				                     const cl_event *    events,
			
 
				                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				-   struct starpu_task *task;
			
 
				-   struct arg_copybuffer *arg;
			
 
				-   cl_event ev;
			
 
				-
			
 
				-   task = task_create(CL_COMMAND_COPY_BUFFER);
			
 
				-   ev = task_event(task);
			
 
				-
			
 
				-   task->buffers[0].handle = src_buffer->handle;
			
 
				-   task->buffers[0].mode = STARPU_R;
			
 
				-   task->buffers[1].handle = dst_buffer->handle;
			
 
				-   task->buffers[1].mode = STARPU_RW;
			
 
				-   task->cl = &codelet_copybuffer;
			
 
				-
			
 
				-   arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
			
 
				-   arg->src_offset = src_offset;
			
 
				-   arg->dst_offset = dst_offset;
			
 
				-   arg->cb = cb;
			
 
				-   gc_entity_store(&arg->src_buffer, src_buffer);
			
 
				-   gc_entity_store(&arg->dst_buffer, dst_buffer);
			
 
				-   task->cl_arg = arg;
			
 
				-   task->cl_arg_size = sizeof(struct arg_copybuffer);
			
 
				-
			
 
				-   dst_buffer->scratch = 0;
			
 
				-
			
 
				-   DEBUG_MSG("Submitting CopyBuffer task (event %d)\n", ev->id);
			
 
				+	command_copy_buffer cmd = command_copy_buffer_create(src_buffer, dst_buffer, src_offset, dst_offset, cb);
			
 
				 
			
 
				-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-   RETURN_EVENT(ev, event);
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				 
			
 
				-   return ret;
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueuemapbuffer.c
+++ b/socl/src/cl_enqueuemapbuffer.c
@@ -16,76 +16,53 @@
 
				 
			
 
				 #include "socl.h"
			
 
				 
			
 
				-struct mb_data {
			
 
				-  cl_event ev;
			
 
				-  cl_mem buffer;
			
 
				-  cl_map_flags map_flags;
			
 
				-};
			
 
				-
			
 
				 static void mapbuffer_callback(void *args) {
			
 
				-  struct mb_data *arg = (struct mb_data*)args;
			
 
				-
			
 
				-  starpu_tag_notify_from_apps(arg->ev->id);
			
 
				-  arg->ev->status = CL_COMPLETE;
			
 
				+	command_map_buffer cmd = (command_map_buffer)args;
			
 
				 
			
 
				-  gc_entity_unstore(&arg->ev);
			
 
				-  gc_entity_unstore(&arg->buffer);
			
 
				-  free(args);
			
 
				+	starpu_tag_notify_from_apps(cmd->event->id);
			
 
				+	cmd->event->status = CL_COMPLETE;
			
 
				 }
			
 
				 
			
 
				 static void mapbuffer_task(void *args) {
			
 
				-  struct mb_data *arg = (struct mb_data*)args;
			
 
				+	command_map_buffer cmd = (command_map_buffer)args;
			
 
				+
			
 
				+	starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
			
 
				+
			
 
				+	starpu_data_acquire_cb(cmd->buffer->handle, mode, mapbuffer_callback, cmd);
			
 
				+}
			
 
				 
			
 
				-  starpu_access_mode mode = (arg->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
			
 
				+cl_int command_map_buffer_submit(command_map_buffer cmd) {
			
 
				+	starpu_task task = task_create_cpu(mapbuffer_task, cmd, 0);
			
 
				 
			
 
				-  starpu_data_acquire_cb(arg->buffer->handle, mode, mapbuffer_callback, arg);
			
 
				+	task_submit(task, cmd);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
 
				 
			
 
				 CL_API_ENTRY void * CL_API_CALL
			
 
				 soclEnqueueMapBuffer(cl_command_queue cq,
			
 
				                    cl_mem           buffer,
			
 
				-                   cl_bool          blocking_map, 
			
 
				+                   cl_bool          blocking, 
			
 
				                    cl_map_flags     map_flags,
			
 
				                    size_t           offset, 
			
 
				-                   size_t           UNUSED(cb),
			
 
				+                   size_t           cb,
			
 
				                    cl_uint          num_events,
			
 
				                    const cl_event * events,
			
 
				                    cl_event *       event,
			
 
				                    cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				-   struct starpu_task *task;
			
 
				-   struct mb_data *arg;
			
 
				-   cl_event ev;
			
 
				-   cl_int err;
			
 
				-
			
 
				-   /* Create custom event that will be triggered when map is complete */
			
 
				-   ev = event_create();
			
 
				-
			
 
				-   /* Store arguments */
			
 
				-   arg = (struct mb_data*)malloc(sizeof(struct mb_data));
			
 
				-   arg->map_flags = map_flags;
			
 
				-   gc_entity_store(&arg->ev, ev);
			
 
				-   gc_entity_store(&arg->buffer, buffer);
			
 
				-
			
 
				-   /* Create StarPU task */
			
 
				-   task = task_create_cpu(CL_COMMAND_MAP_BUFFER, mapbuffer_task, arg, 0);
			
 
				-   cl_event map_event = task_event(task);
			
 
				-
			
 
				-   /* Enqueue task */
			
 
				-   DEBUG_MSG("Submitting MapBuffer task (event %d)\n", ev->id);
			
 
				-   err = command_queue_enqueue_fakeevent(cq, task, 0, num_events, events, ev);
			
 
				-   gc_entity_release(map_event);
			
 
				+	cl_event ev = event_create();
			
 
				 
			
 
				-   if (errcode_ret != NULL)
			
 
				-      *errcode_ret = err;
			
 
				+	command_map_buffer cmd = command_map_buffer_create(buffer, map_flags, offset, cb, ev);
			
 
				 
			
 
				-   if (err != CL_SUCCESS)
			
 
				-      return NULL;
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-   if (blocking_map == CL_TRUE)
			
 
				-      soclWaitForEvents(1, &ev);
			
 
				+	if (errcode_ret != NULL)
			
 
				+		*errcode_ret = CL_SUCCESS;
			
 
				 
			
 
				-   RETURN_EVENT(ev, event);
			
 
				+	RETURN_CUSTOM_EVENT(ev,event);
			
 
				 
			
 
				-   return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
			
 
				+	MAY_BLOCK_CUSTOM(blocking,ev);
			
 
				+	
			
 
				+	return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
			
 
				 }
			
--- a/socl/src/cl_enqueuemarker.c
+++ b/socl/src/cl_enqueuemarker.c
@@ -20,11 +20,21 @@ CL_API_ENTRY cl_int CL_API_CALL
 
				 soclEnqueueMarker(cl_command_queue  cq,
			
 
				                 cl_event *          event) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				-   if (event == NULL)
			
 
				-      return CL_INVALID_VALUE;
			
 
				+	if (event == NULL)
			
 
				+		return CL_INVALID_VALUE;
			
 
				+	
			
 
				+	command_marker cmd = command_marker_create();
			
 
				 
			
 
				-   starpu_task * task = task_create(CL_COMMAND_MARKER);
			
 
				-   *event = task_event(task);
			
 
				+	command_queue_enqueue(cq, cmd, 0, NULL);
			
 
				 
			
 
				-   return command_queue_enqueue(cq, task, 0, 0, NULL);
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+cl_int command_marker_submit(command_marker cmd) {
			
 
				+	struct starpu_task *task;
			
 
				+	task = task_create(CL_COMMAND_MARKER);
			
 
				+
			
 
				+	task_submit(task, cmd);
			
 
				 }
			
--- a/socl/src/cl_enqueuendrangekernel.c
+++ b/socl/src/cl_enqueuendrangekernel.c
@@ -16,81 +16,64 @@
 
				 
			
 
				 #include "socl.h"
			
 
				 
			
 
				-typedef struct running_kernel * running_kernel;
			
 
				-
			
 
				-struct running_kernel {
			
 
				-  cl_kernel kernel;
			
 
				-  cl_mem *buffers;
			
 
				-  unsigned int buffer_count;
			
 
				-  starpu_codelet *codelet;
			
 
				-  cl_uint work_dim;
			
 
				-  size_t * global_work_offset;
			
 
				-  size_t * global_work_size;
			
 
				-  size_t * local_work_size;
			
 
				-  /* Arguments */
			
 
				-  unsigned int arg_count;
			
 
				-  size_t *arg_size;
			
 
				-  enum kernel_arg_type  *arg_type;
			
 
				-  void  **arg_value;
			
 
				-};
			
 
				 
			
 
				-static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
			
 
				-   running_kernel d;
			
 
				+void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
			
 
				+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
			
 
				+
			
 
				    cl_command_queue cq;
			
 
				    int wid;
			
 
				    cl_int err;
			
 
				 
			
 
				-   d = (running_kernel)args;
			
 
				    wid = starpu_worker_get_id();
			
 
				    starpu_opencl_get_queue(wid, &cq);
			
 
				 
			
 
				-   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, d->kernel->id);
			
 
				+   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);
			
 
				 
			
 
				    int range = starpu_worker_get_range();
			
 
				 
			
 
				    /* Set arguments */
			
 
				    {
			
 
				-      unsigned int i;
			
 
				-      int buf = 0;
			
 
				-      for (i=0; i<d->arg_count; i++) {
			
 
				-         switch (d->arg_type[i]) {
			
 
				-            case Null:
			
 
				-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], NULL);
			
 
				-               break;
			
 
				-            case Buffer: {
			
 
				-                  cl_mem mem;  
			
 
				-                  mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
			
 
				-                  err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], &mem);
			
 
				-                  buf++;
			
 
				-               }
			
 
				-               break;
			
 
				-            case Immediate:
			
 
				-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], d->arg_value[i]);
			
 
				-               break;
			
 
				-         }
			
 
				-         if (err != CL_SUCCESS) {
			
 
				-            DEBUG_CL("clSetKernelArg", err);
			
 
				-            DEBUG_ERROR("Aborting\n");
			
 
				-         }
			
 
				-      }
			
 
				+	   unsigned int i;
			
 
				+	   int buf = 0;
			
 
				+	   for (i=0; i<cmd->num_args; i++) {
			
 
				+		   switch (cmd->arg_types[i]) {
			
 
				+			   case Null:
			
 
				+				   err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
			
 
				+				   break;
			
 
				+			   case Buffer: {
			
 
				+						cl_mem mem;  
			
 
				+						mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
			
 
				+						err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
			
 
				+						buf++;
			
 
				+					}
			
 
				+					break;
			
 
				+			   case Immediate:
			
 
				+					err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
			
 
				+					break;
			
 
				+		   }
			
 
				+		   if (err != CL_SUCCESS) {
			
 
				+			   DEBUG_CL("clSetKernelArg", err);
			
 
				+			   DEBUG_ERROR("Aborting\n");
			
 
				+		   }
			
 
				+	   }
			
 
				    }
			
 
				 
			
 
				    /* Calling Kernel */
			
 
				    cl_event event;
			
 
				-   err = clEnqueueNDRangeKernel(cq, d->kernel->cl_kernels[range], d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size, 0, NULL, &event);
			
 
				+   err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);
			
 
				 
			
 
				    if (err != CL_SUCCESS) {
			
 
				-      ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
			
 
				-      DEBUG_CL("clEnqueueNDRangeKernel", err);
			
 
				-      DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
			
 
				-                d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size);
			
 
				-      DEBUG_MSG("Global work size: %ld %ld %ld\n", d->global_work_size[0],
			
 
				-            (d->work_dim > 1 ? d->global_work_size[1] : 1), (d->work_dim > 2 ? d->global_work_size[2] : 1)); 
			
 
				-      if (d->local_work_size != NULL)
			
 
				-         DEBUG_MSG("Local work size: %ld %ld %ld\n", d->local_work_size[0],
			
 
				-               (d->work_dim > 1 ? d->local_work_size[1] : 1), (d->work_dim > 2 ? d->local_work_size[2] : 1)); 
			
 
				-      ERROR_MSG("Aborting.\n");
			
 
				-      exit(1);
			
 
				+	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
			
 
				+	   DEBUG_CL("clEnqueueNDRangeKernel", err);
			
 
				+	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
			
 
				+			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
			
 
				+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
			
 
				+			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
			
 
				+	   if (cmd->local_work_size != NULL)
			
 
				+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
			
 
				+				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
			
 
				+	   ERROR_MSG("Aborting.\n");
			
 
				+	   exit(1);
			
 
				    }
			
 
				 
			
 
				    /* Waiting for kernel to terminate */
			
@@ -99,219 +82,110 @@ static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
 
				 }
			
 
				 
			
 
				 static void cleaning_task_callback(void *args) {
			
 
				-   running_kernel arg = (running_kernel)args;
			
 
				+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
			
 
				 
			
 
				-   free(arg->arg_size);
			
 
				-   free(arg->arg_type);
			
 
				-
			
 
				-   unsigned int i;
			
 
				-   for (i=0; i<arg->arg_count; i++) {
			
 
				-      free(arg->arg_value[i]);
			
 
				-   }
			
 
				-   free(arg->arg_value);
			
 
				+	free(cmd->arg_sizes);
			
 
				+	free(cmd->arg_types);
			
 
				 
			
 
				-   for (i=0; i<arg->buffer_count; i++)
			
 
				-      gc_entity_unstore(&arg->buffers[i]);
			
 
				+	unsigned int i;
			
 
				+	for (i=0; i<cmd->num_args; i++) {
			
 
				+		free(cmd->args[i]);
			
 
				+	}
			
 
				+	free(cmd->args);
			
 
				 
			
 
				-   gc_entity_unstore(&arg->kernel);
			
 
				+	for (i=0; i<cmd->num_buffers; i++)
			
 
				+		gc_entity_unstore(&cmd->buffers[i]);
			
 
				 
			
 
				-   free(arg->buffers);
			
 
				-   free(arg->global_work_offset);
			
 
				-   free(arg->global_work_size);
			
 
				-   free(arg->local_work_size);
			
 
				-   void * co = arg->codelet;
			
 
				-   arg->codelet = NULL;
			
 
				-   free(co);
			
 
				+	free(cmd->buffers);
			
 
				+	void * co = cmd->codelet;
			
 
				+	cmd->codelet = NULL;
			
 
				+	free(co);
			
 
				 }
			
 
				 
			
 
				 static struct starpu_perfmodel_t perf_model = {
			
 
				-  .type = STARPU_HISTORY_BASED,
			
 
				-  .symbol = "perf_model"
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "perf_model"
			
 
				 };
			
 
				 
			
 
				 /**
			
 
				  * Real kernel enqueuing command
			
 
				  */
			
 
				-cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
			
 
				-
			
 
				-   struct starpu_task *task;
			
 
				-   running_kernel arg;
			
 
				-   starpu_codelet *codelet;
			
 
				-   cl_event ev;
			
 
				-   
			
 
				-   /* Alias struc fields */
			
 
				-   cl_command_queue cq = n->cq;
			
 
				-   cl_kernel        kernel = n->kernel;
			
 
				-   cl_uint          work_dim = n->work_dim;
			
 
				-   const size_t *   global_work_offset = n->global_work_offset;
			
 
				-   const size_t *   global_work_size = n->global_work_size;
			
 
				-   const size_t *   local_work_size = n->local_work_size;
			
 
				-   cl_uint          num_events = n->num_events;
			
 
				-   const cl_event * events = n->events;
			
 
				-   cl_event *       event = n->event;
			
 
				-   char 	    is_task = n->is_task;
			
 
				-
			
 
				-
			
 
				-   /* Allocate structures */
			
 
				-
			
 
				-   /* Codelet */
			
 
				-   codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
			
 
				-   if (codelet == NULL)
			
 
				-      return CL_OUT_OF_HOST_MEMORY;
			
 
				-
			
 
				-   /* Codelet arguments */
			
 
				-   arg = (running_kernel)malloc(sizeof(struct running_kernel));
			
 
				-   if (arg == NULL) {
			
 
				-      free(codelet);
			
 
				-      return CL_OUT_OF_HOST_MEMORY;
			
 
				-   }
			
 
				-
			
 
				-   /* StarPU task */
			
 
				-   task = task_create(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL);
			
 
				-   ev = task_event(task);
			
 
				-
			
 
				-   /*******************
			
 
				-    * Initializations *
			
 
				-    *******************/
			
 
				-
			
 
				-   /* ------- *
			
 
				-    * Codelet *
			
 
				-    * ------- */
			
 
				-   codelet->where = STARPU_OPENCL;
			
 
				-   codelet->power_model = NULL;
			
 
				-   codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
			
 
				-   //codelet->model = NULL;
			
 
				-   codelet->model = &perf_model;
			
 
				-
			
 
				-   /* ---------------- *
			
 
				-    * Codelet argument *
			
 
				-    * ---------------- */
			
 
				-   gc_entity_store(&arg->kernel, kernel);
			
 
				-   arg->work_dim = work_dim;
			
 
				-   arg->codelet = codelet;
			
 
				-
			
 
				-   /* Global work offset */
			
 
				-   if (global_work_offset != NULL) {
			
 
				-      arg->global_work_offset = (size_t*)malloc(sizeof(size_t)*work_dim);
			
 
				-      memcpy(arg->global_work_offset, global_work_offset, work_dim*sizeof(size_t));
			
 
				-   }
			
 
				-   else arg->global_work_offset = NULL;
			
 
				-
			
 
				-   /* Global work size */
			
 
				-   arg->global_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
			
 
				-   memcpy(arg->global_work_size, global_work_size, work_dim*sizeof(size_t));
			
 
				-
			
 
				-   /* Local work size */
			
 
				-   if (local_work_size != NULL) {
			
 
				-      arg->local_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
			
 
				-      memcpy(arg->local_work_size, local_work_size, work_dim*sizeof(size_t));
			
 
				-   }
			
 
				-   else arg->local_work_size = NULL;
			
 
				-
			
 
				-   /* ----------- *
			
 
				-    * StarPU task *
			
 
				-    * ----------- */
			
 
				-   task->cl = codelet;
			
 
				-   task->cl_arg = arg;
			
 
				-   task->cl_arg_size = sizeof(struct running_kernel);
			
 
				-
			
 
				-   /* Convert OpenCL's memory objects to StarPU buffers */
			
 
				-   codelet->nbuffers = 0;
			
 
				-   {
			
 
				-      arg->buffers = malloc(sizeof(cl_mem) * kernel->arg_count);
			
 
				-      arg->buffer_count = 0;
			
 
				-
			
 
				-      unsigned int i;
			
 
				-      for (i=0; i<kernel->arg_count; i++) {
			
 
				-         if (kernel->arg_type[i] == Buffer) {
			
 
				-
			
 
				-            cl_mem buf = (cl_mem)kernel->arg_value[i];
			
 
				-
			
 
				-            /* We save cl_mem references in order to properly release them after kernel termination */
			
 
				-            gc_entity_store(&arg->buffers[arg->buffer_count], buf);
			
 
				-            arg->buffer_count += 1;
			
 
				-
			
 
				-            codelet->nbuffers++;
			
 
				-            task->buffers[codelet->nbuffers-1].handle = buf->handle;
			
 
				-
			
 
				-            /* Determine best StarPU buffer access mode */
			
 
				-            int mode;
			
 
				-            if (buf->mode == CL_MEM_READ_ONLY)
			
 
				-               mode = STARPU_R;
			
 
				-            else if (buf->mode == CL_MEM_WRITE_ONLY) {
			
 
				-               mode = STARPU_W;
			
 
				-               buf->scratch = 0;
			
 
				-            }
			
 
				-            else if (buf->scratch) { //RW but never accessed in RW or W mode
			
 
				-               mode = STARPU_W;
			
 
				-               buf->scratch = 0;
			
 
				-            }
			
 
				-            else {
			
 
				-               mode = STARPU_RW;
			
 
				-               buf->scratch = 0;
			
 
				-            }
			
 
				-            task->buffers[codelet->nbuffers-1].mode = mode; 
			
 
				-         }
			
 
				-      }
			
 
				-   }
			
 
				-
			
 
				-   /* Copy arguments as kernel args can be modified by the time we launch the kernel */
			
 
				-   {
			
 
				-      arg->arg_count = kernel->arg_count;
			
 
				-      arg->arg_size = malloc(sizeof(size_t) * kernel->arg_count);
			
 
				-      memcpy(arg->arg_size, kernel->arg_size, sizeof(size_t) * kernel->arg_count);
			
 
				-      arg->arg_type = malloc(sizeof(enum kernel_arg_type) * kernel->arg_count);
			
 
				-      memcpy(arg->arg_type, kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->arg_count);
			
 
				-      arg->arg_value = malloc(sizeof(void*) * kernel->arg_count);
			
 
				-      unsigned int i;
			
 
				-      for (i=0; i<kernel->arg_count; i++) {
			
 
				-         if (kernel->arg_value[i] != NULL) {
			
 
				-           arg->arg_value[i] = malloc(arg->arg_size[i]);
			
 
				-           memcpy(arg->arg_value[i], kernel->arg_value[i], arg->arg_size[i]);
			
 
				-         }
			
 
				-         else arg->arg_value[i] = NULL;
			
 
				-      }
			
 
				-   }
			
 
				-
			
 
				-   DEBUG_MSG("Submitting NDRange task (event %d)\n", ev->id);
			
 
				-
			
 
				-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
			
 
				-
			
 
				-   /* Enqueue a cleaning task */
			
 
				-   starpu_task * cleaning_task = task_create_cpu(0, cleaning_task_callback, arg,1);
			
 
				-   cl_event cleaning_event = task_event(cleaning_task);
			
 
				-   command_queue_enqueue(cq, cleaning_task, 0, 1, &ev);
			
 
				+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
			
 
				+
			
 
				+	starpu_task task = task_create();
			
 
				+	task->cl = cmd->codelet;
			
 
				+	task->cl_arg = cmd;
			
 
				+	task->cl_arg_size = sizeof(cmd);
			
 
				+
			
 
				+	starpu_codelet * codelet = cmd->codelet;
			
 
				+
			
 
				+	/* We need to detect which parameters are OpenCL's memory objects and
			
 
				+	 * we retrieve their corresponding StarPU buffers */
			
 
				+	cmd->num_buffers = 0;
			
 
				+	cmd->buffers = malloc(sizeof(cl_mem) * cmd->num_args);
			
 
				+
			
 
				+	unsigned int i;
			
 
				+	for (i=0; i<cmd->num_args; i++) {
			
 
				+		if (cmd->arg_types[i] == Buffer) {
			
 
				+
			
 
				+			cl_mem buf = *(cl_mem*)cmd->args[i];
			
 
				+
			
 
				+			gc_entity_store(&cmd->buffers[cmd->num_buffers], buf);
			
 
				+			task->buffers[cmd->num_buffers].handle = buf->handle;
			
 
				+
			
 
				+			/* Determine best StarPU buffer access mode */
			
 
				+			int mode;
			
 
				+			if (buf->mode == CL_MEM_READ_ONLY)
			
 
				+				mode = STARPU_R;
			
 
				+			else if (buf->mode == CL_MEM_WRITE_ONLY) {
			
 
				+				mode = STARPU_W;
			
 
				+				buf->scratch = 0;
			
 
				+			}
			
 
				+			else if (buf->scratch) { //RW but never accessed in RW or W mode
			
 
				+				mode = STARPU_W;
			
 
				+				buf->scratch = 0;
			
 
				+			}
			
 
				+			else {
			
 
				+				mode = STARPU_RW;
			
 
				+				buf->scratch = 0;
			
 
				+			}
			
 
				+			task->buffers[cmd->num_buffers].mode = mode; 
			
 
				+
			
 
				+			cmd->num_buffers += 1;
			
 
				+		}
			
 
				+	}
			
 
				+	codelet->nbuffers = cmd->num_buffers;
			
 
				+
			
 
				+	task_submit(task, cmd);
			
 
				+
			
 
				+	/* Enqueue a cleaning task */
			
 
				+	//FIXME: execute this in the callback?
			
 
				+	starpu_task cleaning_task = task_create_cpu(cleaning_task_callback, cmd,1);
			
 
				+	cl_event ev = command_event_get(cmd);
			
 
				+	task_depends_on(cleaning_task, 1, &ev);
			
 
				+	task_submit(cleaning_task, cmd);
			
 
				 
			
 
				-   gc_entity_release(cleaning_event);
			
 
				-  
			
 
				-   RETURN_EVENT(ev, event);
			
 
				-
			
 
				-   return ret;
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * Virtual kernel enqueueing command
			
 
				- */
			
 
				+
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueNDRangeKernel(cl_command_queue cq,
			
 
				-                       cl_kernel        kernel,
			
 
				-                       cl_uint          work_dim,
			
 
				-                       const size_t *   global_work_offset,
			
 
				-                       const size_t *   global_work_size,
			
 
				-                       const size_t *   local_work_size,
			
 
				-                       cl_uint          num_events,
			
 
				-                       const cl_event * events,
			
 
				-                       cl_event *       event) CL_API_SUFFIX__VERSION_1_1
			
 
				+		cl_kernel        kernel,
			
 
				+		cl_uint          work_dim,
			
 
				+		const size_t *   global_work_offset,
			
 
				+		const size_t *   global_work_size,
			
 
				+		const size_t *   local_work_size,
			
 
				+		cl_uint          num_events,
			
 
				+		const cl_event * events,
			
 
				+		cl_event *       event) CL_API_SUFFIX__VERSION_1_1
			
 
				 {
			
 
				-	node_enqueue_kernel n;
			
 
				+	command_ndrange_kernel cmd = command_ndrange_kernel_create(kernel, work_dim,
			
 
				+			global_work_offset, global_work_size, local_work_size);
			
 
				+
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-	n = graph_create_enqueue_kernel(0, cq, kernel, work_dim, global_work_offset, global_work_size,
			
 
				-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
			
 
				-		kernel->arg_type, kernel->arg_value);
			
 
				-	
			
 
				-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
			
 
				-	node_play_enqueue_kernel(n);
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				 
			
 
				-	//graph_store(n);
			
 
				 	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueuereadbuffer.c
+++ b/socl/src/cl_enqueuereadbuffer.c
@@ -66,6 +66,37 @@ static starpu_codelet codelet_readbuffer = {
 
				    .nbuffers = 1
			
 
				 };
			
 
				 
			
 
				+cl_int command_read_buffer_submit(command_read_buffer cmd) {
			
 
				+	/* Aliases */
			
 
				+	cl_mem buffer = cmd->buffer;
			
 
				+	size_t offset = cmd->offset;
			
 
				+	size_t cb = cmd->cb;
			
 
				+	void * ptr = cmd->ptr;
			
 
				+
			
 
				+	struct starpu_task *task;
			
 
				+	struct arg_readbuffer *arg;
			
 
				+
			
 
				+	task = task_create(CL_COMMAND_READ_BUFFER);
			
 
				+
			
 
				+	task->buffers[0].handle = buffer->handle;
			
 
				+	task->buffers[0].mode = STARPU_R;
			
 
				+	task->cl = &codelet_readbuffer;
			
 
				+
			
 
				+	arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
			
 
				+	arg->offset = offset;
			
 
				+	arg->cb = cb;
			
 
				+	arg->ptr = ptr;
			
 
				+	task->cl_arg = arg;
			
 
				+	task->cl_arg_size = sizeof(struct arg_readbuffer);
			
 
				+
			
 
				+	gc_entity_store(&arg->buffer, buffer);
			
 
				+
			
 
				+	task_submit(task, cmd);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueReadBuffer(cl_command_queue  cq,
			
 
				                     cl_mem              buffer,
			
@@ -77,33 +108,14 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
 
				                     const cl_event *    events,
			
 
				                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
			
 
				 { 
			
 
				-   struct starpu_task *task;
			
 
				-   struct arg_readbuffer *arg;
			
 
				-   cl_event ev;
			
 
				-
			
 
				-   task = task_create(CL_COMMAND_READ_BUFFER);
			
 
				-   ev = task_event(task);
			
 
				-
			
 
				-   task->buffers[0].handle = buffer->handle;
			
 
				-   task->buffers[0].mode = STARPU_R;
			
 
				-   task->cl = &codelet_readbuffer;
			
 
				-
			
 
				-   arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
			
 
				-   arg->offset = offset;
			
 
				-   arg->cb = cb;
			
 
				-   arg->ptr = ptr;
			
 
				-   task->cl_arg = arg;
			
 
				-   task->cl_arg_size = sizeof(struct arg_readbuffer);
			
 
				-
			
 
				-   gc_entity_store(&arg->buffer, buffer);
			
 
				 
			
 
				-   task->synchronous = (blocking == CL_TRUE);
			
 
				+	command_read_buffer cmd = command_read_buffer_create(buffer, offset, cb, ptr);
			
 
				 
			
 
				-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				 
			
 
				-   RETURN_EVENT(ev, event);
			
 
				+	MAY_BLOCK(blocking);
			
 
				 
			
 
				-   return ret;
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueuetask.c
+++ b/socl/src/cl_enqueuetask.c
@@ -16,14 +16,6 @@
 
				 
			
 
				 #include "socl.h"
			
 
				 
			
 
				-static cl_uint work_dim = 3;
			
 
				-static const size_t global_work_offset[3] = {0,0,0};
			
 
				-static const size_t global_work_size[3] = {1,1,1};
			
 
				-static const size_t * local_work_size = NULL;
			
 
				-
			
 
				-CL_API_ENTRY cl_int CL_API_CALL
			
 
				-soclEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) CL_API_SUFFIX__VERSION_1_0;
			
 
				-
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueTask(cl_command_queue cq,
			
 
				               cl_kernel         kernel,
			
@@ -31,15 +23,11 @@ soclEnqueueTask(cl_command_queue cq,
 
				               const cl_event *  events,
			
 
				               cl_event *        event) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				-	node_enqueue_kernel n;
			
 
				-
			
 
				-	n = graph_create_enqueue_kernel(1, cq, kernel, work_dim, global_work_offset, global_work_size,
			
 
				-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
			
 
				-		kernel->arg_type, kernel->arg_value);
			
 
				+	command_ndrange_kernel cmd = command_task_create(kernel);
			
 
				 	
			
 
				-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
			
 
				-	node_play_enqueue_kernel(n);
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				+
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				 
			
 
				-	//graph_store(n);
			
 
				 	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueueunmapmemobject.c
+++ b/socl/src/cl_enqueueunmapmemobject.c
@@ -16,27 +16,31 @@
 
				 
			
 
				 #include "socl.h"
			
 
				 
			
 
				+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd) {
			
 
				+	/* Aliases */
			
 
				+	cl_mem buffer = cmd->buffer;
			
 
				+
			
 
				+	//FIXME: use a callback
			
 
				+	starpu_task task = task_create_cpu((void(*)(void*))starpu_data_release, buffer->handle, 0);
			
 
				+
			
 
				+	task_submit(task, cmd);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueUnmapMemObject(cl_command_queue cq,
			
 
				-                        cl_mem            memobj,
			
 
				-                        void *            UNUSED(mapped_ptr),
			
 
				+                        cl_mem            buffer,
			
 
				+                        void *            ptr,
			
 
				                         cl_uint           num_events,
			
 
				                         const cl_event *  events,
			
 
				                         cl_event *        event) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				-   struct starpu_task *task;
			
 
				-   cl_int err;
			
 
				-   cl_event ev;
			
 
				-
			
 
				-   /* Create StarPU task */
			
 
				-   task = task_create_cpu(CL_COMMAND_UNMAP_MEM_OBJECT, (void(*)(void*))starpu_data_release, memobj->handle, 0);
			
 
				-   ev = task_event(task);
			
 
				-
			
 
				-   DEBUG_MSG("Submitting UnmapBuffer task (event %d)\n", task->tag_id);
			
 
				+	command_unmap_mem_object cmd = command_unmap_mem_object_create(buffer, ptr);
			
 
				 
			
 
				-   err = command_queue_enqueue(cq, task, 0, num_events, events);
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-   RETURN_EVENT(ev, event);
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				 
			
 
				-   return err;
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueuewaitforevents.c
+++ b/socl/src/cl_enqueuewaitforevents.c
@@ -22,10 +22,9 @@ soclEnqueueWaitForEvents(cl_command_queue cq,
 
				                        const cl_event * events) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				 
			
 
				-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_WAIT_FOR_EVENTS doesn't exist
			
 
				-   starpu_task * task = task_create(CL_COMMAND_MARKER);
			
 
				+	command_marker cmd = command_marker_create();
			
 
				 
			
 
				-   command_queue_enqueue(cq, task, 0, num_events, events);
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-   return CL_SUCCESS;
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_enqueuewritebuffer.c
+++ b/socl/src/cl_enqueuewritebuffer.c
@@ -68,6 +68,43 @@ static starpu_codelet codelet_writebuffer = {
 
				    .nbuffers = 1
			
 
				 };
			
 
				 
			
 
				+cl_int command_write_buffer_submit(command_write_buffer cmd) {
			
 
				+	/* Aliases */
			
 
				+	cl_mem buffer = cmd->buffer;
			
 
				+	size_t offset = cmd->offset;
			
 
				+	size_t cb = cmd->cb;
			
 
				+	const void * ptr = cmd->ptr;
			
 
				+
			
 
				+	struct starpu_task *task;
			
 
				+	struct arg_writebuffer *arg;
			
 
				+
			
 
				+	task = task_create(CL_COMMAND_WRITE_BUFFER);
			
 
				+
			
 
				+	task->buffers[0].handle = buffer->handle;
			
 
				+	//If only a subpart of the buffer is written, RW access mode is required
			
 
				+	if (cb != buffer->size)
			
 
				+		task->buffers[0].mode = STARPU_RW;
			
 
				+	else 
			
 
				+		task->buffers[0].mode = STARPU_W;
			
 
				+	task->cl = &codelet_writebuffer;
			
 
				+
			
 
				+	arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
			
 
				+	arg->offset = offset;
			
 
				+	arg->cb = cb;
			
 
				+	arg->ptr = ptr;
			
 
				+	task->cl_arg = arg;
			
 
				+	task->cl_arg_size = sizeof(struct arg_writebuffer);
			
 
				+
			
 
				+	gc_entity_store(&arg->buffer, buffer);
			
 
				+
			
 
				+	//The buffer now contains meaningful data
			
 
				+	arg->buffer->scratch = 0;
			
 
				+
			
 
				+	task_submit(task, cmd);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclEnqueueWriteBuffer(cl_command_queue cq, 
			
 
				                      cl_mem             buffer, 
			
@@ -79,41 +116,13 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
 
				                      const cl_event *   events, 
			
 
				                      cl_event *         event) CL_API_SUFFIX__VERSION_1_0
			
 
				 { 
			
 
				-   struct starpu_task *task;
			
 
				-   struct arg_writebuffer *arg;
			
 
				-   cl_event ev;
			
 
				-
			
 
				-   task = task_create(CL_COMMAND_WRITE_BUFFER);
			
 
				-   ev = task_event(task);
			
 
				-
			
 
				-   task->buffers[0].handle = buffer->handle;
			
 
				-   //If only a subpart of the buffer is written, RW access mode is required
			
 
				-   if (cb != buffer->size)
			
 
				-      task->buffers[0].mode = STARPU_RW;
			
 
				-   else 
			
 
				-      task->buffers[0].mode = STARPU_W;
			
 
				-   task->cl = &codelet_writebuffer;
			
 
				-
			
 
				-   arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
			
 
				-   arg->offset = offset;
			
 
				-   arg->cb = cb;
			
 
				-   arg->ptr = ptr;
			
 
				-   task->cl_arg = arg;
			
 
				-   task->cl_arg_size = sizeof(struct arg_writebuffer);
			
 
				-
			
 
				-   gc_entity_store(&arg->buffer, buffer);
			
 
				-
			
 
				-   //The buffer now contains meaningful data
			
 
				-   arg->buffer->scratch = 0;
			
 
				-
			
 
				-   task->synchronous = (blocking == CL_TRUE);
			
 
				+	command_write_buffer cmd = command_write_buffer_create(buffer, offset, cb, ptr);
			
 
				 
			
 
				-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
			
 
				+	command_queue_enqueue(cq, cmd, num_events, events);
			
 
				 
			
 
				-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events,events);
			
 
				+	RETURN_EVENT(cmd, event);
			
 
				 
			
 
				-   /* Return retained event if required by user */
			
 
				-   RETURN_EVENT(ev,event);
			
 
				+	MAY_BLOCK(blocking);
			
 
				 
			
 
				-   return ret;
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_finish.c
+++ b/socl/src/cl_finish.c
@@ -17,11 +17,14 @@
 
				 #include "socl.h"
			
 
				 
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				-soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
			
 
				-{
			
 
				-   cl_event ev = enqueueBarrier(cq);
			
 
				-   soclWaitForEvents(1, &ev);
			
 
				-   gc_entity_release(ev);
			
 
				+soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0 {
			
 
				 
			
 
				-   return CL_SUCCESS;
			
 
				+	command_marker cmd = command_barrier_create();
			
 
				+
			
 
				+	command_queue_enqueue(cq, cmd, 0, NULL);
			
 
				+		cl_event ev = command_event_get(cmd);
			
 
				+
			
 
				+	MAY_BLOCK(CL_TRUE)
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
--- a/socl/src/cl_geteventinfo.c
+++ b/socl/src/cl_geteventinfo.c
@@ -17,6 +17,7 @@
 
				 #include "socl.h"
			
 
				 #include "getinfo.h"
			
 
				 
			
 
				+
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclGetEventInfo(cl_event       event,
			
 
				                cl_event_info    param_name,
			
@@ -33,7 +34,7 @@ soclGetEventInfo(cl_event       event,
 
				 
			
 
				    switch (param_name) {
			
 
				       INFO_CASE(CL_EVENT_COMMAND_QUEUE, event->cq);
			
 
				-      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->type);
			
 
				+      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->command->typ);
			
 
				       INFO_CASE(CL_EVENT_COMMAND_EXECUTION_STATUS, event->status);
			
 
				       INFO_CASE(CL_EVENT_REFERENCE_COUNT, event->_entity.refs);
			
 
				       default:
			
--- a/socl/src/cl_getkernelinfo.c
+++ b/socl/src/cl_getkernelinfo.c
@@ -29,7 +29,7 @@ soclGetKernelInfo(cl_kernel       kernel,
 
				 
			
 
				    switch (param_name) {
			
 
				       INFO_CASE_EX(CL_KERNEL_FUNCTION_NAME, kernel->kernel_name, strlen(kernel->kernel_name)+1)
			
 
				-      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->arg_count)
			
 
				+      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->num_args)
			
 
				       INFO_CASE(CL_KERNEL_REFERENCE_COUNT, kernel->_entity.refs)
			
 
				       INFO_CASE(CL_KERNEL_PROGRAM, kernel->program)
			
 
				       INFO_CASE(CL_KERNEL_CONTEXT, kernel->program->context)
			
--- a/socl/src/cl_setkernelarg.c
+++ b/socl/src/cl_setkernelarg.c
@@ -25,7 +25,7 @@ soclSetKernelArg(cl_kernel  kernel,
 
				    if (kernel == NULL)
			
 
				       return CL_INVALID_KERNEL;
			
 
				 
			
 
				-   if (arg_index >= kernel->arg_count)
			
 
				+   if (arg_index >= kernel->num_args)
			
 
				       return CL_INVALID_ARG_INDEX;
			
 
				 
			
 
				    //FIXME: we don't return CL_INVALID_ARG_VALUE if "arg_value is NULL for an argument that is not declared with __local qualifier or vice-versa"
			
@@ -38,7 +38,8 @@ soclSetKernelArg(cl_kernel  kernel,
 
				          break;
			
 
				       case Buffer:
			
 
				          kernel->arg_type[arg_index] = Null;
			
 
				-         gc_entity_unstore((cl_mem*)&kernel->arg_value[arg_index]);
			
 
				+         gc_entity_unstore((cl_mem*)kernel->arg_value[arg_index]);
			
 
				+	 free(kernel->arg_value[arg_index]);
			
 
				          kernel->arg_value[arg_index] = NULL;
			
 
				          break;
			
 
				       case Immediate:
			
@@ -60,7 +61,8 @@ soclSetKernelArg(cl_kernel  kernel,
 
				       if ((arg_size == sizeof(cl_mem)) && ((buf = mem_object_fetch(arg_value)) != NULL)) {
			
 
				          DEBUG_MSG("Found buffer %d \n", buf->id);
			
 
				          kernel->arg_type[arg_index] = Buffer;
			
 
				-         gc_entity_store(&kernel->arg_value[arg_index], buf);
			
 
				+         kernel->arg_value[arg_index] = malloc(sizeof(void*));
			
 
				+	 gc_entity_store((cl_mem*)kernel->arg_value[arg_index], buf);
			
 
				       }
			
 
				       else {
			
 
				          /* Argument must be an immediate buffer  */
			
--- a/socl/src/cl_waitforevents.c
+++ b/socl/src/cl_waitforevents.c
@@ -23,7 +23,12 @@ soclWaitForEvents(cl_uint           num_events,
 
				    unsigned int i;
			
 
				    DEBUG_MSG("Waiting for events: ");
			
 
				    for (i=0; i<num_events; i++) {
			
 
				-      DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
			
 
				+   	command_graph_dump(event_list[i]->command);
			
 
				+
			
 
				+   	/* We need to submit commands if it's not already done */
			
 
				+	command_submit_deep(event_list[i]->command);
			
 
				+
			
 
				+      	DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
			
 
				    }
			
 
				    DEBUG_MSG_NOHEAD("\n");
			
 
				 
			
--- a/socl/src/command.c
+++ b/socl/src/command.c
@@ -0,0 +1,235 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "socl.h"
			
 
				+
			
 
				+void command_init_ex(cl_command cmd, cl_command_type typ) {
			
 
				+	cmd->typ = typ;
			
 
				+	cmd->num_events = 0;
			
 
				+	cmd->events = NULL;
			
 
				+	cmd->event = event_create();
			
 
				+	cmd->event->command = cmd;
			
 
				+	cmd->cq = NULL;
			
 
				+	cmd->task = NULL;
			
 
				+	cmd->submitted = 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void command_submit_ex(cl_command cmd) {
			
 
				+#define SUBMIT(typ,name) case typ:\
			
 
				+	name##_submit((name)cmd);\
			
 
				+	break;
			
 
				+
			
 
				+	assert(cmd->submitted == 0);
			
 
				+
			
 
				+	switch(cmd->typ) {
			
 
				+		SUBMIT(CL_COMMAND_NDRANGE_KERNEL, command_ndrange_kernel)
			
 
				+		SUBMIT(CL_COMMAND_TASK, command_ndrange_kernel)
			
 
				+		SUBMIT(CL_COMMAND_READ_BUFFER, command_read_buffer)
			
 
				+		SUBMIT(CL_COMMAND_WRITE_BUFFER, command_write_buffer)
			
 
				+		SUBMIT(CL_COMMAND_COPY_BUFFER, command_copy_buffer)
			
 
				+		SUBMIT(CL_COMMAND_MAP_BUFFER, command_map_buffer)
			
 
				+		SUBMIT(CL_COMMAND_UNMAP_MEM_OBJECT, command_unmap_mem_object)
			
 
				+		SUBMIT(CL_COMMAND_MARKER, command_marker)
			
 
				+		default:
			
 
				+			ERROR_STOP("Trying to submit unknown command (type %x)", cmd->typ);
			
 
				+	}
			
 
				+
			
 
				+	cmd->submitted = 1;
			
 
				+#undef SUBMIT
			
 
				+}
			
 
				+
			
 
				+cl_int command_submit_deep_ex(cl_command cmd) {
			
 
				+	if (cmd->submitted == 1)
			
 
				+		return CL_SUCCESS;
			
 
				+	
			
 
				+	/* We set this in order to avoid cyclic dependencies */
			
 
				+	cmd->submitted = 1;
			
 
				+
			
 
				+	unsigned int i;
			
 
				+	for (i=0; i<cmd->num_events; i++)
			
 
				+		command_submit_deep(cmd->events[i]->command);
			
 
				+	
			
 
				+	cmd->submitted = 0;
			
 
				+
			
 
				+	command_submit_ex(cmd);
			
 
				+
			
 
				+	return CL_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+void command_graph_dump_ex(cl_command cmd) {
			
 
				+
			
 
				+	unsigned int i;
			
 
				+	for (i=0; i<cmd->num_events; i++)
			
 
				+		command_graph_dump_ex(cmd->events[i]->command);
			
 
				+
			
 
				+	printf("CMD %lx TYPE %d DEPS", cmd, cmd->typ);
			
 
				+	for (i=0; i<cmd->num_events; i++)
			
 
				+		printf(" %lx", cmd->events[i]->command);
			
 
				+	printf("\n");
			
 
				+
			
 
				+}
			
 
				+
			
 
				+#define nullOrDup(name,size) cmd->name = memdup_safe(name,size)
			
 
				+#define dup(name) cmd->name = name
			
 
				+#define dupEntity(name) do { cmd->name = name; gc_entity_retain(name); } while (0);
			
 
				+
			
 
				+void soclEnqueueNDRangeKernel_task(void *descr[], void *args);
			
 
				+
			
 
				+command_ndrange_kernel command_ndrange_kernel_create (
			
 
				+		cl_kernel        kernel,
			
 
				+		cl_uint          work_dim,
			
 
				+		const size_t *   global_work_offset,
			
 
				+		const size_t *   global_work_size,
			
 
				+		const size_t *   local_work_size)
			
 
				+{
			
 
				+	command_ndrange_kernel cmd = malloc(sizeof(struct command_ndrange_kernel_t));
			
 
				+	command_init(cmd, CL_COMMAND_NDRANGE_KERNEL);
			
 
				+
			
 
				+	dupEntity(kernel);
			
 
				+	dup(work_dim);
			
 
				+	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
			
 
				+	nullOrDup(global_work_size, work_dim*sizeof(size_t));
			
 
				+	nullOrDup(local_work_size, work_dim*sizeof(size_t));
			
 
				+
			
 
				+   	/* Codelet */
			
 
				+   	cmd->codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
			
 
				+	starpu_codelet * codelet = cmd->codelet;
			
 
				+	codelet->where = STARPU_OPENCL;
			
 
				+	codelet->power_model = NULL;
			
 
				+	codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
			
 
				+	codelet->model = NULL;
			
 
				+
			
 
				+   	/* Kernel is mutable, so we duplicate its parameters... */
			
 
				+	cmd->num_args = kernel->num_args;
			
 
				+	cmd->arg_sizes = memdup(kernel->arg_size, sizeof(size_t) * kernel->num_args);
			
 
				+	cmd->arg_types = memdup(kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->num_args);
			
 
				+	cmd->args = memdup_deep_varsize_safe(kernel->arg_value, kernel->num_args, kernel->arg_size);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_ndrange_kernel command_task_create (cl_kernel kernel) {
			
 
				+
			
 
				+	static cl_uint task_work_dim = 3;
			
 
				+	static const size_t task_global_work_offset[3] = {0,0,0};
			
 
				+	static const size_t task_global_work_size[3] = {1,1,1};
			
 
				+	static const size_t * task_local_work_size = NULL;
			
 
				+
			
 
				+	command_ndrange_kernel cmd = command_ndrange_kernel_create(
			
 
				+			kernel, task_work_dim, task_global_work_offset,
			
 
				+			task_global_work_size, task_local_work_size);
			
 
				+
			
 
				+	/* This is the only difference with command_ndrange_kernel_create */
			
 
				+	cmd->_command.typ = CL_COMMAND_TASK;
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_marker command_barrier_create () {
			
 
				+
			
 
				+	command_marker cmd = malloc(sizeof(struct command_marker_t));
			
 
				+	command_init(cmd, CL_COMMAND_BARRIER);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_marker command_marker_create () {
			
 
				+
			
 
				+	command_marker cmd = malloc(sizeof(struct command_marker_t));
			
 
				+	command_init(cmd, CL_COMMAND_MARKER);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_map_buffer command_map_buffer_create(
			
 
				+		cl_mem buffer,
			
 
				+		cl_map_flags map_flags,
			
 
				+		size_t offset,
			
 
				+		size_t cb,
			
 
				+		cl_event event
			
 
				+		) {
			
 
				+
			
 
				+	command_map_buffer cmd = malloc(sizeof(struct command_map_buffer_t));
			
 
				+	command_init(cmd, CL_COMMAND_MAP_BUFFER);
			
 
				+
			
 
				+	dupEntity(buffer);
			
 
				+	dup(map_flags);
			
 
				+	dup(offset);
			
 
				+	dup(cb);
			
 
				+	dupEntity(event);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_unmap_mem_object command_unmap_mem_object_create(cl_mem buffer, void * ptr) {
			
 
				+	command_unmap_mem_object cmd = malloc(sizeof(struct command_unmap_mem_object_t));
			
 
				+	command_init(cmd, CL_COMMAND_UNMAP_MEM_OBJECT);
			
 
				+
			
 
				+	dupEntity(buffer);
			
 
				+	dup(ptr);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_read_buffer command_read_buffer_create(cl_mem buffer, size_t offset, size_t cb, void * ptr) {
			
 
				+
			
 
				+	command_read_buffer cmd = malloc(sizeof(struct command_read_buffer_t));
			
 
				+	command_init(cmd, CL_COMMAND_READ_BUFFER);
			
 
				+
			
 
				+	dupEntity(buffer);
			
 
				+	dup(offset);
			
 
				+	dup(cb);
			
 
				+	dup(ptr);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_write_buffer command_write_buffer_create(cl_mem buffer, size_t offset, size_t cb, const void * ptr) {
			
 
				+
			
 
				+	command_write_buffer cmd = malloc(sizeof(struct command_write_buffer_t));
			
 
				+	command_init(cmd, CL_COMMAND_WRITE_BUFFER);
			
 
				+
			
 
				+	dupEntity(buffer);
			
 
				+	dup(offset);
			
 
				+	dup(cb);
			
 
				+	dup(ptr);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+command_copy_buffer command_copy_buffer_create( cl_mem src_buffer, cl_mem dst_buffer,
			
 
				+		size_t src_offset, size_t dst_offset, size_t cb)
			
 
				+{
			
 
				+	command_copy_buffer cmd = malloc(sizeof(struct command_copy_buffer_t));
			
 
				+	command_init(cmd, CL_COMMAND_COPY_BUFFER);
			
 
				+
			
 
				+	dupEntity(src_buffer);
			
 
				+	dupEntity(dst_buffer);
			
 
				+	dup(src_offset);
			
 
				+	dup(dst_offset);
			
 
				+	dup(cb);
			
 
				+
			
 
				+	return cmd;
			
 
				+}
			
 
				+
			
 
				+#undef nullOrDup
			
 
				+#undef nodeNullOrDup
			
 
				+#undef dup
			
 
				+#undef dupEntity
			
 
				+#undef nodeDup
			
 
				+#undef memdup
			
 
				+
			
--- a/socl/src/command.h
+++ b/socl/src/command.h
@@ -0,0 +1,198 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOCL_COMMANDS_H
			
 
				+#define SOCL_COMMANDS_H
			
 
				+
			
 
				+typedef struct cl_command_t * cl_command;
			
 
				+
			
 
				+/**
			
 
				+ * Initialize a command structure
			
 
				+ *
			
 
				+ * Command constructors for each kind of command use this method
			
 
				+ * Implicit and explicit dependencies must be passed as parameters
			
 
				+ */
			
 
				+void command_init_ex(cl_command cmd, cl_command_type typ);
			
 
				+#define command_init(cmd,typ) \
			
 
				+	command_init_ex((cl_command)cmd,typ)
			
 
				+
			
 
				+/** Submit a command for execution */
			
 
				+void command_submit_ex(cl_command cmd);
			
 
				+#define command_submit(cmd) \
			
 
				+	command_submit_ex(&(cmd)->_command)
			
 
				+
			
 
				+/** Submit a command and its dependencies */
			
 
				+cl_int command_submit_deep_ex(cl_command cmd);
			
 
				+#define command_submit_deep(cmd) (command_submit_deep_ex((cl_command)cmd))
			
 
				+
			
 
				+void command_graph_dump_ex(cl_command cmd);
			
 
				+#define command_graph_dump(cmd) (command_graph_dump_ex((cl_command)cmd))
			
 
				+
			
 
				+/**************************
			
 
				+ * OpenCL Commands
			
 
				+ **************************/
			
 
				+struct cl_command_t {
			
 
				+	cl_command_type	typ;	 	/* Command type */
			
 
				+	cl_uint 	num_events;	/* Number of dependencies */
			
 
				+	cl_event * 	events;		/* Dependencies */
			
 
				+	cl_event  	event;		/* Event for this command */
			
 
				+	cl_command_queue cq;		/* Command queue the command is enqueued in */
			
 
				+	starpu_task	task;		/* Associated StarPU task, if any */
			
 
				+	char		submitted;	/* True if the command has been submitted to StarPU */
			
 
				+};
			
 
				+
			
 
				+#define command_type_get(cmd) (((cl_command)cmd)->typ)
			
 
				+#define command_event_get(cmd) (((cl_command)cmd)->event)
			
 
				+#define command_num_events_get(cmd) (((cl_command)cmd)->num_events)
			
 
				+#define command_events_get(cmd) (((cl_command)cmd)->events)
			
 
				+#define command_task_get(cmd) (((cl_command)cmd)->task)
			
 
				+#define command_cq_get(cmd) (((cl_command)cmd)->cq)
			
 
				+
			
 
				+#define CL_COMMAND struct cl_command_t _command;
			
 
				+
			
 
				+typedef struct command_ndrange_kernel_t {
			
 
				+	CL_COMMAND
			
 
				+
			
 
				+	cl_kernel        kernel;
			
 
				+	cl_uint          work_dim;
			
 
				+	const size_t *   global_work_offset;
			
 
				+	const size_t *   global_work_size;
			
 
				+	const size_t *   local_work_size;
			
 
				+	cl_uint 	 num_args;
			
 
				+	size_t *	 arg_sizes;
			
 
				+	enum kernel_arg_type * arg_types;
			
 
				+	void **		 args;
			
 
				+	starpu_codelet * codelet;
			
 
				+	cl_uint		 num_buffers;
			
 
				+	cl_mem *	 buffers;
			
 
				+} * command_ndrange_kernel;
			
 
				+
			
 
				+
			
 
				+typedef struct command_read_buffer_t {
			
 
				+	CL_COMMAND
			
 
				+	
			
 
				+	cl_mem buffer;
			
 
				+	size_t offset;
			
 
				+	size_t cb;
			
 
				+	void * ptr;
			
 
				+} * command_read_buffer;
			
 
				+
			
 
				+
			
 
				+typedef struct command_write_buffer_t {
			
 
				+	CL_COMMAND
			
 
				+
			
 
				+	cl_mem buffer;
			
 
				+	size_t offset;
			
 
				+	size_t cb;
			
 
				+	const void * ptr;
			
 
				+} * command_write_buffer;
			
 
				+
			
 
				+
			
 
				+typedef struct command_copy_buffer_t {
			
 
				+	CL_COMMAND
			
 
				+	
			
 
				+	cl_mem src_buffer;
			
 
				+	cl_mem dst_buffer;
			
 
				+	size_t src_offset;
			
 
				+	size_t dst_offset;
			
 
				+	size_t cb;
			
 
				+} * command_copy_buffer;
			
 
				+
			
 
				+
			
 
				+typedef struct command_map_buffer_t {
			
 
				+	CL_COMMAND
			
 
				+
			
 
				+	cl_mem buffer;
			
 
				+	cl_map_flags map_flags;
			
 
				+	size_t offset;
			
 
				+	size_t cb;
			
 
				+	cl_event event;
			
 
				+} * command_map_buffer;
			
 
				+
			
 
				+
			
 
				+typedef struct command_unmap_mem_object_t {
			
 
				+	CL_COMMAND
			
 
				+
			
 
				+	cl_mem buffer;
			
 
				+	void * ptr;
			
 
				+} * command_unmap_mem_object;
			
 
				+
			
 
				+
			
 
				+typedef struct command_marker_t {
			
 
				+	CL_COMMAND
			
 
				+} * command_marker;
			
 
				+
			
 
				+/*************************
			
 
				+ * Constructor functions
			
 
				+ *************************/
			
 
				+
			
 
				+command_ndrange_kernel command_ndrange_kernel_create (
			
 
				+		cl_kernel        kernel,
			
 
				+		cl_uint          work_dim,
			
 
				+		const size_t *   global_work_offset,
			
 
				+		const size_t *   global_work_size,
			
 
				+		const size_t *   local_work_size);
			
 
				+
			
 
				+command_ndrange_kernel command_task_create (cl_kernel kernel);
			
 
				+
			
 
				+command_marker command_barrier_create ();
			
 
				+
			
 
				+command_marker command_marker_create ();
			
 
				+
			
 
				+command_map_buffer command_map_buffer_create(
			
 
				+		cl_mem buffer,
			
 
				+		cl_map_flags map_flags,
			
 
				+		size_t offset,
			
 
				+		size_t cb,
			
 
				+		cl_event event);
			
 
				+
			
 
				+command_unmap_mem_object command_unmap_mem_object_create(
			
 
				+		cl_mem buffer,
			
 
				+		void * ptr);
			
 
				+
			
 
				+command_read_buffer command_read_buffer_create(
			
 
				+		cl_mem buffer,
			
 
				+		size_t offset,
			
 
				+		size_t cb,
			
 
				+		void * ptr);
			
 
				+
			
 
				+command_write_buffer command_write_buffer_create(
			
 
				+		cl_mem buffer,
			
 
				+		size_t offset,
			
 
				+		size_t cb,
			
 
				+		const void * ptr);
			
 
				+
			
 
				+command_copy_buffer command_copy_buffer_create(
			
 
				+		cl_mem src_buffer,
			
 
				+		cl_mem dst_buffer,
			
 
				+		size_t src_offset,
			
 
				+		size_t dst_offset,
			
 
				+		size_t cb);
			
 
				+
			
 
				+/*************************
			
 
				+ * Submit functions
			
 
				+ *************************/
			
 
				+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd);
			
 
				+cl_int command_read_buffer_submit(command_read_buffer cmd);
			
 
				+cl_int command_write_buffer_submit(command_write_buffer cmd);
			
 
				+cl_int command_copy_buffer_submit(command_copy_buffer cmd);
			
 
				+cl_int command_map_buffer_submit(command_map_buffer cmd);
			
 
				+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd);
			
 
				+cl_int command_marker_submit(command_marker cmd);
			
 
				+
			
 
				+
			
 
				+#endif /* SOCL_COMMANDS_H */
			
 
				+
			
--- a/socl/src/command_list.c
+++ b/socl/src/command_list.c
@@ -0,0 +1,40 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "socl.h"
			
 
				+
			
 
				+command_list command_list_cons(cl_command cmd, command_list ls) {
			
 
				+	command_list e = malloc(sizeof(struct command_list_t));
			
 
				+	e->cmd = cmd;
			
 
				+	e->next = ls;
			
 
				+	if (ls != NULL)
			
 
				+		ls->prev = e;
			
 
				+	return e;
			
 
				+}
			
 
				+
			
 
				+command_list command_list_remove(command_list l, cl_command cmd) {
			
 
				+	command_list e = l;
			
 
				+	while (e != NULL) {
			
 
				+		if (e->cmd == cmd) {
			
 
				+			if (e->prev != NULL) e->prev->next = e->next;
			
 
				+			if (e->next != NULL) e->next->prev = e->prev;
			
 
				+			command_list next = e->next;
			
 
				+			free(e);
			
 
				+			if (e == l) return next;
			
 
				+		}
			
 
				+	}
			
 
				+	return l;
			
 
				+}
			
--- a/socl/src/command_list.h
+++ b/socl/src/command_list.h
@@ -0,0 +1,28 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "socl.h"
			
 
				+
			
 
				+typedef struct command_list_t * command_list;
			
 
				+
			
 
				+struct command_list_t {
			
 
				+	cl_command cmd;
			
 
				+	command_list next;
			
 
				+	command_list prev;
			
 
				+};
			
 
				+
			
 
				+command_list command_list_cons(cl_command cmd, command_list ls);
			
 
				+command_list command_list_remove(command_list l, cl_command cmd);
			
--- a/socl/src/command_queue.c
+++ b/socl/src/command_queue.c
@@ -24,74 +24,145 @@
 
				  * its command queue.
			
 
				  */
			
 
				 
			
 
				+
			
 
				 /**
			
 
				- * Enqueue the given task but put fake_event into the command queue.
			
 
				- * This is used when a tag notified by application is used (cf clEnqueueMapBuffer, etc.)
			
 
				+ * Returned implicit dependencies for a task
			
 
				+ * Command queue must be locked!
			
 
				  */
			
 
				-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event) {
			
 
				-
			
 
				-  int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
			
 
				-
			
 
				-  /* Set explicit task dependencies */
			
 
				-  task_dependency_add(task, num_events, events);
			
 
				-
			
 
				-  /* Lock command queue */
			
 
				-  pthread_spin_lock(&cq->spin);
			
 
				+void command_queue_dependencies_implicit(
			
 
				+	cl_command_queue cq, 	/* Command queue */
			
 
				+	char is_barrier,	/* Is the task a barrier */
			
 
				+	cl_int * ret_num_events,	/* Returned number of dependencies */
			
 
				+	cl_event ** ret_events	/* Returned dependencies */
			
 
				+) {
			
 
				+
			
 
				+	/*********************
			
 
				+	 * Count dependencies
			
 
				+	 *********************/
			
 
				+	int ndeps = 0;
			
 
				+
			
 
				+	/* Add dependency to last barrier if applicable */
			
 
				+	if (cq->barrier != NULL)
			
 
				+		ndeps++;
			
 
				+
			
 
				+	/* Add dependencies to out-of-order events (if any) */
			
 
				+	if (is_barrier) {
			
 
				+		command_list cl = cq->commands;
			
 
				+		while (cl != NULL) {
			
 
				+			ndeps++;
			
 
				+			cl = cl->next;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/*********************
			
 
				+	 * Return dependencies
			
 
				+	 *********************/
			
 
				+
			
 
				+	cl_event * evs = malloc(ndeps * sizeof(cl_event));
			
 
				+	int n = 0;
			
 
				+
			
 
				+	/* Add dependency to last barrier if applicable */
			
 
				+	if (cq->barrier != NULL)
			
 
				+		evs[n++] = cq->barrier->event;
			
 
				+
			
 
				+	/* Add dependencies to out-of-order events (if any) */
			
 
				+	if (is_barrier) {
			
 
				+		command_list cl = cq->commands;
			
 
				+		while (cl != NULL) {
			
 
				+			evs[n++] = cl->cmd->event;
			
 
				+			cl = cl->next;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	*ret_num_events = ndeps;
			
 
				+	*ret_events = evs;
			
 
				+}
			
 
				+	
			
 
				+/**
			
 
				+ * Insert a command in the command queue
			
 
				+ * The command queue must be locked!
			
 
				+ */
			
 
				+void command_queue_insert(
			
 
				+	cl_command_queue 	cq, 	/* Command queue */
			
 
				+	cl_command 		cmd,	/* Command */
			
 
				+	int 			is_barrier		/* Is the task a barrier */
			
 
				+) {
			
 
				 
			
 
				-  /* Add dependency to last barrier if applicable */
			
 
				-  if (cq->barrier != NULL)
			
 
				-    task_dependency_add(task, 1, &cq->barrier);
			
 
				+	int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
			
 
				 
			
 
				-  /* Add dependencies to out-of-order events (if any) */
			
 
				-  if (barrier) {
			
 
				-    while (cq->events != NULL) {
			
 
				-      task_dependency_add(task, 1, &cq->events);
			
 
				-      cq->events = cq->events->next;
			
 
				-    }
			
 
				-  }
			
 
				+	if (is_barrier)
			
 
				+		cq->commands = NULL;
			
 
				 
			
 
				-  cl_event ev = (fake_event == NULL ? task_event(task) : fake_event);
			
 
				+	/* Add command to the list of out-of-order commands */
			
 
				+	if (!in_order)
			
 
				+		cq->commands = command_list_cons(cmd, cq->commands);
			
 
				 
			
 
				-  /* Add event to the list of out-of-order events */
			
 
				-  if (!in_order) {
			
 
				-    ev->next = cq->events;
			
 
				-    ev->prev = NULL;
			
 
				-    if (cq->events != NULL)
			
 
				-      cq->events->prev = ev;
			
 
				-    cq->events = ev;
			
 
				-  }
			
 
				+	/* Register this event as last barrier */
			
 
				+	if (is_barrier || in_order)
			
 
				+		cq->barrier = cmd;
			
 
				 
			
 
				-  /* Register this event as last barrier */
			
 
				-  if (barrier || in_order)
			
 
				-    cq->barrier = ev;
			
 
				+	/* Add reference to the command queue */
			
 
				+	gc_entity_store(&cmd->event->cq, cq);
			
 
				+}
			
 
				 
			
 
				-   /* Unlock command queue */
			
 
				-   pthread_spin_unlock(&cq->spin);
			
 
				+/**
			
 
				+ * Return implicit and explicit dependencies for a task
			
 
				+ * The command queue must be locked!
			
 
				+ */
			
 
				+void command_queue_dependencies(
			
 
				+	cl_command_queue 	cq,		/* Command queue */
			
 
				+	int 			is_barrier,	/* Is the task a barrier */
			
 
				+	cl_int 			num_events,	/* Number of explicit dependencies */
			
 
				+	const cl_event *	events,		/* Explicit dependencies */
			
 
				+	cl_int * 		ret_num_events,	/* Returned number of dependencies */
			
 
				+	cl_event ** 		ret_events	/* Returned dependencies */
			
 
				+) {
			
 
				+	cl_int implicit_num_events;
			
 
				+	cl_event * implicit_events;
			
 
				+
			
 
				+	/* Implicit dependencies */
			
 
				+	command_queue_dependencies_implicit(cq, is_barrier, &implicit_num_events, &implicit_events);
			
 
				+
			
 
				+	/* Explicit dependencies */
			
 
				+	cl_int ndeps = implicit_num_events + num_events;
			
 
				+	cl_event * evs = malloc(sizeof(cl_event) * ndeps);
			
 
				+	memcpy(evs, implicit_events, sizeof(cl_event) * implicit_num_events);
			
 
				+	memcpy(&evs[implicit_num_events], events, sizeof(cl_event) * num_events);
			
 
				+
			
 
				+	*ret_num_events = ndeps;
			
 
				+	*ret_events = evs;
			
 
				+}
			
 
				 
			
 
				-   /* Add reference to the command queue */
			
 
				-   gc_entity_store(&ev->cq, cq);
			
 
				+void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_events, const cl_event * events) {
			
 
				 
			
 
				-   /* Submit task */
			
 
				-   gc_entity_retain(task_event(task));
			
 
				-   int ret = starpu_task_submit(task);
			
 
				-   if (ret != 0)
			
 
				-      DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
			
 
				+	/* Check if the command is a barrier */
			
 
				+	int is_barrier = 0;
			
 
				+	if (cmd->typ == CL_COMMAND_BARRIER) {
			
 
				+		is_barrier = 1;
			
 
				+		/* OpenCL has no CL_COMMAND_BARRIER type, so we fall back on CL_COMMAND_MARKER */
			
 
				+		cmd->typ = CL_COMMAND_MARKER;
			
 
				+	}
			
 
				 
			
 
				-   return CL_SUCCESS;
			
 
				-}
			
 
				+	/* Set command queue field */
			
 
				+	cmd->cq = cq;
			
 
				 
			
 
				-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events) {
			
 
				-  return command_queue_enqueue_fakeevent(cq, task, barrier, num_events, events, NULL);
			
 
				-}
			
 
				+	/* Lock command queue */
			
 
				+	pthread_mutex_lock(&cq->mutex);
			
 
				 
			
 
				+	//FIXME: crappy separation (command_queue_dependencies + command_queue_insert)
			
 
				 
			
 
				-cl_event enqueueBarrier(cl_command_queue cq) {
			
 
				+	/* Get all (explicit + implicit) dependencies */
			
 
				+	cl_int all_num_events;
			
 
				+	cl_event * all_events;
			
 
				+	command_queue_dependencies(cq, is_barrier, num_events, events, &all_num_events, &all_events);
			
 
				 
			
 
				-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_BARRIER doesn't exist
			
 
				-   starpu_task * task = task_create(CL_COMMAND_MARKER);
			
 
				+	/* Make all dependencies explicit for the command */
			
 
				+	cmd->num_events = all_num_events;
			
 
				+	cmd->events = all_events;
			
 
				 
			
 
				-   DEBUG_MSG("Submitting barrier task (event %d)\n", task->tag_id);
			
 
				-   command_queue_enqueue(cq, task, 1, 0, NULL);
			
 
				+	/* Insert command in the queue */
			
 
				+	command_queue_insert(cq, cmd, is_barrier);
			
 
				 
			
 
				-   return task_event(task);
			
 
				+	/* Unlock command queue */
			
 
				+	pthread_mutex_unlock(&cq->mutex);
			
 
				 }
			
--- a/socl/src/command_queue.h
+++ b/socl/src/command_queue.h
@@ -17,10 +17,14 @@
 
				 #ifndef SOCL_COMMAND_QUEUE_H
			
 
				 #define SOCl_COMMAND_QUEUE_H
			
 
				 
			
 
				-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events);
			
 
				+void command_queue_enqueue_ex(
			
 
				+	cl_command_queue 	cq,		/* Command queue */
			
 
				+	cl_command		cmd,		/* Command to enqueue */
			
 
				+	cl_uint			num_events,	/* Number of explicit dependencies */
			
 
				+	const cl_event *	events		/* Explicit dependencies */
			
 
				+	);
			
 
				 
			
 
				-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event);
			
 
				-
			
 
				-cl_event enqueueBarrier(cl_command_queue cq);
			
 
				+#define command_queue_enqueue(cq, cmd, num_events, events)\
			
 
				+	command_queue_enqueue_ex(cq, (cl_command)cmd, num_events, events)
			
 
				 
			
 
				 #endif /* SOCl_COMMAND_QUEUE_H */
			
--- a/socl/src/debug.c
+++ b/socl/src/debug.c
@@ -0,0 +1,76 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "socl.h"
			
 
				+
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+void DEBUG_CL(char *s, cl_int err) {
			
 
				+   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
			
 
				+   switch(err) {
			
 
				+      case CL_SUCCESS:
			
 
				+         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
			
 
				+         break;
			
 
				+      ERR_CASE(CL_DEVICE_NOT_FOUND)
			
 
				+      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
			
 
				+      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
			
 
				+      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
			
 
				+      ERR_CASE(CL_OUT_OF_RESOURCES)
			
 
				+      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
			
 
				+      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
			
 
				+      ERR_CASE(CL_MEM_COPY_OVERLAP)
			
 
				+      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
			
 
				+      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
			
 
				+      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
			
 
				+      ERR_CASE(CL_MAP_FAILURE)
			
 
				+      ERR_CASE(CL_INVALID_VALUE)
			
 
				+      ERR_CASE(CL_INVALID_DEVICE_TYPE)
			
 
				+      ERR_CASE(CL_INVALID_PLATFORM)
			
 
				+      ERR_CASE(CL_INVALID_DEVICE)
			
 
				+      ERR_CASE(CL_INVALID_CONTEXT)
			
 
				+      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
			
 
				+      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
			
 
				+      ERR_CASE(CL_INVALID_HOST_PTR)
			
 
				+      ERR_CASE(CL_INVALID_MEM_OBJECT)
			
 
				+      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
			
 
				+      ERR_CASE(CL_INVALID_IMAGE_SIZE)
			
 
				+      ERR_CASE(CL_INVALID_SAMPLER)
			
 
				+      ERR_CASE(CL_INVALID_BINARY)
			
 
				+      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
			
 
				+      ERR_CASE(CL_INVALID_PROGRAM)
			
 
				+      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
			
 
				+      ERR_CASE(CL_INVALID_KERNEL_NAME)
			
 
				+      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
			
 
				+      ERR_CASE(CL_INVALID_KERNEL)
			
 
				+      ERR_CASE(CL_INVALID_ARG_INDEX)
			
 
				+      ERR_CASE(CL_INVALID_ARG_VALUE)
			
 
				+      ERR_CASE(CL_INVALID_ARG_SIZE)
			
 
				+      ERR_CASE(CL_INVALID_KERNEL_ARGS)
			
 
				+      ERR_CASE(CL_INVALID_WORK_DIMENSION)
			
 
				+      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
			
 
				+      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
			
 
				+      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
			
 
				+      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
			
 
				+      ERR_CASE(CL_INVALID_EVENT)
			
 
				+      ERR_CASE(CL_INVALID_OPERATION)
			
 
				+      ERR_CASE(CL_INVALID_GL_OBJECT)
			
 
				+      ERR_CASE(CL_INVALID_BUFFER_SIZE)
			
 
				+      ERR_CASE(CL_INVALID_MIP_LEVEL)
			
 
				+      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
			
 
				+      default:
			
 
				+         DEBUG_MSG("%s CL error: Error message not supported by DEBUG_CL macro (%d).\n", s, err);
			
 
				+   }
			
 
				+}
			
 
				+#endif
			
--- a/socl/src/debug.h
+++ b/socl/src/debug.h
@@ -17,10 +17,12 @@
 
				 #ifndef SOCL_DEBUG_H
			
 
				 #define SOCL_DEBUG_H
			
 
				 
			
 
				+#include <../src/common/config.h>
			
 
				+
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 #define DEBUG
			
 
				 #include <stdio.h>
			
 
				-   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__); } while (0);
			
 
				+   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__);} while (0);
			
 
				    #define DEBUG_MSG_NOHEAD(...) fprintf(stderr, __VA_ARGS__)
			
 
				    #define DEBUG_ERROR(...) do { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); exit(1); } while (0);
			
 
				 #else
			
@@ -35,62 +37,7 @@
 
				 #define ERROR_STOP(...) do { ERROR_MSG(__VA_ARGS__); exit(1); } while(0);
			
 
				 
			
 
				 #ifdef STARPU_VERBOSE
			
 
				-void DEBUG_CL(char *s, cl_int err) {
			
 
				-   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
			
 
				-   switch(err) {
			
 
				-      case CL_SUCCESS:
			
 
				-         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
			
 
				-         break;
			
 
				-      ERR_CASE(CL_DEVICE_NOT_FOUND)
			
 
				-      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
			
 
				-      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
			
 
				-      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
			
 
				-      ERR_CASE(CL_OUT_OF_RESOURCES)
			
 
				-      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
			
 
				-      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
			
 
				-      ERR_CASE(CL_MEM_COPY_OVERLAP)
			
 
				-      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
			
 
				-      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
			
 
				-      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
			
 
				-      ERR_CASE(CL_MAP_FAILURE)
			
 
				-      ERR_CASE(CL_INVALID_VALUE)
			
 
				-      ERR_CASE(CL_INVALID_DEVICE_TYPE)
			
 
				-      ERR_CASE(CL_INVALID_PLATFORM)
			
 
				-      ERR_CASE(CL_INVALID_DEVICE)
			
 
				-      ERR_CASE(CL_INVALID_CONTEXT)
			
 
				-      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
			
 
				-      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
			
 
				-      ERR_CASE(CL_INVALID_HOST_PTR)
			
 
				-      ERR_CASE(CL_INVALID_MEM_OBJECT)
			
 
				-      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
			
 
				-      ERR_CASE(CL_INVALID_IMAGE_SIZE)
			
 
				-      ERR_CASE(CL_INVALID_SAMPLER)
			
 
				-      ERR_CASE(CL_INVALID_BINARY)
			
 
				-      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
			
 
				-      ERR_CASE(CL_INVALID_PROGRAM)
			
 
				-      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
			
 
				-      ERR_CASE(CL_INVALID_KERNEL_NAME)
			
 
				-      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
			
 
				-      ERR_CASE(CL_INVALID_KERNEL)
			
 
				-      ERR_CASE(CL_INVALID_ARG_INDEX)
			
 
				-      ERR_CASE(CL_INVALID_ARG_VALUE)
			
 
				-      ERR_CASE(CL_INVALID_ARG_SIZE)
			
 
				-      ERR_CASE(CL_INVALID_KERNEL_ARGS)
			
 
				-      ERR_CASE(CL_INVALID_WORK_DIMENSION)
			
 
				-      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
			
 
				-      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
			
 
				-      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
			
 
				-      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
			
 
				-      ERR_CASE(CL_INVALID_EVENT)
			
 
				-      ERR_CASE(CL_INVALID_OPERATION)
			
 
				-      ERR_CASE(CL_INVALID_GL_OBJECT)
			
 
				-      ERR_CASE(CL_INVALID_BUFFER_SIZE)
			
 
				-      ERR_CASE(CL_INVALID_MIP_LEVEL)
			
 
				-      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
			
 
				-      default:
			
 
				-         DEBUG_MSG("%s CL error: Error message not supported by print_cl_error (%d).\n", s, err);
			
 
				-   }
			
 
				-}
			
 
				+void DEBUG_CL(char *s, cl_int err);
			
 
				 #else
			
 
				    #define DEBUG_CL(...) while(0);
			
 
				 #endif
			
--- a/socl/src/event.c
+++ b/socl/src/event.c
@@ -20,21 +20,24 @@
 
				 
			
 
				 static void release_callback_event(void * e);
			
 
				 
			
 
				+int event_unique_id() {
			
 
				+   static int id = 1;
			
 
				+
			
 
				+   return __sync_fetch_and_add(&id,1);
			
 
				+}
			
 
				+
			
 
				 /**
			
 
				  * Create a new event
			
 
				  *
			
 
				  * Events have one-to-one relation with tag. Tag number is event ID
			
 
				  */
			
 
				 cl_event event_create(void) {
			
 
				-   static int id = 1;
			
 
				    cl_event ev;
			
 
				    ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event);
			
 
				 
			
 
				-   ev->next = NULL;
			
 
				-   ev->prev = NULL;
			
 
				-   ev->id = __sync_fetch_and_add(&id,1);
			
 
				+   ev->id = event_unique_id();
			
 
				    ev->status = CL_SUBMITTED;
			
 
				-   ev->type = 0;
			
 
				+   ev->command = NULL;
			
 
				    ev->profiling_info = NULL;
			
 
				    ev->cq = NULL;
			
 
				 
			
@@ -49,22 +52,17 @@ static void release_callback_event(void * e) {
 
				   /* Remove from command queue */
			
 
				   if (cq != NULL) {
			
 
				     /* Lock command queue */
			
 
				-    pthread_spin_lock(&cq->spin);
			
 
				+    pthread_mutex_lock(&cq->mutex);
			
 
				 
			
 
				     /* Remove barrier if applicable */
			
 
				-    if (cq->barrier == event)
			
 
				+    if (cq->barrier == event->command)
			
 
				       cq->barrier = NULL;
			
 
				 
			
 
				-    /* Remove from the list of out-of-order events */
			
 
				-    if (event->prev != NULL)
			
 
				-      event->prev->next = event->next;
			
 
				-    if (event->next != NULL)
			
 
				-      event->next->prev = event->prev;
			
 
				-    if (cq->events == event)
			
 
				-      cq->events = event->next;
			
 
				+    /* Remove from the list of out-of-order commands */
			
 
				+    cq->commands = command_list_remove(cq->commands, event->command);
			
 
				 
			
 
				     /* Unlock command queue */
			
 
				-    pthread_spin_unlock(&cq->spin);
			
 
				+    pthread_mutex_unlock(&cq->mutex);
			
 
				 
			
 
				     gc_entity_unstore(&cq);
			
 
				   }
			
--- a/socl/src/event.h
+++ b/socl/src/event.h
@@ -26,4 +26,9 @@
 
				  */
			
 
				 cl_event event_create(void);
			
 
				 
			
 
				+/**
			
 
				+ * Generate a unique tag id
			
 
				+ */
			
 
				+int event_unique_id();
			
 
				+
			
 
				 #endif /* SOCL_EVENT_H */
			
--- a/socl/src/gc.c
+++ b/socl/src/gc.c
@@ -104,13 +104,15 @@ void gc_stop(void) {
 
				   pthread_join(gc_thread, NULL);
			
 
				 }
			
 
				 
			
 
				-void gc_entity_release_ex(entity e) {
			
 
				+int gc_entity_release_ex(entity e) {
			
 
				 
			
 
				   /* Decrement reference count */
			
 
				   int refs = __sync_sub_and_fetch(&e->refs, 1);
			
 
				 
			
 
				   if (refs != 0)
			
 
				-    return;
			
 
				+    return 0;
			
 
				+
			
 
				+  DEBUG_MSG("Releasing entity %lx\n", e);
			
 
				 
			
 
				   GC_LOCK;
			
 
				 
			
@@ -127,6 +129,8 @@ void gc_entity_release_ex(entity e) {
 
				   gc_list = e;
			
 
				 
			
 
				   GC_UNLOCK;
			
 
				+
			
 
				+  return 1;
			
 
				 }
			
 
				 
			
 
				 
			
--- a/socl/src/gc.h
+++ b/socl/src/gc.h
@@ -27,7 +27,7 @@ void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*));
 
				 void gc_entity_retain(void *arg);
			
 
				 
			
 
				 /** Decrement reference counter and release entity if applicable */
			
 
				-void gc_entity_release_ex(entity e);
			
 
				+int gc_entity_release_ex(entity e);
			
 
				 
			
 
				 int gc_active_entity_count(void);
			
 
				 
			
--- a/socl/src/graph.c
+++ b/socl/src/graph.c
@@ -1,123 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "socl.h"
			
 
				-#include "graph.h"
			
 
				-#include "event.h"
			
 
				-
			
 
				-static pthread_spinlock_t graph_lock;
			
 
				-static graph_node graph_nodes = NULL;
			
 
				-
			
 
				-
			
 
				-/**
			
 
				- * Initialize graph structure
			
 
				- */
			
 
				-void graph_init(void) {
			
 
				-	pthread_spin_init(&graph_lock, PTHREAD_PROCESS_PRIVATE);
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * Release graph structure
			
 
				- */
			
 
				-void graph_destroy(void) {
			
 
				-	pthread_spin_destroy(&graph_lock);
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * Initialize a graph node
			
 
				- */
			
 
				-void graph_node_init(graph_node node) {
			
 
				-	node->id = -1;
			
 
				-	node->next = NULL;
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * Store a node in the graph
			
 
				- */
			
 
				-void graph_store(void * node) {
			
 
				-	pthread_spin_lock(&graph_lock);
			
 
				-
			
 
				-	graph_node n = (graph_node)node;
			
 
				-	n->next = graph_nodes;
			
 
				-	graph_nodes = n;
			
 
				-
			
 
				-	pthread_spin_unlock(&graph_lock);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-
			
 
				-/**
			
 
				- * Duplicate a memory area into a fresh allocated buffer
			
 
				- */
			
 
				-static void * memdupa(const void *p, size_t size) {
			
 
				-	void * s = malloc(size);
			
 
				-	memcpy(s,p,size);
			
 
				-	return s;
			
 
				-}
			
 
				-
			
 
				-#define memdup(p, size) ((typeof(p))memdupa(p,size))
			
 
				-#define nullOrDup(name,size) s->name = (name == NULL ? NULL : memdup(name,size))
			
 
				-#define dup(name) s->name = name
			
 
				-
			
 
				-
			
 
				-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
			
 
				-		cl_command_queue cq,
			
 
				-		cl_kernel        kernel,
			
 
				-		cl_uint          work_dim,
			
 
				-		const size_t *   global_work_offset,
			
 
				-		const size_t *   global_work_size,
			
 
				-		const size_t *   local_work_size,
			
 
				-		cl_uint          num_events,
			
 
				-		const cl_event * events,
			
 
				-		cl_event *       event,
			
 
				-		cl_uint 		num_args,
			
 
				-		size_t *		arg_sizes,
			
 
				-		enum kernel_arg_type * arg_types,
			
 
				-		void **		args)
			
 
				-{
			
 
				-	node_enqueue_kernel s = malloc(sizeof(struct node_enqueue_kernel_t));
			
 
				-	graph_node_init(&s->node);
			
 
				-	s->node.id = NODE_ENQUEUE_KERNEL;
			
 
				-
			
 
				-	dup(is_task);
			
 
				-	dup(cq);
			
 
				-	dup(kernel);
			
 
				-	dup(work_dim);
			
 
				-	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
			
 
				-	nullOrDup(global_work_size, work_dim*sizeof(size_t));
			
 
				-	nullOrDup(local_work_size, work_dim*sizeof(size_t));
			
 
				-	dup(num_events);
			
 
				-	nullOrDup(events, num_events * sizeof(cl_event));
			
 
				-	dup(num_args);
			
 
				-	nullOrDup(arg_sizes, num_args * sizeof(size_t));
			
 
				-	nullOrDup(arg_types, num_args * sizeof(enum kernel_arg_type));
			
 
				-	nullOrDup(args, num_args * sizeof(void*));
			
 
				-
			
 
				-	
			
 
				-	if (event != NULL) {
			
 
				-		*event = event_create();
			
 
				-		s->event = event;
			
 
				-	}
			
 
				-	else {
			
 
				-		s->event = NULL;
			
 
				-	}
			
 
				-
			
 
				-	return s;
			
 
				-}
			
 
				-
			
 
				-#undef nullOrDup
			
 
				-#undef memdup
			
 
				-#undef dup
			
--- a/socl/src/graph.h
+++ b/socl/src/graph.h
@@ -1,73 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef SOCL_GRAPH_H
			
 
				-#define SOCL_GRAPH_H
			
 
				-
			
 
				-#include "socl.h"
			
 
				-
			
 
				-typedef struct graph_node_t * graph_node;
			
 
				-
			
 
				-struct graph_node_t {
			
 
				-	int id; /* Kind of node */
			
 
				-	graph_node next; /* Linked-list of nodes... */
			
 
				-};
			
 
				-
			
 
				-void graph_init(void);
			
 
				-void graph_destroy(void);
			
 
				-void graph_node_init(graph_node node);
			
 
				-void graph_store(void * node);
			
 
				-
			
 
				-#define NODE_ENQUEUE_KERNEL 1
			
 
				-
			
 
				-
			
 
				-typedef struct node_enqueue_kernel_t {
			
 
				-	struct graph_node_t node;
			
 
				-
			
 
				-	char 		 is_task; /* Set if clEnqueueTask is used */
			
 
				-	cl_command_queue cq;
			
 
				-	cl_kernel        kernel;
			
 
				-	cl_uint          work_dim;
			
 
				-	const size_t *   global_work_offset;
			
 
				-	const size_t *   global_work_size;
			
 
				-	const size_t *   local_work_size;
			
 
				-	cl_uint          num_events;
			
 
				-	const cl_event * events;
			
 
				-	cl_event * 	 event;
			
 
				-	cl_uint 	 num_args;
			
 
				-	size_t *	 arg_sizes;
			
 
				-	enum kernel_arg_type * arg_types;
			
 
				-	void **		 args;
			
 
				-} * node_enqueue_kernel;
			
 
				-
			
 
				-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
			
 
				-		cl_command_queue cq,
			
 
				-		cl_kernel        kernel,
			
 
				-		cl_uint          work_dim,
			
 
				-		const size_t *   global_work_offset,
			
 
				-		const size_t *   global_work_size,
			
 
				-		const size_t *   local_work_size,
			
 
				-		cl_uint          num_events,
			
 
				-		const cl_event * events,
			
 
				-		cl_event *       event,
			
 
				-		cl_uint 		num_args,
			
 
				-		size_t *		arg_sizes,
			
 
				-		enum kernel_arg_type * arg_types,
			
 
				-		void **		args);
			
 
				-
			
 
				-cl_int node_play_enqueue_kernel(node_enqueue_kernel n);
			
 
				-
			
 
				-#endif /* SOCL_GRAPH_H */
			
--- a/socl/src/init.c
+++ b/socl/src/init.c
@@ -15,7 +15,6 @@
 
				  */
			
 
				 
			
 
				 #include "socl.h"
			
 
				-#include "graph.h"
			
 
				 #include "gc.h"
			
 
				 #include "mem_objects.h"
			
 
				 
			
@@ -25,7 +24,6 @@
 
				 __attribute__((constructor)) static void socl_init() {
			
 
				   
			
 
				   mem_object_init();
			
 
				-  graph_init();
			
 
				 
			
 
				   starpu_init(NULL);
			
 
				   
			
@@ -51,7 +49,5 @@ __attribute__((destructor)) static void socl_shutdown() {
 
				   if (active_entities != 0)
			
 
				     fprintf(stderr, "Unreleased entities: %d\n", active_entities);
			
 
				 
			
 
				-  graph_destroy();
			
 
				-
			
 
				   starpu_shutdown();
			
 
				 }
			
--- a/socl/src/socl.h
+++ b/socl/src/socl.h
@@ -23,6 +23,9 @@
 
				 #include CL_HEADERS "CL/cl.h"
			
 
				 #endif
			
 
				 
			
 
				+/* Additional command type */
			
 
				+#define CL_COMMAND_BARRIER 0x99987
			
 
				+
			
 
				 #include <string.h>
			
 
				 #include <stdlib.h>
			
 
				 #include <stdint.h>
			
@@ -35,7 +38,7 @@
 
				 #include <starpu_profiling.h>
			
 
				 #include <starpu_task.h>
			
 
				 
			
 
				-typedef struct starpu_task starpu_task;
			
 
				+typedef struct starpu_task * starpu_task;
			
 
				 
			
 
				 #ifdef UNUSED
			
 
				 #elif defined(__GNUC__)
			
@@ -50,12 +53,13 @@ typedef struct starpu_task starpu_task;
 
				  */
			
 
				 typedef struct entity * entity;
			
 
				 
			
 
				+#include "command.h"
			
 
				+#include "command_list.h"
			
 
				 #include "command_queue.h"
			
 
				 #include "debug.h"
			
 
				 #include "devices.h"
			
 
				 #include "event.h"
			
 
				 #include "gc.h"
			
 
				-#include "graph.h"
			
 
				 #include "mem_objects.h"
			
 
				 #include "task.h"
			
 
				 #include "util.h"
			
@@ -79,11 +83,32 @@ struct entity {
 
				 
			
 
				 struct _cl_platform_id {};
			
 
				 
			
 
				-#define RETURN_EVENT(ev, event) \
			
 
				-   if (event != NULL) \
			
 
				-      *event = ev; \
			
 
				-   else\
			
 
				-      gc_entity_release(ev);
			
 
				+#define RETURN_EVENT(cmd, event) \
			
 
				+	if (event != NULL) { \
			
 
				+		cl_event ev = command_event_get(cmd);\
			
 
				+		gc_entity_retain(ev);\
			
 
				+		*event = ev; \
			
 
				+	}
			
 
				+
			
 
				+#define RETURN_CUSTOM_EVENT(src, tgt) \
			
 
				+	if (tgt != NULL) { \
			
 
				+		gc_entity_retain(src); \
			
 
				+		*tgt = src; \
			
 
				+	}
			
 
				+
			
 
				+#define MAY_BLOCK(blocking) \
			
 
				+	if ((blocking) == CL_TRUE) {\
			
 
				+		cl_event ev = command_event_get(cmd);\
			
 
				+		soclWaitForEvents(1, &ev);\
			
 
				+		gc_entity_release(ev);\
			
 
				+	}
			
 
				+
			
 
				+#define MAY_BLOCK_CUSTOM(blocking,event) \
			
 
				+	if ((blocking) == CL_TRUE) {\
			
 
				+		cl_event ev = (event);\
			
 
				+		soclWaitForEvents(1, &ev);\
			
 
				+		gc_entity_release(ev);\
			
 
				+	}
			
 
				 
			
 
				 /* Constants */
			
 
				 struct _cl_platform_id socl_platform;
			
@@ -121,14 +146,14 @@ struct _cl_command_queue {
 
				   cl_device_id device;
			
 
				   cl_context context;
			
 
				 
			
 
				-  /* Stored command events */
			
 
				-  cl_event events;
			
 
				+  /* Stored commands */
			
 
				+  command_list commands;
			
 
				 
			
 
				   /* Last enqueued barrier-like event */
			
 
				-  cl_event barrier;
			
 
				+  cl_command barrier;
			
 
				 
			
 
				   /* Mutex */
			
 
				-  pthread_spinlock_t spin;
			
 
				+  pthread_mutex_t mutex;
			
 
				 
			
 
				   /* ID  */
			
 
				 #ifdef DEBUG
			
@@ -142,12 +167,8 @@ struct _cl_event {
 
				   /* Command queue */
			
 
				   cl_command_queue cq;
			
 
				 
			
 
				-  /* Command type */
			
 
				-  cl_command_type type;
			
 
				-
			
 
				-  /* Command queue list */
			
 
				-  cl_event prev;
			
 
				-  cl_event next;
			
 
				+  /* Command */
			
 
				+  cl_command command;
			
 
				 
			
 
				   /* Event status */
			
 
				   cl_int status;
			
@@ -244,7 +265,7 @@ struct _cl_kernel {
 
				   cl_int *errcodes;
			
 
				 
			
 
				   /* Arguments */
			
 
				-  unsigned int arg_count;
			
 
				+  unsigned int num_args;
			
 
				   size_t *arg_size;
			
 
				   enum kernel_arg_type  *arg_type;
			
 
				   void  **arg_value;
			
--- a/socl/src/task.c
+++ b/socl/src/task.c
@@ -18,64 +18,90 @@
 
				 #include "gc.h"
			
 
				 #include "event.h"
			
 
				 
			
 
				-cl_event task_event(starpu_task *task) {
			
 
				-  return (cl_event)task->callback_arg;
			
 
				-}
			
 
				-
			
 
				 static void task_release_callback(void *arg) {
			
 
				-  starpu_task *task = starpu_get_current_task();
			
 
				-  cl_event ev = (cl_event)arg;
			
 
				+  starpu_task task = starpu_get_current_task();
			
 
				+  cl_command cmd = (cl_command)arg;
			
 
				   
			
 
				+  cl_event ev = command_event_get(cmd);
			
 
				   ev->status = CL_COMPLETE;
			
 
				 
			
 
				+  DEBUG_MSG("notifying tag %x as well as task tag %x\n", ev->id, task->tag_id);
			
 
				+
			
 
				+  /* Trigger the tag associated to the command event */
			
 
				+  starpu_tag_notify_from_apps(ev->id);
			
 
				+
			
 
				   if (task->profiling_info != NULL && (intptr_t)task->profiling_info != -ENOSYS) {
			
 
				     ev->profiling_info = malloc(sizeof(*task->profiling_info));
			
 
				     memcpy(ev->profiling_info, task->profiling_info, sizeof(*task->profiling_info));
			
 
				   }
			
 
				 
			
 
				   gc_entity_release(ev);
			
 
				+
			
 
				+  /* Release the command */
			
 
				+  //TODO
			
 
				 }
			
 
				 
			
 
				 
			
 
				 /*
			
 
				  * Create a StarPU task
			
 
				- *
			
 
				- * Task's callback_arg is event
			
 
				- * Task's tag is set to event ID
			
 
				  */
			
 
				-starpu_task * task_create(cl_command_type type) {
			
 
				-   cl_event event;
			
 
				-   struct starpu_task * task;
			
 
				+starpu_task task_create() {
			
 
				+	struct starpu_task * task;
			
 
				+
			
 
				+	/* Create StarPU task */
			
 
				+	task = starpu_task_create();
			
 
				+
			
 
				+	/* Set task common settings */
			
 
				+	task->destroy = 1;
			
 
				+	task->detach = 1;
			
 
				+
			
 
				+	task->use_tag = 1;
			
 
				+	task->tag_id = event_unique_id();
			
 
				 
			
 
				-   /* Create event */
			
 
				-   event = event_create();
			
 
				-   event->type = type;
			
 
				+	DEBUG_MSG("creating task with tag %x\n", task->tag_id);
			
 
				 
			
 
				-   /* Create StarPU task */
			
 
				-   task = starpu_task_create();
			
 
				+	return task;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
			
 
				+
			
 
				+	if (num_events != 0) {
			
 
				+		cl_uint i;
			
 
				+
			
 
				+		starpu_tag_t * tags = malloc(num_events * sizeof(starpu_tag_t));	
			
 
				 
			
 
				-   /* Task tag is set to event id */
			
 
				-   task->use_tag = 1;
			
 
				-   task->tag_id = event->id;
			
 
				+		if (num_events != 0)
			
 
				+			DEBUG_MSG("Tag %d depends on %u tags:", task->tag_id, num_events);
			
 
				 
			
 
				-   /* Set task common settings */
			
 
				-   task->destroy = 1;
			
 
				-   task->detach = 1;
			
 
				-   task->callback_func = task_release_callback;
			
 
				-   task->callback_arg = event;
			
 
				+		for (i=0; i<num_events; i++) {
			
 
				+			tags[i] = events[i]->id;
			
 
				+			DEBUG_MSG_NOHEAD(" %u", events[i]->id);
			
 
				+		}
			
 
				+		DEBUG_MSG_NOHEAD("\n");
			
 
				 
			
 
				-   return task;
			
 
				+		starpu_tag_declare_deps_array(task->tag_id, num_events, tags);
			
 
				+
			
 
				+		free(tags);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				+cl_int task_submit_ex(starpu_task task, cl_command cmd) {
			
 
				+
			
 
				+	/* Associated the task to the command */
			
 
				+	cmd->task = task;
			
 
				+
			
 
				+	task_depends_on(task, command_num_events_get(cmd), command_events_get(cmd));
			
 
				+
			
 
				+	task->callback_func = task_release_callback;
			
 
				+	task->callback_arg = cmd;
			
 
				 
			
 
				-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events) {
			
 
				-   unsigned int i;
			
 
				+	/* Submit task */
			
 
				+	int ret = starpu_task_submit(task);
			
 
				+	if (ret != 0)
			
 
				+		DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
			
 
				 
			
 
				-   for (i=0; i<num; i++) {
			
 
				-      starpu_tag_t tag = events[i]->id;
			
 
				-      DEBUG_MSG("Event %d depends on event %d\n", task->tag_id, events[i]->id);
			
 
				-      starpu_tag_declare_deps_array(task->tag_id, 1, &tag);
			
 
				-   }
			
 
				+	return CL_SUCCESS;
			
 
				 }
			
 
				 
			
 
				 
			
@@ -106,14 +132,14 @@ static starpu_codelet cputask_codelet = {
 
				    .cpu_func = &cputask_task
			
 
				 };
			
 
				 
			
 
				-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg) {
			
 
				+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg) {
			
 
				   
			
 
				   struct cputask_arg * a = malloc(sizeof(struct cputask_arg));
			
 
				   a->callback = callback;
			
 
				   a->arg = arg;
			
 
				   a->free_arg = free_arg;
			
 
				 
			
 
				-  starpu_task *task = task_create(type);
			
 
				+  starpu_task task = task_create();
			
 
				   task->cl = &cputask_codelet;
			
 
				   task->cl_arg = a;
			
 
				 
			
--- a/socl/src/task.h
+++ b/socl/src/task.h
@@ -19,9 +19,22 @@
 
				 
			
 
				 #include "socl.h"
			
 
				 
			
 
				-starpu_task * task_create(cl_command_type type);
			
 
				-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events);
			
 
				-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg);
			
 
				-cl_event task_event(starpu_task *task);
			
 
				+starpu_task task_create();
			
 
				+void task_dependency_add(starpu_task task, cl_uint num_events, cl_event *events);
			
 
				+
			
 
				+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg);
			
 
				+
			
 
				+/**
			
 
				+ * Associate a StarPU task to a command and submit it
			
 
				+ *
			
 
				+ * When the task terminates, the command is set as terminated too
			
 
				+ */
			
 
				+cl_int task_submit_ex(starpu_task task, cl_command cmd);
			
 
				+#define task_submit(task,cmd) task_submit_ex(task, (cl_command)cmd)
			
 
				+
			
 
				+/**
			
 
				+ * Add task dependencies
			
 
				+ */
			
 
				+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events);
			
 
				 
			
 
				 #endif /* SOCL_TASK_H */
			
--- a/socl/src/util.c
+++ b/socl/src/util.c
@@ -24,3 +24,27 @@ int starpu_worker_get_range() {
 
				 
			
 
				    return oid;
			
 
				 }
			
 
				+
			
 
				+void * memdupa(const void *p, size_t size) {
			
 
				+	void * s = malloc(size);
			
 
				+	memcpy(s,p,size);
			
 
				+	return s;
			
 
				+}
			
 
				+
			
 
				+void ** memdup_deep_safea(const void **p, unsigned n, size_t size) {
			
 
				+	void ** s = (void**)malloc(sizeof(void*) * n);
			
 
				+	unsigned i;
			
 
				+	for (i=0; i<n; i++) {
			
 
				+		s[i] = memdup_safe((void*)p[i], size);
			
 
				+	}
			
 
				+	return s;
			
 
				+}
			
 
				+
			
 
				+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size) {
			
 
				+	void ** s = (void**)malloc(sizeof(void*) * n);
			
 
				+	unsigned i;
			
 
				+	for (i=0; i<n; i++) {
			
 
				+		s[i] = memdup_safe((void*)p[i], size[i]);
			
 
				+	}
			
 
				+	return s;
			
 
				+}
			
--- a/socl/src/util.h
+++ b/socl/src/util.h
@@ -19,4 +19,28 @@
 
				 
			
 
				 int starpu_worker_get_range();
			
 
				 
			
 
				+/**
			
 
				+ * Duplicate a memory area into a fresh allocated buffer
			
 
				+ * Consider using memdup or memdup_safe instead
			
 
				+ */
			
 
				+void * memdupa(const void *p, size_t size);
			
 
				+
			
 
				+#define memdup(p, size) ((typeof(p))memdupa((const void*)p,size))
			
 
				+#define memdup_safe(p,size) (p == NULL ? NULL : memdup(p,size))
			
 
				+
			
 
				+/**
			
 
				+ * Duplicate an array of pointers by performing a deep copy
			
 
				+ */
			
 
				+void ** memdup_deep_safea(const void **p, unsigned n, size_t size);
			
 
				+
			
 
				+#define memdup_deep_safe(p,n,size) ((typeof(p))memdup_deep_safea((const void **)p,n,size))
			
 
				+
			
 
				+/**
			
 
				+ * Duplicate an array of pointers by performing a deep copy
			
 
				+ * Sizes are different for each cell
			
 
				+ */
			
 
				+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size);
			
 
				+
			
 
				+#define memdup_deep_varsize_safe(p,n,size) ((typeof(p))memdup_deep_varsize_safea((const void **)p,n,size))
			
 
				+
			
 
				 #endif /* SOCL_UTIL_H */
			
--- a/src/common/htable32.c
+++ b/src/common/htable32.c