лет назад: 15 · 64ee8754cc
--- a/.gitignore
+++ b/.gitignore
@@ -185,3 +185,4 @@ starpu.log
 
																 /gcc-plugin/tests/unregister
															
 
																 /gcc-plugin/tests/lib-user
															
 
																 /gcc-plugin/examples/matrix-mult
															
 
																+/gcc-plugin/src/c-expr.c
															
--- a/AUTHORS
+++ b/AUTHORS
@@ -10,3 +10,5 @@ William Braik <wbraik@gmail.com>
 
																 Yann Courtois <yann.courtois33@gmail.com>
															
 
																 Jean-Marie Couteyen <jm.couteyen@gmail.com>
															
 
																 Anthony Roy <theanthony33@gmail.com>
															
 
																+David Gómez <david_gomez1380@yahoo.com.mx>
															
 
																+Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
															
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,6 +18,8 @@ ACLOCAL_AMFLAGS=-I m4
 
																 CLEANFILES = *.gcno *.gcda *.linkinfo
															
 
																 SUBDIRS = src
															
 
																+SUBDIRS += tools tests doc
															
 
																+
															
 
																 if USE_MPI
															
 
																 SUBDIRS += mpi
															
 
																 endif
															
@@ -26,7 +28,7 @@ if BUILD_SOCL
 
																 SUBDIRS += socl
															
 
																 endif
															
 
																-SUBDIRS += tools examples tests doc
															
 
																+SUBDIRS += examples
															
 
																 if COND_OPT
															
 
																 SUBDIRS += tests/opt examples/opt
															
@@ -68,6 +70,8 @@ clean-local:
 
																 install-exec-local:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(bindir)
															
 
																 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
															
 
																+uninstall-local:
															
 
																+	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
															
 
																 endif
															
 
																 if STARPU_HAVE_WINDOWS
															
@@ -77,3 +81,5 @@ txtdir = ${docdir}
 
																 endif
															
 
																 txt_DATA = AUTHORS COPYING.LGPL README
															
 
																 EXTRA_DIST = AUTHORS COPYING.LGPL README
															
 
																+
															
 
																+include starpu-top/extradist
															
--- a/configure.ac
+++ b/configure.ac
@@ -2,6 +2,7 @@
 
																 #
															
 
																 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+# Copyright (C) 2011  Télécom-SudParis
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -33,8 +34,8 @@ AC_PROG_SED
 
																 AC_PROG_LN_S
															
 
																 AC_PROG_F77
															
 
																-AC_LIBTOOL_WIN32_DLL
															
 
																-AC_PROG_LIBTOOL
															
 
																+LT_PREREQ([2.2])
															
 
																+LT_INIT([win32-dll])
															
 
																 AC_PROG_INSTALL
															
 
																 AC_PROG_MKDIR_P
															
@@ -158,18 +159,6 @@ if test x$enable_libnuma = xyes; then
 
																 fi
															
 
																 ###############################################################################
															
 
																-#									      #
															
 
																-#				SCED_CTX settings			      #
															
 
																-#									      #
															
 
																-###############################################################################
															
 
																-AC_MSG_CHECKING(maximum number of sched_ctxs)
															
 
																-AC_ARG_ENABLE(max_sched_ctxs, [AS_HELP_STRING([--enable-max-sched-ctxs=<number>],
															
 
																-			[maximum number of sched_ctxs])],
															
 
																-			max_sched_ctxs=$enableval, max_sched_ctxs=10)
															
 
																-AC_MSG_RESULT($max_sched_ctxs)
															
 
																-AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of sched_ctxs supported])
															
 
																-
															
 
																-###############################################################################
															
 
																 #                                                                             #
															
 
																 #                                 CPUs settings                               #
															
 
																 #                                                                             #
															
@@ -282,6 +271,7 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 
																     if test "$have_valid_cuda" = "no" ; then
															
 
																         LDFLAGS="${SAVED_LDFLAGS}"
															
 
																+	unset STARPU_CUDA_LDFLAGS
															
 
																     fi
															
 
																 ])
															
@@ -460,8 +450,10 @@ if test x$have_curand = xyes; then
 
																 fi
															
 
																 # Peer transfers are only supported since CUDA 4.0
															
 
																+# Disable them if the user explicitly wants to disable them
															
 
																+AC_ARG_ENABLE(cuda_memcpy_peer, [AS_HELP_STRING([--disable-cuda-memcpy-peer], [do not allow peer transfers when using CUDA 4.0])],, [enable_cuda_memcpy_peer=yes])
															
 
																 have_cuda_memcpy_peer=no
															
 
																-if test x$enable_cuda = xyes; then
															
 
																+if test x$enable_cuda_memcpy_peer = xyes -a x$enable_cuda = xyes ; then
															
 
																     SAVED_LDFLAGS="${LDFLAGS}"
															
 
																     LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
															
 
																     AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
															
@@ -898,6 +890,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 
																 AC_MSG_RESULT($nmaxworkers)
															
 
																 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
															
 
																+# Computes the maximum number of implementations per arch
															
 
																+AC_MSG_CHECKING(maximum number of implementations)
															
 
																+AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
															
 
																+		[maximum number of implementations])],
															
 
																+		maximplementations=$enableval, maximplementations=1)
															
 
																+AC_MSG_RESULT($maximplementations)
															
 
																+AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
															
 
																+		[maximum number of implementations])
															
 
																+
															
 
																 ###############################################################################
															
 
																 #                                                                             #
															
 
																 #                                    MPI                                      #
															
@@ -1116,6 +1117,10 @@ else
 
																    run_gcc_plugin_test_suite="no"
															
 
																 fi
															
 
																+# Bison is used to generate the C expression parser.  The generated
															
 
																+# parser is part of the distribution, though.
															
 
																+AC_PROG_YACC
															
 
																+
															
 
																 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
															
 
																 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
															
@@ -1285,6 +1290,10 @@ AC_MSG_CHECKING(which BLAS lib should be used)
 
																 AC_MSG_RESULT($blas_lib)
															
 
																 AC_SUBST(BLAS_LIB,$blas_lib)
															
 
																+##########################################
															
 
																+# FFT                                    #
															
 
																+##########################################
															
 
																+
															
 
																 have_fftw=no
															
 
																 have_fftwf=no
															
 
																 have_fftwl=no
															
@@ -1295,7 +1304,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
 
																   have_fftw=yes
															
 
																 ], [:])
															
 
																 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
															
 
																-
															
 
																+ 
															
 
																 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
															
 
																   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
															
 
																   AC_SUBST([STARPU_HAVE_FFTWF], [1])
															
@@ -1310,6 +1319,10 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 
																 ], [:])
															
 
																 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
															
 
																+##########################################
															
 
																+# hwloc                                  #
															
 
																+##########################################
															
 
																+
															
 
																 AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
															
 
																 SAVED_LDFLAGS="${LDFLAGS}"
															
 
																 SAVED_CPPFLAGS="${CPPFLAGS}"
															
@@ -1364,23 +1377,29 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 
																 # File configuration
															
 
																 AC_CONFIG_COMMANDS([executable-scripts], [
															
 
																   chmod +x tests/regression/regression.sh
															
 
																-  chmod +x gcc-plugin/tests/run-test
															
 
																 ])
															
 
																 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
															
 
																 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
															
 
																-AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
															
 
																+if test $build_gcc_plugin == "yes" ; then
															
 
																+    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
															
 
																+    AC_OUTPUT([
															
 
																+	    gcc-plugin/Makefile
															
 
																+	    gcc-plugin/src/Makefile
															
 
																+	    gcc-plugin/tests/Makefile
															
 
																+	    gcc-plugin/tests/run-test
															
 
																+	    gcc-plugin/examples/Makefile
															
 
																+            ])
															
 
																+    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
															
 
																+            chmod +x gcc-plugin/tests/run-test
															
 
																+            ])
															
 
																+fi 
															
 
																 AC_OUTPUT([
															
 
																 	Makefile
															
 
																 	src/Makefile
															
 
																 	tools/Makefile
															
 
																-	gcc-plugin/Makefile
															
 
																-	gcc-plugin/src/Makefile
															
 
																-	gcc-plugin/tests/Makefile
															
 
																-	gcc-plugin/tests/run-test
															
 
																-	gcc-plugin/examples/Makefile
															
 
																 	socl/Makefile
															
 
																 	socl/src/Makefile
															
 
																 	libstarpu.pc
															
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009  Université de Bordeaux 1
															
 
																-# Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -16,6 +16,8 @@
 
																 info_TEXINFOS = starpu.texi
															
 
																+starpu_TEXINFOS = c-extensions.texi
															
 
																+
															
 
																 MAINTAINERCLEANFILES = starpu.pdf
															
 
																 EXTRA_DIST = starpu.pdf \
															
@@ -28,6 +30,9 @@ EXTRA_DIST = starpu.pdf \
 
																 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
															
 
																+uninstall-local:
															
 
																+	$(RM) $(DESTDIR)$(infodir)/dir
															
 
																+
															
 
																 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
															
 
																 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
															
 
																 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
															
--- a/doc/c-extensions.texi
+++ b/doc/c-extensions.texi
@@ -0,0 +1,160 @@
 
																+@c This is part of the StarPU Handbook.
															
 
																+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
															
 
																+
															
 
																+@node C Extensions
															
 
																+@chapter C Extensions
															
 
																+
															
 
																+@cindex C extensions
															
 
																+@cindex GCC plug-in
															
 
																+
															
 
																+When configured with @code{--enable-gcc-extensions}, StarPU builds a
															
 
																+plug-in for the GNU Compiler Collection (GCC), which defines extensions
															
 
																+to the C language that make it easier to write StarPU code@footnote{This
															
 
																+feature is only available for GCC 4.5 and later.}.  Those extensions
															
 
																+include syntactic sugar for defining tasks and their implementations,
															
 
																+invoking a task, and manipulating data buffers.
															
 
																+
															
 
																+This section does not require detailed knowledge of the StarPU library.
															
 
																+
															
 
																+Note: as of StarPU @value{VERSION}, this is still an area under
															
 
																+development and subject to change.
															
 
																+
															
 
																+@menu
															
 
																+* Defining Tasks::              Defining StarPU tasks
															
 
																+* Registered Data Buffers::     Manipulating data buffers
															
 
																+@end menu
															
 
																+
															
 
																+@node Defining Tasks
															
 
																+@section Defining Tasks
															
 
																+
															
 
																+@cindex task
															
 
																+@cindex task implementation
															
 
																+
															
 
																+The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
															
 
																+
															
 
																+@enumerate
															
 
																+@item
															
 
																+tasks may have several implementations---e.g., one for CPUs, one written
															
 
																+in OpenCL, one written in CUDA;
															
 
																+@item
															
 
																+when a task is invoked, it may run in parallel, and StarPU is free to
															
 
																+choose any of its implementations.
															
 
																+@end enumerate
															
 
																+
															
 
																+Tasks and their implementations must be @emph{declared}.  These
															
 
																+declarations are annotated with @dfn{attributes} (@pxref{Attribute
															
 
																+Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
															
 
																+(GCC)}): the declaration of a task is a regular C function declaration
															
 
																+with an additional @code{task} attribute, and task implementations are
															
 
																+declared with a @code{task_implementation} attribute.
															
 
																+
															
 
																+The following function attributes are provided:
															
 
																+
															
 
																+@table @code
															
 
																+
															
 
																+@item task
															
 
																+@cindex @code{task} attribute
															
 
																+Declare the given function as a StarPU task.  Its return type must be
															
 
																+@code{void}, and it must not be defined---instead, a definition will
															
 
																+automatically be provided by the compiler.
															
 
																+
															
 
																+Under the hood, declaring a task leads to the declaration of the
															
 
																+corresponding @code{codelet} (@pxref{Codelets and Tasks}).  If one or
															
 
																+more task implementations are declared in the same compilation unit,
															
 
																+then the codelet and the function itself are also defined; they inherit
															
 
																+the scope of the task.
															
 
																+
															
 
																+Scalar arguments to the task are passed by value and copied to the
															
 
																+target device if need be---technically, they are passed as the
															
 
																+@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
															
 
																+
															
 
																+Pointer arguments are assumed to be registered data buffers---the
															
 
																+@code{buffers} argument of a task (@pxref{Codelets and Tasks,
															
 
																+@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
															
 
																+read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
															
 
																+buffers are assumed to be used read-write (@code{STARPU_RW}).
															
 
																+
															
 
																+@item task_implementation (@var{target}, @var{task})
															
 
																+@cindex @code{task_implementation} attribute
															
 
																+Declare the given function as an implementation of @var{task} to run on
															
 
																+@var{target}.  @var{target} must be a string, currently one of
															
 
																+@code{"cpu"} or @code{"cuda"}.
															
 
																+@c FIXME: Update when OpenCL support is ready.
															
 
																+
															
 
																+@end table
															
 
																+
															
 
																+Here is an example:
															
 
																+
															
 
																+@example
															
 
																+static void matmul (const float *A, const float *B, float *C,
															
 
																+		    size_t nx, size_t ny, size_t nz)
															
 
																+  __attribute__ ((task));
															
 
																+
															
 
																+static void matmul_cpu (const float *A, const float *B, float *C,
															
 
																+			size_t nx, size_t ny, size_t nz)
															
 
																+  __attribute__ ((task_implementation ("cpu", matmul)));
															
 
																+
															
 
																+
															
 
																+static void
															
 
																+matmul_cpu (const float *A, const float *B, float *C,
															
 
																+	    size_t nx, size_t ny, size_t nz)
															
 
																+@{
															
 
																+  size_t i, j, k;
															
 
																+
															
 
																+  for (j = 0; j < ny; j++)
															
 
																+    for (i = 0; i < nx; i++)
															
 
																+      @{
															
 
																+	for (k = 0; k < nz; k++)
															
 
																+	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
															
 
																+      @}
															
 
																+@}
															
 
																+@end example
															
 
																+
															
 
																+@noindent
															
 
																+A @code{matmul} task is defined; it has only one implementation,
															
 
																+@code{matmul_cpu}, which runs on the CPU.  Variables @var{A} and
															
 
																+@var{B} are input buffers, whereas @var{C} is considered an input/output
															
 
																+buffer.  The task can be invoked like a regular C function:
															
 
																+
															
 
																+@example
															
 
																+matmul (&A[i * zdim * bydim + k * bzdim * bydim],
															
 
																+        &B[k * xdim * bzdim + j * bxdim * bzdim],
															
 
																+        &C[i * xdim * bydim + j * bxdim * bydim],
															
 
																+        bxdim, bydim, bzdim);
															
 
																+@end example
															
 
																+
															
 
																+@noindent
															
 
																+This leads to an @dfn{asynchronous invocation}, whereby @code{matmul}'s
															
 
																+implementation may run in parallel with the continuation of the caller.
															
 
																+
															
 
																+The next section describes how memory buffers must be handled in
															
 
																+StarPU-GCC code.
															
 
																+
															
 
																+
															
 
																+@node Registered Data Buffers
															
 
																+@section Registered Data Buffers
															
 
																+
															
 
																+Data buffers such as matrices and vectors that are to be passed to tasks
															
 
																+must be @dfn{registered}.  Registration allows StarPU to handle data
															
 
																+transfers among devices---e.g., transferring an input buffer from the
															
 
																+CPU's main memory to a task scheduled to run on a GPU (@pxref{StarPU Data
															
 
																+Management Library}).
															
 
																+
															
 
																+The following pragmas are provided:
															
 
																+
															
 
																+@table @code
															
 
																+
															
 
																+@item #pragma starpu register @var{ptr} [@var{size}]
															
 
																+Register @var{ptr} as a @var{size}-element buffer.
															
 
																+
															
 
																+@item #pragma starpu unregister @var{ptr}
															
 
																+@item #pragma starpu acquire @var{ptr}
															
 
																+
															
 
																+@end table
															
 
																+
															
 
																+FIXME: finish
															
 
																+
															
 
																+@c Local Variables:
															
 
																+@c TeX-master: "starpu.texi"
															
 
																+@c ispell-local-dictionary: "american"
															
 
																+@c End:
															
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -49,8 +49,9 @@ was last updated on @value{UPDATED}.
 
																 * Configuring StarPU::          How to configure StarPU
															
 
																 * StarPU API::                  The API to use StarPU
															
 
																 * Advanced Topics::             Advanced use of StarPU
															
 
																-* Full source code for the 'Scaling a Vector' example::  
															
 
																+* C Extensions::                Easier StarPU programming with GCC
															
 
																+* Full source code for the 'Scaling a Vector' example::
															
 
																 * Function Index::              Index of C functions.
															
 
																 @end menu
															
@@ -1227,16 +1228,16 @@ Partitioning can be applied several times, see
 
																 @section Performance model example
															
 
																 To achieve good scheduling, StarPU scheduling policies need to be able to
															
 
																-estimate in advance the duration of a task. This is done by giving to codelets a
															
 
																-performance model. There are several kinds of performance models.
															
 
																+estimate in advance the duration of a task. This is done by giving to codelets
															
 
																+a performance model, by defining a @code{starpu_perfmodel_t} structure and
															
 
																+providing its address in the @code{model} field of the @code{starpu_codelet}
															
 
																+structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel_t}
															
 
																+are mandatory, to give a name to the model, and the type of the model, since
															
 
																+there are several kinds of performance models.
															
 
																 @itemize
															
 
																 @item
															
 
																-Providing an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
															
 
																-see for instance
															
 
																-@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
															
 
																-@item
															
 
																-Measured at runtime (STARPU_HISTORY_BASED model type). This assumes that for a
															
 
																+Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
															
 
																 given set of data input/output sizes, the performance will always be about the
															
 
																 same. This is very true for regular kernels on GPUs for instance (<0.1% error),
															
 
																 and just a bit less true on CPUs (~=1% error). This also assumes that there are
															
@@ -1276,7 +1277,7 @@ starpu_codelet cl = @{
 
																 @end cartouche
															
 
																 @item
															
 
																-Measured at runtime and refined by regression (STARPU_REGRESSION_*_BASED
															
 
																+Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
															
 
																 model type). This still assumes performance regularity, but can work
															
 
																 with various data input sizes, by applying regression over observed
															
 
																 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
															
@@ -1286,7 +1287,12 @@ STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
 
																 model for the @code{memset} operation.
															
 
																 @item
															
 
																-Provided explicitly by the application (STARPU_PER_ARCH model type): the
															
 
																+Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
															
 
																+see for instance
															
 
																+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
															
 
																+
															
 
																+@item
															
 
																+Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
															
 
																 @code{.per_arch[i].cost_model} fields have to be filled with pointers to
															
 
																 functions which return the expected duration of the task in micro-seconds, one
															
 
																 per architecture.
															
@@ -1877,6 +1883,22 @@ TODO
 
																 @c what kind of information do we get ?
															
 
																+The bus speed measured by StarPU can be displayed by using the
															
 
																+@code{starpu_machine_display} tool, for instance:
															
 
																+
															
 
																+@example
															
 
																+StarPU has found :
															
 
																+        3 CUDA devices
															
 
																+                CUDA 0 (Tesla C2050 02:00.0)
															
 
																+                CUDA 1 (Tesla C2050 03:00.0)
															
 
																+                CUDA 2 (Tesla C2050 84:00.0)
															
 
																+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
															
 
																+RAM     0.000000        5176.530428     5176.492994     5191.710722
															
 
																+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
															
 
																+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
															
 
																+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
															
 
																+@end example
															
 
																+
															
 
																 @node StarPU-Top
															
 
																 @subsection StarPU-Top interface
															
@@ -2057,7 +2079,7 @@ This will create an @code{activity.data} file in the current
 
																 directory. A profile of the application showing the activity of StarPU
															
 
																 during the execution of the program can be generated:
															
 
																 @example
															
 
																-$ starpu_top.sh activity.data
															
 
																+$ starpu_top activity.data
															
 
																 @end example
															
 
																 This will create a file named @code{activity.eps} in the current directory.
															
@@ -2558,6 +2580,7 @@ Enable flags for the @code{gcov} coverage tool.
 
																 * --with-cuda-dir::             
															
 
																 * --with-cuda-include-dir::             
															
 
																 * --with-cuda-lib-dir::             
															
 
																+* --disable-cuda-memcpy-peer::
															
 
																 * --enable-maxopencldev::       
															
 
																 * --disable-opencl::            
															
 
																 * --with-opencl-dir::           
															
@@ -2624,6 +2647,13 @@ notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
 
																 @end table
															
 
																+@node --disable-cuda-memcpy-peer
															
 
																+@subsubsection @code{--disable-cuda-memcpy-peer}
															
 
																+@table @asis
															
 
																+@item @emph{Description}
															
 
																+Explicitly disables peer transfers when using CUDA 4.0.
															
 
																+@end table
															
 
																+
															
 
																 @node --enable-maxopencldev
															
 
																 @subsubsection @code{--enable-maxopencldev=<number>}
															
 
																 @table @asis
															
@@ -4893,6 +4923,13 @@ static struct starpu_sched_policy_s dummy_sched_policy = @{
 
																 @c ---------------------------------------------------------------------
															
 
																+@c C Extensions
															
 
																+@c ---------------------------------------------------------------------
															
 
																+
															
 
																+@include c-extensions.texi
															
 
																+
															
 
																+
															
 
																+@c ---------------------------------------------------------------------
															
 
																 @c Appendices
															
 
																 @c ---------------------------------------------------------------------
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -2,6 +2,7 @@
 
																 #
															
 
																 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+# Copyright (C) 2011  Télécom-SudParis
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -152,6 +153,7 @@ examplebin_PROGRAMS +=				\
 
																 	basic_examples/mult			\
															
 
																 	basic_examples/block			\
															
 
																 	basic_examples/variable			\
															
 
																+	basic_examples/mult_impl                \
															
 
																 	filters/fvector				\
															
 
																 	filters/fblock				\
															
 
																 	filters/fmatrix				\
															
--- a/examples/basic_examples/hello_world.c
+++ b/examples/basic_examples/hello_world.c
@@ -52,12 +52,12 @@ struct params {
 
																 };
															
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																-	struct params *params = cl_arg;
															
 
																+	struct params *params = (struct params *) cl_arg;
															
 
																 	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
															
 
																 }
															
 
																-starpu_codelet cl;
															
 
																+starpu_codelet cl = {};
															
 
																 int main(int argc, char **argv)
															
 
																 {
															
--- a/examples/basic_examples/mult.c
+++ b/examples/basic_examples/mult.c
@@ -127,9 +127,9 @@ static void init_problem_data(void)
 
																 	/* we initialize matrices A, B and C in the usual way */
															
 
																-	A = malloc(zdim*ydim*sizeof(float));
															
 
																-	B = malloc(xdim*zdim*sizeof(float));
															
 
																-	C = malloc(xdim*ydim*sizeof(float));
															
 
																+	A = (float *) malloc(zdim*ydim*sizeof(float));
															
 
																+	B = (float *) malloc(xdim*zdim*sizeof(float));
															
 
																+	C = (float *) malloc(xdim*ydim*sizeof(float));
															
 
																 	/* fill the A and B matrices */
															
 
																 	srand(2009);
															
--- a/examples/basic_examples/mult_impl.c
+++ b/examples/basic_examples/mult_impl.c
@@ -0,0 +1,384 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Télécom-SudParis
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+
															
 
																+#include <string.h>
															
 
																+#include <math.h>
															
 
																+#include <sys/types.h>
															
 
																+#include <sys/time.h>
															
 
																+#include <pthread.h>
															
 
																+#include <signal.h>
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+static float *A, *B, *C;
															
 
																+static starpu_data_handle A_handle, B_handle, C_handle;
															
 
																+
															
 
																+static unsigned nslicesx = 4;
															
 
																+static unsigned nslicesy = 4;
															
 
																+static unsigned xdim = 1024;
															
 
																+static unsigned ydim = 1024;
															
 
																+static unsigned zdim = 512;
															
 
																+
															
 
																+
															
 
																+double mult_gemm_cost(starpu_buffer_descr *descr)
															
 
																+{
															
 
																+	/* C = A * B */
															
 
																+	uint32_t nxC, nyC, nxA;
															
 
																+
															
 
																+
															
 
																+	nxC = starpu_matrix_get_nx(descr[2].handle);
															
 
																+	nyC = starpu_matrix_get_ny(descr[2].handle);
															
 
																+	nxA = starpu_matrix_get_nx(descr[0].handle);
															
 
																+
															
 
																+	//printf("nxC %d nyC %d nxA %d\n", nxC, nyC, nxA);
															
 
																+
															
 
																+	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
															
 
																+
															
 
																+	printf("cost %e \n", cost);
															
 
																+
															
 
																+	return cost;
															
 
																+}
															
 
																+
															
 
																+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
															
 
																+{
															
 
																+	float *subA, *subB, *subC;
															
 
																+	uint32_t nxC, nyC, nyA;
															
 
																+	uint32_t ldA, ldB, ldC;
															
 
																+	printf("On application: Hello, this is kernel cpu_mult\n\n");
															
 
																+	/* .blas.ptr gives a pointer to the first element of the local copy */
															
 
																+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
															
 
																+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
															
 
																+
															
 
																+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
															
 
																+	 * is the number of lines that are separated by .blas.ld elements (ld
															
 
																+	 * stands for leading dimension).
															
 
																+	 * NB: in case some filters were used, the leading dimension is not
															
 
																+	 * guaranteed to be the same in main memory (on the original matrix)
															
 
																+	 * and on the accelerator! */
															
 
																+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
															
 
																+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
															
 
																+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																+
															
 
																+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
															
 
																+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
															
 
																+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
															
 
																+
															
 
																+	/* we assume a FORTRAN-ordering! */
															
 
																+	unsigned i,j,k;
															
 
																+	for (i = 0; i < nyC; i++)
															
 
																+	{
															
 
																+		for (j = 0; j < nxC; j++)
															
 
																+		{
															
 
																+			float sum = 0.0;
															
 
																+
															
 
																+			for (k = 0; k < nyA; k++)
															
 
																+			{
															
 
																+				sum += subA[j+k*ldA]*subB[k+i*ldB];
															
 
																+			}
															
 
																+
															
 
																+			subC[j + i*ldC] = sum;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
															
 
																+{
															
 
																+	float *subA, *subB, *subC;
															
 
																+	uint32_t nxC, nyC, nyA;
															
 
																+	uint32_t ldA, ldB, ldC;
															
 
																+	printf("On application: this is kernel cpu_mult_2\n\n");
															
 
																+	/* .blas.ptr gives a pointer to the first element of the local copy */
															
 
																+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
															
 
																+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
															
 
																+
															
 
																+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
															
 
																+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
															
 
																+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																+
															
 
																+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
															
 
																+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
															
 
																+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
															
 
																+
															
 
																+	/* we assume a FORTRAN-ordering! */
															
 
																+	unsigned i,j,k;
															
 
																+	for (j = 0; j < nxC; j++)
															
 
																+	{
															
 
																+		for (i = 0; i < nyC; i++)
															
 
																+		{
															
 
																+			float sum = 0.0;
															
 
																+
															
 
																+			for (k = 0; k < nyA; k++)
															
 
																+			{
															
 
																+				sum += subA[j+k*ldA]*subB[k+i*ldB];
															
 
																+			}
															
 
																+
															
 
																+			subC[j + i*ldC] = sum;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+
															
 
																+
															
 
																+static void init_problem_data(void)
															
 
																+{
															
 
																+	unsigned i,j;
															
 
																+
															
 
																+	/* we initialize matrices A, B and C in the usual way */
															
 
																+
															
 
																+	A = malloc(zdim*ydim*sizeof(float));
															
 
																+	B = malloc(xdim*zdim*sizeof(float));
															
 
																+	C = malloc(xdim*ydim*sizeof(float));
															
 
																+
															
 
																+	/* fill the A and B matrices */
															
 
																+	srand(2009);
															
 
																+	for (j=0; j < ydim; j++) {
															
 
																+		for (i=0; i < zdim; i++) {
															
 
																+			A[j+i*ydim] = (float)(starpu_drand48());
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	for (j=0; j < zdim; j++) {
															
 
																+		for (i=0; i < xdim; i++) {
															
 
																+			B[j+i*zdim] = (float)(starpu_drand48());
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	for (j=0; j < ydim; j++) {
															
 
																+		for (i=0; i < xdim; i++) {
															
 
																+			C[j+i*ydim] = (float)(0);
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static void partition_mult_data(void)
															
 
																+{
															
 
																+	/* note that we assume a FORTRAN ordering here! */
															
 
																+
															
 
																+	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
															
 
																+		ydim, ydim, zdim, sizeof(float));
															
 
																+	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
															
 
																+		zdim, zdim, xdim, sizeof(float));
															
 
																+	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
															
 
																+		ydim, ydim, xdim, sizeof(float));
															
 
																+
															
 
																+	/* A filter is a method to partition a data into disjoint chunks, it is
															
 
																+	 * described by the means of the "struct starpu_data_filter" structure that
															
 
																+	 * contains a function that is applied on a data handle to partition it
															
 
																+	 * into smaller chunks, and an argument that is passed to the function
															
 
																+	 * (eg. the number of blocks to create here).
															
 
																+	 */
															
 
																+
															
 
																+	struct starpu_data_filter vert = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nslicesx,
															
 
																+		.get_nchildren = NULL,
															
 
																+		.get_child_ops = NULL
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_data_filter horiz = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nslicesy,
															
 
																+		.get_nchildren = NULL,
															
 
																+		.get_child_ops = NULL
															
 
																+	};
															
 
																+
															
 
																+/*
															
 
																+ *	Illustration with nslicesx = 4 and nslicesy = 2: it is possible to access
															
 
																+ *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
															
 
																+ *	the number of filters to apply, and the indexes for each filters, for
															
 
																+ *	instance:
															
 
																+ *
															
 
																+ *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
															
 
																+ *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
															
 
																+ *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
															
 
																+ *
															
 
																+ *	Note that here we applied 2 filters recursively onto C.
															
 
																+ *
															
 
																+ *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
															
 
																+ *	of blocked matrix C for example.
															
 
																+ *
															
 
																+ *		              |---|---|---|---|
															
 
																+ *		              |   |   | B'|   | B
															
 
																+ *		              |---|---|---|---|
															
 
																+ *		                0   1   2   3
															
 
																+ *		     |----|   |---|---|---|---|
															
 
																+ *		     |    |   |   |   |   |   |
															
 
																+ *		     |    | 0 |   |   |   |   |
															
 
																+ *		     |----|   |---|---|---|---|
															
 
																+ *		     | A' |   |   |   | C'|   |
															
 
																+ *		     |    |   |   |   |   |   |
															
 
																+ *		     |----|   |---|---|---|---|
															
 
																+ *		       A              C
															
 
																+ *
															
 
																+ *	IMPORTANT: applying filters is equivalent to partitioning a piece of
															
 
																+ *	data in a hierarchical manner, so that memory consistency is enforced
															
 
																+ *	for each of the elements independently. The tasks should therefore NOT
															
 
																+ *	access inner nodes (eg. one column of C or the whole C) but only the
															
 
																+ *	leaves of the tree (ie. blocks here). Manipulating inner nodes is only
															
 
																+ *	possible by disapplying the filters (using starpu_data_unpartition), to
															
 
																+ *	enforce memory consistency.
															
 
																+ */
															
 
																+
															
 
																+	starpu_data_partition(B_handle, &vert);
															
 
																+	starpu_data_partition(A_handle, &horiz);
															
 
																+
															
 
																+	/* starpu_data_map_filters is a variable-arity function, the first argument
															
 
																+	 * is the handle of the data to partition, the second argument is the
															
 
																+	 * number of filters to apply recursively. Filters are applied in the
															
 
																+	 * same order as the arguments.
															
 
																+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
															
 
																+	 * then applying horiz on each sub-data (ie. each column of C)
															
 
																+	 */
															
 
																+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
															
 
																+}
															
 
																+
															
 
																+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
															
 
																+	.cost_model = mult_gemm_cost,
															
 
																+	.type = STARPU_HISTORY_BASED,//STARPU_COMMON, //STARPU_PER_ARCH,
															
 
																+	.symbol = "mult_perf_model"
															
 
																+};
															
 
																+
															
 
																+/*
															
 
																+static struct starpu_perfmodel_t mult_perf_model = {
															
 
																+	.type = STARPU_HISTORY_BASED,
															
 
																+	.symbol = "mult_perf_model"
															
 
																+};
															
 
																+*/
															
 
																+
															
 
																+struct starpu_conf conf = {
															
 
																+		.sched_policy_name = "heft",
															
 
																+		.calibrate = 1,
															
 
																+		.ncpus = 4
															
 
																+};
															
 
																+
															
 
																+
															
 
																+static starpu_codelet cl = {
															
 
																+        /* for now, this kernel can only be executed on a CPU */
															
 
																+        .where = STARPU_CPU,
															
 
																+        //.starpu_impl_multiple = 1,
															
 
																+        /* CPU implementation of the codelet */
															
 
																+        .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
															
 
																+        .cpu_funcs = {cpu_mult,cpu_mult_2},
															
 
																+        /* the codelet manipulates 3 buffers that are managed by the
															
 
																+         * DSM */
															
 
																+        .nbuffers = 3,
															
 
																+        /* in case the scheduling policy may use performance models */
															
 
																+        .model = &starpu_dgemm_model_common
															
 
																+};
															
 
																+
															
 
																+static void launch_tasks(void)
															
 
																+{
															
 
																+	/* partition the work into slices */
															
 
																+	unsigned taskx, tasky;
															
 
																+
															
 
																+	for (taskx = 0; taskx < nslicesx; taskx++)
															
 
																+	{
															
 
																+		for (tasky = 0; tasky < nslicesy; tasky++)
															
 
																+		{
															
 
																+			/* C[taskx, tasky] = A[tasky] B[taskx] */
															
 
																+
															
 
																+			/* by default, starpu_task_create() returns an
															
 
																+ 			 * asynchronous task (ie. task->synchronous = 0) */
															
 
																+			struct starpu_task *task = starpu_task_create();
															
 
																+
															
 
																+			/* this task implements codelet "cl" */
															
 
																+			task->cl = &cl;
															
 
																+
															
 
																+			/*
															
 
																+			 *              |---|---|---|---|
															
 
																+			 *              |   | * |   |   | B
															
 
																+			 *              |---|---|---|---|
															
 
																+			 *                    X
															
 
																+			 *     |----|   |---|---|---|---|
															
 
																+			 *     |****| Y |   |***|   |   |
															
 
																+			 *     |****|   |   |***|   |   |
															
 
																+			 *     |----|   |---|---|---|---|
															
 
																+			 *     |    |   |   |   |   |   |
															
 
																+			 *     |    |   |   |   |   |   |
															
 
																+			 *     |----|   |---|---|---|---|
															
 
																+			 *       A              C
															
 
																+			 */
															
 
																+
															
 
																+			/* there was a single filter applied to matrices A
															
 
																+			 * (respectively B) so we grab the handle to the chunk
															
 
																+			 * identified by "tasky" (respectively "taskx"). The "1"
															
 
																+			 * tells StarPU that there is a single argument to the
															
 
																+			 * variable-arity function starpu_data_get_sub_data */
															
 
																+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
															
 
																+			task->buffers[0].mode = STARPU_R;
															
 
																+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
															
 
																+			task->buffers[1].mode = STARPU_R;
															
 
																+
															
 
																+			/* 2 filters were applied on matrix C, so we give
															
 
																+			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
															
 
																+			 * must match the order in which the filters were
															
 
																+			 * applied.
															
 
																+			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
															
 
																+			 * a handle to the column number k of matrix C.
															
 
																+			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
															
 
																+			 * equivalent to
															
 
																+			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
															
 
																+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
															
 
																+			task->buffers[2].mode = STARPU_W;
															
 
																+
															
 
																+			/* this is not a blocking call since task->synchronous = 0 */
															
 
																+			int summit_task;
															
 
																+			summit_task = starpu_task_submit(task);
															
 
																+			printf("task is submmited or not %d\n",summit_task);
															
 
																+
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+int main(void)
															
 
																+{
															
 
																+	/* start the runtime */
															
 
																+	starpu_init(&conf);
															
 
																+
															
 
																+	/* initialize matrices A, B and C and register them to StarPU */
															
 
																+	init_problem_data();
															
 
																+
															
 
																+	/* partition matrices into blocks that can be manipulated by the
															
 
																+ 	 * codelets */
															
 
																+	partition_mult_data();
															
 
																+
															
 
																+	/* submit all tasks in an asynchronous fashion */
															
 
																+	launch_tasks();
															
 
																+
															
 
																+	/* wait for termination */
															
 
																+	starpu_task_wait_for_all();
															
 
																+
															
 
																+	/* remove the filters applied by the means of starpu_data_map_filters; now
															
 
																+ 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
															
 
																+	 * starpu_data_map_filters is called again on C_handle.
															
 
																+	 * The second argument is the memory node where the different subsets
															
 
																+	 * should be reassembled, 0 = main memory (RAM) */
															
 
																+	starpu_data_unpartition(C_handle, 0);
															
 
																+
															
 
																+	/* stop monitoring matrix C : after this, it is not possible to pass C
															
 
																+	 * (or any subset of C) as a codelet input/output. This also implements
															
 
																+	 * a barrier so that the piece of data is put back into main memory in
															
 
																+	 * case it was only available on a GPU for instance. */
															
 
																+	starpu_data_unregister(C_handle);
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/basic_examples/vector_scal_cpu.c
+++ b/examples/basic_examples/vector_scal_cpu.c
@@ -24,7 +24,7 @@
 
																 void scal_cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																 	unsigned i;
															
 
																-	float *factor = cl_arg;
															
 
																+	float *factor = (float *) cl_arg;
															
 
																 	/*
															
 
																 	 * The "buffers" array matches the task->buffers array: for instance
															
@@ -37,7 +37,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 
																 	 * migrated/replicated), and elemsize gives the size of each elements.
															
 
																 	 */
															
 
																-	starpu_vector_interface_t *vector = buffers[0];
															
 
																+	starpu_vector_interface_t *vector = (starpu_vector_interface_t *) buffers[0];
															
 
																 	/* length of the vector */
															
 
																 	unsigned n = STARPU_VECTOR_GET_NX(vector);
															
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -125,27 +126,27 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 
																 }
															
 
																 struct starpu_perfmodel_t chol_model_11 = {
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "chol_model_11"
															
 
																 };
															
 
																 struct starpu_perfmodel_t chol_model_21 = {
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "chol_model_21"
															
 
																 };
															
 
																 struct starpu_perfmodel_t chol_model_22 = {
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 	.symbol = "chol_model_22"
															
--- a/examples/filters/fblock_cpu.c
+++ b/examples/filters/fblock_cpu.c
@@ -19,7 +19,7 @@
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																         unsigned i, j, k;
															
 
																-        int *factor = cl_arg;
															
 
																+        int *factor = (int *) cl_arg;
															
 
																 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
															
 
																 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
															
 
																 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
															
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -25,7 +25,7 @@
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																         unsigned i, j;
															
 
																-        int *factor = cl_arg;
															
 
																+        int *factor = (int *) cl_arg;
															
 
																         /* length of the matrix */
															
 
																         unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
															
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -24,7 +24,7 @@
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																         unsigned i;
															
 
																-        int *factor = cl_arg;
															
 
																+        int *factor = (int *) cl_arg;
															
 
																         /* length of the vector */
															
 
																         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
															
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -215,9 +216,9 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
																 struct starpu_perfmodel_t model_11 = {
															
 
																 	.cost_model = task_11_cost,
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = task_11_cost_cpu },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_11_cost_cpu },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_11_cost_cuda }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 #ifdef STARPU_ATLAS
															
@@ -231,9 +232,9 @@ struct starpu_perfmodel_t model_11 = {
 
																 struct starpu_perfmodel_t model_12 = {
															
 
																 	.cost_model = task_12_cost,
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = task_12_cost_cpu },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_12_cost_cpu },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_12_cost_cuda }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 #ifdef STARPU_ATLAS
															
@@ -247,9 +248,9 @@ struct starpu_perfmodel_t model_12 = {
 
																 struct starpu_perfmodel_t model_21 = {
															
 
																 	.cost_model = task_21_cost,
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = task_21_cost_cpu },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_21_cost_cpu },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_21_cost_cuda }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 #ifdef STARPU_ATLAS
															
@@ -263,9 +264,9 @@ struct starpu_perfmodel_t model_21 = {
 
																 struct starpu_perfmodel_t model_22 = {
															
 
																 	.cost_model = task_22_cost,
															
 
																-	.per_arch = { 
															
 
																-		[STARPU_CPU_DEFAULT] = { .cost_model = task_22_cost_cpu },
															
 
																-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
															
 
																+	.per_arch = {
															
 
																+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_22_cost_cpu },
															
 
																+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_22_cost_cuda }
															
 
																 	},
															
 
																 	.type = STARPU_HISTORY_BASED,
															
 
																 #ifdef STARPU_ATLAS
															
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -333,7 +333,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
																 	while (1)
															
 
																 	{
															
 
																-		local_iy = STARPU_ATOMIC_ADD(pcnt, 1) - 1;
															
 
																+		local_iy = STARPU_ATOMIC_ADD((unsigned int *)pcnt, 1) - 1;
															
 
																 		if (local_iy >= block_size)
															
 
																 			break;
															
--- a/examples/ppm_downscaler/ppm_downscaler.c
+++ b/examples/ppm_downscaler/ppm_downscaler.c
@@ -28,7 +28,7 @@
 
																 struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
															
 
																 {
															
 
																-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
															
 
																+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
															
 
																 	assert(ppm);
															
 
																 	ppm->ncols = ncols;
															
@@ -36,9 +36,9 @@ struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 
																 	ppm->coldepth = coldepth;
															
 
																 #ifdef STARPU_HAVE_MEMALIGN
															
 
																-	ppm->data = memalign(16384, ncols*nlines*sizeof(struct ppm_color));
															
 
																+	ppm->data = (struct ppm_color *) memalign(16384, ncols*nlines*sizeof(struct ppm_color));
															
 
																 #else
															
 
																-	ppm->data = malloc(ncols*nlines*sizeof(struct ppm_color));
															
 
																+	ppm->data = (struct ppm_color *) malloc(ncols*nlines*sizeof(struct ppm_color));
															
 
																 #endif
															
 
																 	assert(ppm->data);
															
@@ -49,7 +49,7 @@ struct ppm_image *file_to_ppm(char *filename)
 
																 {
															
 
																 	int ret;
															
 
																-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
															
 
																+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
															
 
																 	assert(ppm);
															
 
																 	FILE *file = fopen(filename, "r");
															
@@ -64,9 +64,9 @@ struct ppm_image *file_to_ppm(char *filename)
 
																 	/* allocate a buffer for the image */
															
 
																 #ifdef STARPU_HAVE_MEMALIGN
															
 
																-	ppm->data = memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
															
 
																+	ppm->data = (struct ppm_color *) memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
															
 
																 #else
															
 
																-	ppm->data = malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
															
 
																+	ppm->data = (struct ppm_color *) malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
															
 
																 #endif
															
 
																 	assert(ppm->data);
															
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -123,7 +123,7 @@ int main(int argc, char **argv)
 
																 	FILE *f_in = fopen(filename_in, "r");
															
 
																 	assert(f_in);
															
 
																-	struct yuv_frame *yuv_in_buffer = malloc(nframes*FRAMESIZE);
															
 
																+	struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE);
															
 
																 	fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
															
 
																 	/* allocate room for an output buffer */
															
@@ -131,16 +131,16 @@ int main(int argc, char **argv)
 
																 	assert(f_out);
															
 
																 /*	fprintf(stderr, "Alloc output file ...\n"); */
															
 
																-	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
															
 
																+	struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE);
															
 
																 	assert(yuv_out_buffer);
															
 
																-	starpu_data_handle *frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
															
 
																-	starpu_data_handle *frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
															
 
																-	starpu_data_handle *frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
															
 
																+	starpu_data_handle *frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
															
 
																+	starpu_data_handle *frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
															
 
																+	starpu_data_handle *frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
															
 
																-	starpu_data_handle *new_frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
															
 
																-	starpu_data_handle *new_frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
															
 
																-	starpu_data_handle *new_frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
															
 
																+	starpu_data_handle *new_frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
															
 
																+	starpu_data_handle *new_frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
															
 
																+	starpu_data_handle *new_frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
															
 
																 	starpu_init(NULL);
															
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -53,7 +53,7 @@ int main(int argc, char **argv)
 
																 		.nbuffers = 0
															
 
																 	};
															
 
																-	struct starpu_task **tasks = malloc(niter*sizeof(struct starpu_task *));
															
 
																+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
															
 
																 	assert(tasks);
															
 
																 	unsigned i;
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -164,11 +164,11 @@ int main(int argc, char **argv)
 
																 	unsigned long nelems = nblocks*entries_per_block;
															
 
																 	size_t size = nelems*sizeof(float);
															
 
																-	x = malloc(size);
															
 
																-	y = malloc(size);
															
 
																+	x = (float *) malloc(size);
															
 
																+	y = (float *) malloc(size);
															
 
																-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
															
 
																-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
															
 
																+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
															
 
																+	y_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
															
 
																 	assert(x && y);
															
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -129,8 +129,8 @@ int main(int argc, char **argv)
 
																 	unsigned long nelems = nblocks*entries_per_bock;
															
 
																 	size_t size = nelems*sizeof(TYPE);
															
 
																-	x = malloc(size);
															
 
																-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
															
 
																+	x = (TYPE *) malloc(size);
															
 
																+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
															
 
																 	assert(x && x_handles);
															
--- a/examples/socl/mandelbrot/mandelbrot.c
+++ b/examples/socl/mandelbrot/mandelbrot.c
@@ -21,7 +21,7 @@
 
																 #include <unistd.h>
															
 
																 /* Uncomment this to activate X11 display */
															
 
																-#define USE_X11
															
 
																+//#define USE_X11
															
 
																 #define SHORT_LOG 1
															
 
																 #define ROUND_ROBIN
															
--- a/examples/spmv/spmv.c
+++ b/examples/spmv/spmv.c
@@ -44,8 +44,8 @@ static void parse_args(int argc, char **argv)
 
																  * same number of non-zero entries. */
															
 
																 static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
															
 
																 {
															
 
																-	starpu_csr_interface_t *csr_father = father_interface;
															
 
																-	starpu_csr_interface_t *csr_child = child_interface;
															
 
																+	starpu_csr_interface_t *csr_father = (starpu_csr_interface_t *) father_interface;
															
 
																+	starpu_csr_interface_t *csr_child = (starpu_csr_interface_t *) child_interface;
															
 
																 	uint32_t nrow = csr_father->nrow;
															
 
																 	size_t elemsize = csr_father->elemsize;
															
--- a/examples/starpufft/.gitignore
+++ b/examples/starpufft/.gitignore
@@ -0,0 +1 @@
 
																+/.deps
															
--- a/examples/stencil/.gitignore
+++ b/examples/stencil/.gitignore
@@ -0,0 +1 @@
 
																+/.deps
															
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -90,7 +90,7 @@ int MPI_TAG1(int z, int iter, int dir)
 
																 /* Compute the size of the different blocks */
															
 
																 static void compute_block_sizes(void)
															
 
																 {
															
 
																-	block_sizes_z = malloc(nbz*sizeof(unsigned));
															
 
																+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
															
 
																 	STARPU_ASSERT(block_sizes_z);
															
 
																 	/* Perhaps the last chunk is smaller */
															
@@ -136,7 +136,7 @@ void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsi
 
																 	sizez = _sizez;
															
 
																 	/* Create a grid of block descriptors */
															
 
																-	blocks = calloc(nbz, sizeof(struct block_description));
															
 
																+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
															
 
																 	STARPU_ASSERT(blocks);
															
 
																 	/* What is the size of the different blocks ? */
															
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -165,10 +165,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 
																 /*
															
 
																  * Load a neighbour's boundary into block, CPU version
															
 
																  */
															
 
																-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_from_buffer_cpu(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
															
 
																+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -184,10 +186,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
 
																  * Load a neighbour's boundary into block, CUDA version
															
 
																  */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_from_buffer_cuda(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
															
 
																+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -245,16 +249,16 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
																 	for (i=1; i<=K; i++)
															
 
																 	{
															
 
																 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
															
 
																+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
															
 
																 		/* Shadow data */
															
 
																 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 		/* And perform actual computation */
															
 
																 #ifdef LIFE
															
 
																-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																 #endif /* LIFE */
															
 
																 	}
															
@@ -338,16 +342,16 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
																 	for (i=1; i<=K; i++)
															
 
																 	{
															
 
																 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
															
 
																+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
															
 
																 		/* Shadow data */
															
 
																 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 		/* And perform actual computation */
															
 
																 #ifdef LIFE
															
 
																-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
															
 
																+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, NULL);
															
 
																 #endif /* LIFE */
															
 
																 	}
															
@@ -365,7 +369,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 
																  */
															
 
																 static void update_func_cpu(void *descr[], void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
															
 
																 	if (block->bz == 0)
															
@@ -405,8 +409,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
																 	for (i=1; i<=K; i++)
															
 
																 	{
															
 
																-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
															
 
																-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
															
 
																+		starpu_block_interface_t *oldb = (starpu_block_interface_t *) descr[i%2], *newb = (starpu_block_interface_t *) descr[(i+1)%2];
															
 
																+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
															
 
																 		/* Shadow data */
															
 
																 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
															
@@ -424,9 +428,9 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
																 		/* And perform actual computation */
															
 
																 #ifdef LIFE
															
 
																-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
															
 
																+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
															
 
																 #endif /* LIFE */
															
 
																 	}
															
@@ -465,10 +469,12 @@ starpu_codelet cl_update = {
 
																  */
															
 
																 /* CPU version */
															
 
																-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_into_buffer_cpu(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
															
 
																+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -482,10 +488,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
																 /* CUDA version */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
															
 
																-					starpu_block_interface_t *boundary,
															
 
																+static void load_subblock_into_buffer_cuda(void *_block,
															
 
																+					void *_boundary,
															
 
																 					unsigned firstz)
															
 
																 {
															
 
																+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
															
 
																+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
															
 
																 	check_load(block, boundary);
															
 
																 	/* We do a contiguous memory transfer */
															
@@ -527,7 +535,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 
																 /* top save, CPU version */
															
 
																 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	top_per_worker[workerid]++;
															
@@ -543,7 +551,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 
																 /* bottom save, CPU version */
															
 
																 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	bottom_per_worker[workerid]++;
															
@@ -557,7 +565,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	top_per_worker[workerid]++;
															
@@ -574,7 +582,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
																 /* bottom save, CUDA version */
															
 
																 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	bottom_per_worker[workerid]++;
															
@@ -590,7 +598,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	top_per_worker[workerid]++;
															
@@ -610,7 +618,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 
																 /* bottom save, OPENCL version */
															
 
																 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
															
 
																 {
															
 
																-	struct block_description *block = arg;
															
 
																+	struct block_description *block = (struct block_description *) arg;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	bottom_per_worker[workerid]++;
															
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -132,9 +132,9 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
 
																 	display_memory_consumption(rank);
															
 
																 	who_runs_what_len = 2*niter;
															
 
																-	who_runs_what = calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
															
 
																-	who_runs_what_index = calloc(nbz, sizeof(*who_runs_what_index));
															
 
																-	last_tick = calloc(nbz, sizeof(*last_tick));
															
 
																+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
															
 
																+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
															
 
																+	last_tick = (struct timeval *) calloc(nbz, sizeof(*last_tick));
															
 
																 }
															
 
																 /*
															
--- a/examples/tag_example/tag_example.c
+++ b/examples/tag_example/tag_example.c
@@ -30,7 +30,7 @@
 
																 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 #define TAG(i, j, iter)	((starpu_tag_t) ( ((uint64_t)(iter)<<48) |  ((uint64_t)(j)<<24) | (i)) )
															
 
																-starpu_codelet cl;
															
 
																+starpu_codelet cl = {};
															
 
																 #define Ni	64
															
 
																 #define Nj	32
															
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -30,7 +30,7 @@
 
																 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
															
 
																-starpu_codelet cl;
															
 
																+starpu_codelet cl = {};
															
 
																 #define Ni	64
															
 
																 #define Nk	256
															
--- a/examples/tag_example/tag_example3.c
+++ b/examples/tag_example/tag_example3.c
@@ -30,7 +30,7 @@
 
																 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
															
 
																-starpu_codelet cl;
															
 
																+starpu_codelet cl = {};
															
 
																 #define Ni	64
															
 
																 #define Nk	256
															
--- a/examples/tag_example/tag_restartable.c
+++ b/examples/tag_example/tag_restartable.c
@@ -35,7 +35,7 @@
 
																 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )
															
 
																-starpu_codelet cl;
															
 
																+starpu_codelet cl = {};
															
 
																 #define Ni	64
															
 
																 #define Nk	256
															
@@ -134,7 +134,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
																 	FPRINTF(stderr, "ITER : %u\n", nk);
															
 
																 	for (i = 0; i < Nrolls; i++) {
															
 
																-		tasks[i] = malloc(ni * sizeof(*tasks[i]));
															
 
																+		tasks[i] = (struct starpu_task **) malloc(ni * sizeof(*tasks[i]));
															
 
																 		create_task_grid(i);
															
 
																 	}
															
--- a/examples/top/hello_world_top.c
+++ b/examples/top/hello_world_top.c
@@ -62,7 +62,7 @@ struct params {
 
																 };
															
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																-	struct params *params = cl_arg;
															
 
																+	struct params *params = (struct params *) cl_arg;
															
 
																 	//loosing time for top example...
															
 
																 	int sum = 0;
															
@@ -82,14 +82,14 @@ void cpu_func(void *buffers[], void *cl_arg)
 
																 void callback_name_changed(starputop_param* param)
															
 
																 {
															
 
																-	char* message = malloc(256);
															
 
																+	char* message = (char *) malloc(256);
															
 
																 	sprintf(message, "Name have been changed to %s", names[name_selected]);
															
 
																 	starputop_debug_log(message);
															
 
																 }
															
 
																 void callback_number_addition_changed(starputop_param* param)
															
 
																 {
															
 
																-	char* message = malloc(256);
															
 
																+	char* message = (char *) malloc(256);
															
 
																 	sprintf(message, "Number of addition is now %d", number_of_addition);
															
 
																 	starputop_debug_log(message);
															
--- a/gcc-plugin/README
+++ b/gcc-plugin/README
@@ -8,3 +8,5 @@ tasks.
 
																 Plug-ins are supported starting from GCC 4.5.
															
 
																 To run the test suite, GNU Guile 1.8.x or 2.0.x is needed.
															
 
																+
															
 
																+When building from SVN, GNU Bison 2.5+ is required.
															
--- a/gcc-plugin/examples/Makefile.am
+++ b/gcc-plugin/examples/Makefile.am
@@ -17,4 +17,9 @@ noinst_PROGRAMS =				\
 
																   matrix-mult
															
 
																 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
															
 
																+
															
 
																+AM_CPPFLAGS =						\
															
 
																+  -I$(top_srcdir)/include				\
															
 
																+  $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
															
 
																+
															
 
																 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
															
--- a/gcc-plugin/examples/matrix-mult.c
+++ b/gcc-plugin/examples/matrix-mult.c
@@ -187,34 +187,15 @@ main (int argc, char **argv)
 
																   gettimeofday (&start_register, NULL);
															
 
																   for (i = 0; i < nslicesy; i++)
															
 
																     for (j = 0; j < nslicesz; j++)
															
 
																-      {
															
 
																-	/* TODO: Get rid of the `ptr' and `size' variables when the pragma
															
 
																-	   parser supports arbitrary C expressions.  */
															
 
																-
															
 
																-	typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
															
 
																-	size_t size = (bzdim * bydim);
															
 
																-#pragma starpu register ptr size
															
 
																-      }
															
 
																+#pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)
															
 
																   for (i = 0; i < nslicesz; i++)
															
 
																-    {
															
 
																-      for (j = 0; j < nslicesx; j++)
															
 
																-	{
															
 
																-	  typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
															
 
																-	  size_t size = (bxdim * bzdim);
															
 
																-#pragma starpu register ptr size
															
 
																-	}
															
 
																-    }
															
 
																+    for (j = 0; j < nslicesx; j++)
															
 
																+#pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)
															
 
																   for (i = 0; i < nslicesy; i++)
															
 
																-    {
															
 
																-      for (j = 0; j < nslicesx; j++)
															
 
																-	{
															
 
																-	  typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
															
 
																-	  size_t size = (bxdim * bydim);
															
 
																-#pragma starpu register ptr size
															
 
																-	}
															
 
																-    }
															
 
																+    for (j = 0; j < nslicesx; j++)
															
 
																+#pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)
															
 
																   gettimeofday (&end_register, NULL);
															
@@ -256,25 +237,16 @@ main (int argc, char **argv)
 
																   gettimeofday (&start_unregister, NULL);
															
 
																   for (i = 0; i < nslicesy; i++)
															
 
																     for (j = 0; j < nslicesz; j++)
															
 
																-      {
															
 
																-	typeof (A) ptr =  &A[i*zdim*bydim + j*bzdim*bydim];
															
 
																-#pragma starpu unregister ptr
															
 
																-      }
															
 
																+#pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]
															
 
																   for (i = 0; i < nslicesz; i++)
															
 
																     for (j = 0; j < nslicesx; j++)
															
 
																-      {
															
 
																-	typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
															
 
																-#pragma starpu unregister ptr
															
 
																-      }
															
 
																+#pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]
															
 
																   for (i = 0; i < nslicesy; i++)
															
 
																     for (j = 0; j < nslicesx; j++)
															
 
																-      {
															
 
																-	typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
															
 
																-#pragma starpu unregister ptr
															
 
																-      }
															
 
																+#pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]
															
 
																   gettimeofday (&end_unregister, NULL);
															
 
																   gettimeofday (&end_all, NULL);
															
--- a/gcc-plugin/src/Makefile.am
+++ b/gcc-plugin/src/Makefile.am
@@ -17,7 +17,10 @@
 
																 # requires a name prefixed by `lib'.
															
 
																 pkglib_LTLIBRARIES = starpu.la
															
 
																-starpu_la_SOURCES = starpu.c
															
 
																+starpu_la_SOURCES = starpu.c c-expr.y
															
 
																+
															
 
																+AM_CPPFLAGS =						\
															
 
																+  -I$(top_srcdir)/include				\
															
 
																+  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1
															
 
																-AM_CPPFLAGS = -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall
															
 
																 AM_LDFLAGS = -module
															
--- a/gcc-plugin/src/c-expr.y
+++ b/gcc-plugin/src/c-expr.y
@@ -0,0 +1,234 @@
 
																+/* GCC-StarPU
															
 
																+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
															
 
																+
															
 
																+   GCC-StarPU is free software: you can redistribute it and/or modify
															
 
																+   it under the terms of the GNU General Public License as published by
															
 
																+   the Free Software Foundation, either version 3 of the License, or
															
 
																+   (at your option) any later version.
															
 
																+
															
 
																+   GCC-StarPU is distributed in the hope that it will be useful,
															
 
																+   but WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
															
 
																+   GNU General Public License for more details.
															
 
																+
															
 
																+   You should have received a copy of the GNU General Public License
															
 
																+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
															
 
																+
															
 
																+/* Parser for simple C expressions in pragmas.  */
															
 
																+
															
 
																+%define api.pure
															
 
																+%parse-param { location_t loc }
															
 
																+%parse-param { const char *pragma }
															
 
																+%parse-param { tree *seq }
															
 
																+%debug
															
 
																+
															
 
																+%{
															
 
																+  #include <starpu-gcc-config.h>
															
 
																+
															
 
																+  #include <gcc-plugin.h>
															
 
																+  #include <plugin.h>
															
 
																+  #include <tree.h>
															
 
																+  #include <cpplib.h>
															
 
																+
															
 
																+  #ifdef HAVE_C_FAMILY_C_COMMON_H
															
 
																+  # include <c-family/c-common.h>
															
 
																+  #elif HAVE_C_COMMON_H
															
 
																+  # include <c-common.h>
															
 
																+  #endif
															
 
																+
															
 
																+  #ifdef HAVE_C_FAMILY_C_PRAGMA_H
															
 
																+  # include <c-family/c-pragma.h>
															
 
																+  #elif HAVE_C_PRAGMA_H
															
 
																+  # include <c-pragma.h>
															
 
																+  #endif
															
 
																+
															
 
																+  #if !HAVE_DECL_BUILD_ARRAY_REF
															
 
																+  /* This declaration is missing in GCC 4.6.1.  */
															
 
																+  extern tree build_array_ref (location_t loc, tree array, tree index);
															
 
																+  #endif
															
 
																+
															
 
																+
															
 
																+  #define YYSTYPE tree
															
 
																+  #define YYLTYPE location_t
															
 
																+
															
 
																+  static void
															
 
																+  yyerror (location_t loc, const char *pragma, tree *seq,
															
 
																+	   char const *message)
															
 
																+  {
															
 
																+    error_at (loc, "parse error in pragma %qs: %s", pragma, message);
															
 
																+  }
															
 
																+
															
 
																+  /* Return SOMETHING if it's a VAR_DECL, an identifier bound to a VAR_DECL,
															
 
																+     or another object; raise an error otherwise.  */
															
 
																+
															
 
																+  static tree
															
 
																+  ensure_bound (location_t loc, tree something)
															
 
																+  {
															
 
																+    gcc_assert (something != NULL_TREE);
															
 
																+
															
 
																+    if (DECL_P (something))
															
 
																+      return something;
															
 
																+    else if (TREE_CODE (something) == IDENTIFIER_NODE)
															
 
																+      {
															
 
																+	tree var = lookup_name (something);
															
 
																+	if (var == NULL_TREE)
															
 
																+	  {
															
 
																+	    error_at (loc, "unbound variable %qE", something);
															
 
																+	    return error_mark_node;
															
 
																+	  }
															
 
																+	else
															
 
																+	  return var;
															
 
																+      }
															
 
																+
															
 
																+    return something;
															
 
																+  }
															
 
																+
															
 
																+  static tree
															
 
																+  build_component_ref (location_t loc, tree what, tree field)
															
 
																+  {
															
 
																+    sorry ("struct field access not implemented yet"); /* XXX */
															
 
																+    return error_mark_node;
															
 
																+  }
															
 
																+%}
															
 
																+
															
 
																+%code {
															
 
																+  /* Mapping of libcpp token names to Bison-generated token names.  This is
															
 
																+     not ideal but Bison cannot be told to use the `enum cpp_ttype'
															
 
																+     values.  */
															
 
																+  static const int cpplib_bison_token_map[] =
															
 
																+    {
															
 
																+      [CPP_NAME] = YCPP_NAME,
															
 
																+      [CPP_NUMBER] = YCPP_NUM,
															
 
																+      [CPP_AND] = YCPP_AND,
															
 
																+      [CPP_OPEN_SQUARE] = YCPP_OPEN_SQUARE,
															
 
																+      [CPP_CLOSE_SQUARE] = YCPP_CLOSE_SQUARE,
															
 
																+      [CPP_OPEN_PAREN] = YCPP_OPEN_PAREN,
															
 
																+      [CPP_CLOSE_PAREN] = YCPP_CLOSE_PAREN,
															
 
																+      [CPP_PLUS] = YCPP_PLUS,
															
 
																+      [CPP_MINUS] = YCPP_MINUS,
															
 
																+      [CPP_MULT] = YCPP_MULT,
															
 
																+      [CPP_DIV] = YCPP_DIV,
															
 
																+      [CPP_DOT] = YCPP_DOT,
															
 
																+      [CPP_DEREF] = YCPP_DEREF
															
 
																+    };
															
 
																+
															
 
																+  static int
															
 
																+  yylex (YYSTYPE *lvalp)
															
 
																+  {
															
 
																+    int ret;
															
 
																+
															
 
																+    ret = pragma_lex (lvalp);
															
 
																+    if (ret < sizeof cpplib_bison_token_map / sizeof cpplib_bison_token_map[0])
															
 
																+      ret = cpplib_bison_token_map[ret];
															
 
																+    else
															
 
																+      ret = -1;
															
 
																+
															
 
																+    return ret;
															
 
																+  }
															
 
																+}
															
 
																+
															
 
																+%token YCPP_NAME "identifier"
															
 
																+%token YCPP_NUM "integer"
															
 
																+%token YCPP_AND "&"
															
 
																+%token YCPP_OPEN_SQUARE "["
															
 
																+%token YCPP_CLOSE_SQUARE "]"
															
 
																+%token YCPP_OPEN_PAREN "("
															
 
																+%token YCPP_CLOSE_PAREN ")"
															
 
																+%token YCPP_PLUS "+"
															
 
																+%token YCPP_MINUS "-"
															
 
																+%token YCPP_MULT "*"
															
 
																+%token YCPP_DIV "/"
															
 
																+%token YCPP_DOT "."
															
 
																+%token YCPP_DEREF "->"
															
 
																+
															
 
																+%% /* Grammar rules.  */
															
 
																+
															
 
																+ /* Always return a TREE_LIST rather than a raw chain, because the elements
															
 
																+    of that list may be already chained for other purposes---e.g., PARM_DECLs
															
 
																+    of a function are chained together.  */
															
 
																+
															
 
																+sequence: expression {
															
 
																+          gcc_assert (*seq == NULL_TREE);
															
 
																+	  *seq = tree_cons (NULL_TREE, $1, NULL_TREE);
															
 
																+	  $$ = *seq;
															
 
																+      }
															
 
																+      | expression sequence {
															
 
																+	  gcc_assert ($2 == *seq);
															
 
																+	  *seq = tree_cons (NULL_TREE, $1, $2);
															
 
																+	  $$ = *seq;
															
 
																+      }
															
 
																+;
															
 
																+
															
 
																+expression: identifier | binary_expression | unary_expression;
															
 
																+
															
 
																+/* XXX: `ensure_bound' below leads to errors raised even for non-significant
															
 
																+   arguments---e.g., junk after pragma.  */
															
 
																+identifier: YCPP_NAME  { $$ = ensure_bound (loc, $1); }
															
 
																+;
															
 
																+
															
 
																+binary_expression: multiplicative_expression
															
 
																+     | additive_expression
															
 
																+;
															
 
																+
															
 
																+multiplicative_expression: multiplicative_expression YCPP_MULT cast_expression {
															
 
																+       $$ = build_binary_op (UNKNOWN_LOCATION, MULT_EXPR, $1, $3, 0);
															
 
																+     }
															
 
																+     | multiplicative_expression YCPP_DIV cast_expression {
															
 
																+       $$ = build_binary_op (UNKNOWN_LOCATION, TRUNC_DIV_EXPR, $1, $3, 0);
															
 
																+     }
															
 
																+     | cast_expression
															
 
																+;
															
 
																+
															
 
																+additive_expression: multiplicative_expression
															
 
																+     | additive_expression YCPP_PLUS multiplicative_expression {
															
 
																+       $$ = build_binary_op (UNKNOWN_LOCATION, PLUS_EXPR, $1, $3, 0);
															
 
																+     }
															
 
																+     | additive_expression YCPP_MINUS multiplicative_expression {
															
 
																+       $$ = build_binary_op (UNKNOWN_LOCATION, MINUS_EXPR, $1, $3, 0);
															
 
																+     }
															
 
																+;
															
 
																+
															
 
																+cast_expression: unary_expression
															
 
																+		 /* XXX: No support for '(' TYPE-NAME ')' UNARY-EXPRESSION.  */
															
 
																+;
															
 
																+
															
 
																+unary_expression:
															
 
																+       primary_expression
															
 
																+     | postfix_expression
															
 
																+     | YCPP_AND cast_expression {
															
 
																+       $$ = build_addr (ensure_bound (loc, $2), current_function_decl);
															
 
																+     }
															
 
																+;
															
 
																+
															
 
																+postfix_expression:
															
 
																+       primary_expression
															
 
																+     | postfix_expression YCPP_OPEN_SQUARE expression YCPP_CLOSE_SQUARE {
															
 
																+#if 1
															
 
																+	 /* Build the array ref with proper error checking.  */
															
 
																+	 $$ = build_array_ref (loc, ensure_bound (loc, $1),
															
 
																+			       ensure_bound (loc, $3));
															
 
																+#else /* TIMTOWTDI */
															
 
																+	 $$ = build_indirect_ref (loc,
															
 
																+	       build_binary_op (loc, PLUS_EXPR, ensure_bound (loc, $1), ensure_bound (loc, $3), 0),
															
 
																+		RO_ARRAY_INDEXING);
															
 
																+#endif
															
 
																+     }
															
 
																+     | postfix_expression YCPP_DOT identifier {
															
 
																+        $$ = build_component_ref (loc, ensure_bound (loc, $1), $2);
															
 
																+     }
															
 
																+     | postfix_expression YCPP_DEREF identifier {
															
 
																+        $$ = build_component_ref (loc,
															
 
																+               build_indirect_ref (loc, ensure_bound (loc, $1), RO_ARRAY_INDEXING),
															
 
																+               $2);
															
 
																+     }
															
 
																+;
															
 
																+
															
 
																+primary_expression: identifier
															
 
																+     | constant
															
 
																+     | YCPP_OPEN_PAREN expression YCPP_CLOSE_PAREN { $$ = $2; }
															
 
																+;
															
 
																+
															
 
																+constant: YCPP_NUM { $$ = $1; }
															
 
																+;
															
 
																+
															
 
																+%%
															
--- a/gcc-plugin/src/starpu-gcc-config.h.in
+++ b/gcc-plugin/src/starpu-gcc-config.h.in
@@ -21,3 +21,11 @@
 
																 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_ARRAY
															
 
																 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_VEC
															
 
																+
															
 
																+#undef HAVE_DECL_BUILD_ARRAY_REF
															
 
																+
															
 
																+#undef HAVE_C_FAMILY_C_COMMON_H
															
 
																+#undef HAVE_C_COMMON_H
															
 
																+
															
 
																+#undef HAVE_C_FAMILY_C_PRAGMA_H
															
 
																+#undef HAVE_C_PRAGMA_H
															
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
@@ -30,8 +30,19 @@ int plugin_is_GPL_compatible;
 
																 #include <cpplib.h>
															
 
																 #include <tree.h>
															
 
																 #include <tree-iterator.h>
															
 
																-#include <c-common.h>
															
 
																-#include <c-pragma.h>
															
 
																+
															
 
																+#ifdef HAVE_C_FAMILY_C_COMMON_H
															
 
																+# include <c-family/c-common.h>
															
 
																+#elif HAVE_C_COMMON_H
															
 
																+# include <c-common.h>
															
 
																+#endif
															
 
																+
															
 
																+#ifdef HAVE_C_FAMILY_C_PRAGMA_H
															
 
																+# include <c-family/c-pragma.h>
															
 
																+#elif HAVE_C_PRAGMA_H
															
 
																+# include <c-pragma.h>
															
 
																+#endif
															
 
																+
															
 
																 #include <tm.h>
															
 
																 #include <gimple.h>
															
 
																 #include <tree-pass.h>
															
@@ -75,6 +86,9 @@ static tree build_codelet_declaration (tree task_decl);
 
																 static tree build_task_body (const_tree task_decl);
															
 
																 static tree build_pointer_lookup (tree pointer);
															
 
																+static bool task_p (const_tree decl);
															
 
																+static bool task_implementation_p (const_tree decl);
															
 
																+
															
 
																 /* Lookup the StarPU function NAME in the global scope and store the result
															
 
																    in VAR (this can't be done from `lower_starpu'.)  */
															
@@ -119,7 +133,9 @@ build_call_expr_loc_vec (location_t loc, tree fndecl, VEC(tree,gc) *vec)
 
																 /* Build a reference to the INDEXth element of ARRAY.  `build_array_ref' is
															
 
																-   not exported, so we roll our own.  */
															
 
																+   not exported, so we roll our own.
															
 
																+   FIXME: This version may not work for array types and doesn't do as much
															
 
																+   type-checking as `build_array_ref'.  */
															
 
																 static tree
															
 
																 array_ref (tree array, size_t index)
															
@@ -347,48 +363,45 @@ handle_pragma_shutdown (struct cpp_reader *reader)
 
																 static void
															
 
																 handle_pragma_wait (struct cpp_reader *reader)
															
 
																 {
															
 
																-  tree fndecl;
															
 
																+  if (task_implementation_p (current_function_decl))
															
 
																+    {
															
 
																+      location_t loc;
															
 
																-  fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
															
 
																-  gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
															
 
																+      loc = cpp_peek_token (reader, 0)->src_loc;
															
 
																+
															
 
																+      /* TODO: In the future we could generate a task for the continuation
															
 
																+	 and have it depend on what's before here.  */
															
 
																+      error_at (loc, "task implementation is not allowed to wait");
															
 
																+    }
															
 
																+  else
															
 
																+    {
															
 
																+      tree fndecl;
															
 
																+
															
 
																+      fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
															
 
																+      gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
															
 
																-  add_stmt (build_call_expr (fndecl, 0));
															
 
																+      add_stmt (build_call_expr (fndecl, 0));
															
 
																+    }
															
 
																 }
															
 
																-/* Parse a pointer variable for PRAGMA, raising the appropriate error if
															
 
																-   needed.  Return the pointer variable on success, NULL_TREE otherwise.  */
															
 
																+/* The minimal C expression parser.  */
															
 
																+
															
 
																+extern int yyparse (location_t, const char *, tree *);
															
 
																+extern int yydebug;
															
 
																+
															
 
																+/* Parse expressions from the CPP reader for PRAGMA, which is located at LOC.
															
 
																+   Return a TREE_LIST of C expressions.  */
															
 
																 static tree
															
 
																-read_pragma_pointer_variable (const char *pragma, location_t loc)
															
 
																+read_pragma_expressions (const char *pragma, location_t loc)
															
 
																 {
															
 
																-  tree token, var = NULL_TREE;
															
 
																-  enum cpp_ttype type;
															
 
																-
															
 
																-  type = pragma_lex (&token);
															
 
																-  if (type == CPP_EOF)
															
 
																-    error_at (loc, "unterminated %<starpu %s%> pragma", pragma);
															
 
																-  else if (type != CPP_NAME)
															
 
																-    error_at (loc, "identifier expected");
															
 
																-  else
															
 
																-    {
															
 
																-      /* Get the variable name.  */
															
 
																-      tree var_name = token;
															
 
																-      tree decl = lookup_name (var_name);
															
 
																-
															
 
																-      if (decl == NULL_TREE || !DECL_P (decl))
															
 
																-	error_at (loc, "unbound variable %qE", var_name);
															
 
																-      else if (!POINTER_TYPE_P (TREE_TYPE (decl))
															
 
																-	       && TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
															
 
																-	error_at (loc, "%qE is neither a pointer nor an array", var_name);
															
 
																-      else
															
 
																-	{
															
 
																-	  var = decl;
															
 
																-	  TREE_USED (var) = true;
															
 
																-	  DECL_READ_P (var) = true;
															
 
																-	}
															
 
																-    }
															
 
																+  tree expr = NULL_TREE;
															
 
																+
															
 
																+  if (yyparse (loc, pragma, &expr))
															
 
																+    /* Parse error or memory exhaustion.  */
															
 
																+    expr = NULL_TREE;
															
 
																-  return var;
															
 
																+  return expr;
															
 
																 }
															
 
																 /* Process `#pragma starpu register VAR [COUNT]' and emit the corresponding
															
@@ -397,19 +410,37 @@ read_pragma_pointer_variable (const char *pragma, location_t loc)
 
																 static void
															
 
																 handle_pragma_register (struct cpp_reader *reader)
															
 
																 {
															
 
																-  tree token, var;
															
 
																+  tree args, ptr, count_arg;
															
 
																   location_t loc;
															
 
																-  enum cpp_ttype type;
															
 
																   loc = cpp_peek_token (reader, 0)->src_loc;
															
 
																-  var = read_pragma_pointer_variable ("register", loc);
															
 
																-  if (var == NULL_TREE)
															
 
																+  args = read_pragma_expressions ("register", loc);
															
 
																+  if (args == NULL_TREE)
															
 
																+    /* Parse error, presumably already handled by the parser.  */
															
 
																     return;
															
 
																-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE
															
 
																-      && !DECL_EXTERNAL (var)
															
 
																-      && !TREE_STATIC (var)
															
 
																+  /* First argument should be a pointer expression.  */
															
 
																+  ptr = TREE_VALUE (args);
															
 
																+  args = TREE_CHAIN (args);
															
 
																+
															
 
																+  if (ptr == error_mark_node)
															
 
																+    return;
															
 
																+
															
 
																+  if (!POINTER_TYPE_P (TREE_TYPE (ptr))
															
 
																+      && TREE_CODE (TREE_TYPE (ptr)) != ARRAY_TYPE)
															
 
																+    {
															
 
																+      error_at (loc, "%qE is neither a pointer nor an array", ptr);
															
 
																+      return;
															
 
																+    }
															
 
																+
															
 
																+  TREE_USED (ptr) = true;
															
 
																+  if (DECL_P (ptr))
															
 
																+    DECL_READ_P (ptr) = true;
															
 
																+
															
 
																+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE
															
 
																+      && !DECL_EXTERNAL (ptr)
															
 
																+      && !TREE_STATIC (ptr)
															
 
																       && !MAIN_NAME_P (DECL_NAME (current_function_decl)))
															
 
																     warning_at (loc, 0, "using an on-stack array as a task input "
															
 
																 		"considered unsafe");
															
@@ -417,9 +448,9 @@ handle_pragma_register (struct cpp_reader *reader)
 
																   /* Determine the number of elements in the vector.  */
															
 
																   tree count = NULL_TREE;
															
 
																-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE)
															
 
																+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE)
															
 
																     {
															
 
																-      tree domain = TYPE_DOMAIN (TREE_TYPE (var));
															
 
																+      tree domain = TYPE_DOMAIN (TREE_TYPE (ptr));
															
 
																       if (domain != NULL_TREE)
															
 
																 	{
															
@@ -435,52 +466,44 @@ handle_pragma_register (struct cpp_reader *reader)
 
																 	}
															
 
																     }
															
 
																+  /* Second argument is optional but should be an integer.  */
															
 
																+  count_arg = (args == NULL_TREE) ? NULL_TREE : TREE_VALUE (args);
															
 
																+  if (args != NULL_TREE)
															
 
																+    {
															
 
																+      /* Advance the list only; COUNT_ARG is a value, not a list cell.  */
															
 
																+      args = TREE_CHAIN (args);
															
 
																+    }
															
 
																-  type = pragma_lex (&token);
															
 
																-  if (type == CPP_EOF)
															
 
																+  if (count_arg == NULL_TREE)
															
 
																     {
															
 
																-      /* End of line reached: don't consume TOKEN and check whether the array
															
 
																-	 size was determined.  */
															
 
																+      /* End of line reached: check whether the array size was
															
 
																+	 determined.  */
															
 
																       if (count == NULL_TREE)
															
 
																 	{
															
 
																-	  error_at (loc, "cannot determine size of array %qE", DECL_NAME (var));
															
 
																+	  error_at (loc, "cannot determine size of array %qE", ptr);
															
 
																 	  return;
															
 
																 	}
															
 
																     }
															
 
																+  else if (count_arg == error_mark_node)
															
 
																+    /* COUNT_ARG could not be parsed and an error was already reported.  */
															
 
																+    return;
															
 
																+  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
															
 
																+    {
															
 
																+      error_at (loc, "%qE is not an integer", count_arg);
															
 
																+      return;
															
 
																+    }
															
 
																   else
															
 
																     {
															
 
																-      /* TOKEN may be a number or a integer variable.  */
															
 
																-
															
 
																-      tree count_arg;
															
 
																-
															
 
																-      if (TREE_CODE (token) == IDENTIFIER_NODE)
															
 
																-	{
															
 
																-	  count_arg = lookup_name (token);
															
 
																-	  if (count_arg == NULL_TREE)
															
 
																-	    {
															
 
																-	      error_at (loc, "unbound variable %qE", token);
															
 
																-	      return;
															
 
																-	    }
															
 
																-	  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
															
 
																-	    {
															
 
																-	      error_at (loc, "integer expected");
															
 
																-	      return;
															
 
																-	    }
															
 
																-
															
 
																-	  TREE_USED (count_arg) = true;
															
 
																-	  DECL_READ_P (count_arg) = true;
															
 
																-	}
															
 
																-      else if (TREE_CODE (token) != INTEGER_CST)
															
 
																-	error_at (loc, "integer expected");
															
 
																-      else
															
 
																-	count_arg = token;
															
 
																+      TREE_USED (count_arg) = true;
															
 
																+      if (DECL_P (count_arg))
															
 
																+	DECL_READ_P (count_arg) = true;
															
 
																       if (count != NULL_TREE)
															
 
																 	{
															
 
																 	  /* The number of elements of this array was already determined.  */
															
 
																 	  inform (loc,
															
 
																 		  "element count can be omitted for bounded array %qE",
															
 
																-		  DECL_NAME (var));
															
 
																+		  ptr);
															
 
																 	  if (count_arg != NULL_TREE)
															
 
																 	    {
															
@@ -489,7 +512,7 @@ handle_pragma_register (struct cpp_reader *reader)
 
																 		  if (!tree_int_cst_equal (count, count_arg))
															
 
																 		    error_at (loc, "specified element count differs "
															
 
																 			      "from actual size of array %qE",
															
 
																-			      DECL_NAME (var));
															
 
																+			      ptr);
															
 
																 		}
															
 
																 	      else
															
 
																 		/* Using a variable to determine the array size whereas the
															
@@ -501,19 +524,24 @@ handle_pragma_register (struct cpp_reader *reader)
 
																 	}
															
 
																       else
															
 
																 	count = count_arg;
															
 
																-
															
 
																-      if (pragma_lex (&token) != CPP_EOF)
															
 
																-	error_at (loc, "junk after %<starpu register%> pragma");
															
 
																     }
															
 
																-  /* If VAR is an array, take its address.  */
															
 
																+  /* Any remaining args?  */
															
 
																+  if (args != NULL_TREE)
															
 
																+    error_at (loc, "junk after %<starpu register%> pragma");
															
 
																+
															
 
																+  /* If PTR is an array, take its address.  */
															
 
																   tree pointer =
															
 
																-    POINTER_TYPE_P (TREE_TYPE (var))
															
 
																-    ? var
															
 
																-    : build_addr (var, current_function_decl);
															
 
																+    POINTER_TYPE_P (TREE_TYPE (ptr))
															
 
																+    ? ptr
															
 
																+    : build_addr (ptr, current_function_decl);
															
 
																   /* Introduce a local variable to hold the handle.  */
															
 
																-  tree handle_var = create_tmp_var (ptr_type_node, ".handle");
															
 
																+  tree handle_var = build_decl (loc, VAR_DECL, create_tmp_var_name (".handle"),
															
 
																+				ptr_type_node);
															
 
																+  DECL_CONTEXT (handle_var) = current_function_decl;
															
 
																+  DECL_ARTIFICIAL (handle_var) = true;
															
 
																+  DECL_INITIAL (handle_var) = NULL_TREE;
															
 
																   tree register_fn =
															
 
																     lookup_name (get_identifier ("starpu_vector_data_register"));
															
@@ -525,9 +553,13 @@ handle_pragma_register (struct cpp_reader *reader)
 
																 		     build_addr (handle_var, current_function_decl),
															
 
																 		     build_zero_cst (uintptr_type_node), /* home node */
															
 
																 		     pointer, count,
															
 
																-		     size_in_bytes (TREE_TYPE (TREE_TYPE (var))));
															
 
																+		     size_in_bytes (TREE_TYPE (TREE_TYPE (ptr))));
															
 
																-  add_stmt (call);
															
 
																+  tree bind;
															
 
																+  bind = build3 (BIND_EXPR, void_type_node, handle_var, call,
															
 
																+		 NULL_TREE);
															
 
																+
															
 
																+  add_stmt (bind);
															
 
																 }
															
 
																 /* Process `#pragma starpu acquire VAR' and emit the corresponding
															
@@ -539,16 +571,26 @@ handle_pragma_acquire (struct cpp_reader *reader)
 
																   static tree acquire_fn;
															
 
																   LOOKUP_STARPU_FUNCTION (acquire_fn, "starpu_data_acquire");
															
 
																-  tree token, var;
															
 
																+  tree args, var;
															
 
																   location_t loc;
															
 
																   loc = cpp_peek_token (reader, 0)->src_loc;
															
 
																-  var = read_pragma_pointer_variable ("acquire", loc);
															
 
																-  if (var == NULL_TREE)
															
 
																+  args = read_pragma_expressions ("acquire", loc);
															
 
																+  if (args == NULL_TREE)
															
 
																     return;
															
 
																-  if (pragma_lex (&token) != CPP_EOF)
															
 
																+  var = TREE_VALUE (args);
															
 
																+
															
 
																+  if (var == error_mark_node)
															
 
																+    return;
															
 
																+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
															
 
																+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
															
 
																+    {
															
 
																+      error_at (loc, "%qE is neither a pointer nor an array", var);
															
 
																+      return;
															
 
																+    }
															
 
																+  else if (TREE_CHAIN (args) != NULL_TREE)
															
 
																     error_at (loc, "junk after %<starpu acquire%> pragma");
															
 
																   /* If VAR is an array, take its address.  */
															
@@ -573,16 +615,26 @@ handle_pragma_unregister (struct cpp_reader *reader)
 
																   static tree unregister_fn;
															
 
																   LOOKUP_STARPU_FUNCTION (unregister_fn, "starpu_data_unregister");
															
 
																-  tree token, var;
															
 
																+  tree args, var;
															
 
																   location_t loc;
															
 
																   loc = cpp_peek_token (reader, 0)->src_loc;
															
 
																-  var = read_pragma_pointer_variable ("unregister", loc);
															
 
																-  if (var == NULL_TREE)
															
 
																+  args = read_pragma_expressions ("unregister", loc);
															
 
																+  if (args == NULL_TREE)
															
 
																     return;
															
 
																-  if (pragma_lex (&token) != CPP_EOF)
															
 
																+  var = TREE_VALUE (args);
															
 
																+
															
 
																+  if (var == error_mark_node)
															
 
																+    return;
															
 
																+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
															
 
																+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
															
 
																+    {
															
 
																+      error_at (loc, "%qE is neither a pointer nor an array", var);
															
 
																+      return;
															
 
																+    }
															
 
																+  else if (TREE_CHAIN (args) != NULL_TREE)
															
 
																     error_at (loc, "junk after %<starpu unregister%> pragma");
															
 
																   /* If VAR is an array, take its address.  */
															
--- a/gcc-plugin/tests/Makefile.am
+++ b/gcc-plugin/tests/Makefile.am
@@ -28,6 +28,7 @@ gcc_tests =					\
 
																   pointer-tasks.c				\
															
 
																   no-initialize.c				\
															
 
																   lib-user.c					\
															
 
																+  wait-errors.c					\
															
 
																   shutdown-errors.c
															
 
																 dist_noinst_HEADERS = mocks.h
															
--- a/gcc-plugin/tests/acquire-errors.c
+++ b/gcc-plugin/tests/acquire-errors.c
@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
																 #pragma starpu register x
															
 
																-#pragma starpu acquire /* (error "unterminated") */
															
 
																-#pragma starpu acquire 123 /* (error "identifier expected") */
															
 
																+#pragma starpu acquire /* (error "parse error") */
															
 
																+#pragma starpu acquire 123 /* (error "neither a pointer nor an array") */
															
 
																 #pragma starpu acquire does_not_exit /* (error "unbound variable") */
															
 
																 #pragma starpu acquire argc /* (error "neither a pointer nor an array") */
															
--- a/gcc-plugin/tests/lib-user.c
+++ b/gcc-plugin/tests/lib-user.c
@@ -28,7 +28,10 @@ main (int argc, char *argv[])
 
																 {
															
 
																 #pragma starpu initialize
															
 
																-  static const char x[] = { 0, 1, 2, 3, 4, 5 };
															
 
																+  /* Align X so that the assumptions behind `dummy_pointer_to_handle'
															
 
																+     hold.  */
															
 
																+  static const char x[] __attribute__ ((aligned (8))) = { 0, 1, 2, 3, 4, 5 };
															
 
																+
															
 
																   float y[sizeof x];
															
 
																   static const char forty_two = 42;
															
--- a/gcc-plugin/tests/register-errors.c
+++ b/gcc-plugin/tests/register-errors.c
@@ -23,9 +23,9 @@ main (int argc, char *argv[])
 
																 {
															
 
																 #pragma starpu initialize
															
 
																-#pragma starpu register /* (error "unterminated") */
															
 
																+#pragma starpu register /* (error "parse error") */
															
 
																-#pragma starpu register argv 234 junk right here /* (error "junk after") */
															
 
																+#pragma starpu register argv 234 junk here /* (error "junk after") *//* (error "unbound") *//* (error "unbound") */
															
 
																   static int x[123] __attribute__ ((unused));
															
 
																 #pragma starpu register x 234 /* (note "can be omitted") *//* (error "differs from actual size") */
															
@@ -37,9 +37,15 @@ main (int argc, char *argv[])
 
																 #pragma starpu register argv does_not_exit /* (error "unbound variable") */
															
 
																 #pragma starpu register argv /* (error "cannot determine size") */
															
 
																+#pragma starpu register &argv[2] /* (error "cannot determine size") */
															
 
																+#pragma starpu register &x[2] /* (error "cannot determine size") */
															
 
																 #pragma starpu register argc /* (error "neither a pointer nor an array") */
															
 
																+#pragma starpu register argv[2][3] 3 /* (error "neither a pointer nor an array") */
															
 
																+
															
 
																+#pragma starpu register argv[does_not_exist] 3 /* (error "unbound variable") */
															
 
																+
															
 
																   char **p = argv;
															
 
																   size_t ps = argc;
															
 
																 #pragma starpu register p ps  /* No unused variable warning, please! */
															
--- a/gcc-plugin/tests/register.c
+++ b/gcc-plugin/tests/register.c
@@ -31,6 +31,15 @@ foo (void)
 
																 #pragma starpu register x /* (warning "considered unsafe") */
															
 
																 }
															
 
																+static void
															
 
																+bar (float *p, int s)
															
 
																+{
															
 
																+  expected_register_arguments.pointer = p;
															
 
																+  expected_register_arguments.elements = s;
															
 
																+  expected_register_arguments.element_size = sizeof *p;
															
 
																+#pragma starpu register p s
															
 
																+}
															
 
																+
															
 
																 int
															
 
																 main (int argc, char *argv[])
															
 
																 {
															
@@ -39,6 +48,7 @@ main (int argc, char *argv[])
 
																   int x[123];
															
 
																   double *y;
															
 
																   static char z[345];
															
 
																+  static float m[7][42];
															
 
																   short w[] = { 1, 2, 3 };
															
 
																   size_t y_size = 234;
															
@@ -84,8 +94,36 @@ main (int argc, char *argv[])
 
																 #undef N
															
 
																   foo ();
															
 
																+  bar ((float *) argv, argc);
															
 
																+
															
 
																+  expected_register_arguments.pointer = argv;
															
 
																+  expected_register_arguments.elements = argc;
															
 
																+  expected_register_arguments.element_size = sizeof argv[0];
															
 
																+
															
 
																+  int chbouib = argc;
															
 
																+#pragma starpu register argv chbouib
															
 
																+
															
 
																+  expected_register_arguments.pointer = &argv[2];
															
 
																+  expected_register_arguments.elements = 3;
															
 
																+  expected_register_arguments.element_size = sizeof argv[0];
															
 
																+#pragma starpu register &argv[2] 3
															
 
																+
															
 
																+  expected_register_arguments.pointer = &argv[argc + 3 / 2];
															
 
																+  expected_register_arguments.elements = argc * 4;
															
 
																+  expected_register_arguments.element_size = sizeof argv[0];
															
 
																+#pragma starpu register &argv[argc + 3 / 2] (argc * 4)
															
 
																+
															
 
																+  expected_register_arguments.pointer = &y[y_size / 2];
															
 
																+  expected_register_arguments.elements = (y_size / 2 - 7);
															
 
																+  expected_register_arguments.element_size = sizeof y[0];
															
 
																+#pragma starpu register &y[y_size / 2] (y_size / 2 - 7)
															
 
																+
															
 
																+  expected_register_arguments.pointer = m[6];
															
 
																+  expected_register_arguments.elements = 42;
															
 
																+  expected_register_arguments.element_size = sizeof m[0][0];
															
 
																+#pragma starpu register m[6]
															
 
																-  assert (data_register_calls == 8);
															
 
																+  assert (data_register_calls == 14);
															
 
																   free (y);
															
--- a/gcc-plugin/tests/run-test.in
+++ b/gcc-plugin/tests/run-test.in
@@ -85,11 +85,11 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
 
																     ;; the real file name.
															
 
																     ,(string-append "-fplugin=" %builddir "/../src/.libs/starpu.so")
															
 
																-    "-g" "-O2"
															
 
																+    "-g"
															
 
																     "-fdump-tree-gimple" "-Wall"))
															
 
																 (define %default-ldflags
															
 
																-  `(,(string-append "-L" %srcdir "/../../src")))
															
 
																+  `(,(string-append "-L" %builddir "/../../src")))
															
 
																 (define %libtool
															
 
																   (string-append %builddir "/../../libtool"))
															
@@ -105,7 +105,7 @@ compiler status and the list of lines printed on stdout/stderr."
 
																          (mode     (if compile?
															
 
																                        "compile"
															
 
																                        "link"))
															
 
																-         (command  (format #f "LANG=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
															
 
																+         (command  (format #f "LC_ALL=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
															
 
																                            %libtool mode cc cflags file ldflags))
															
 
																          (pipe     (begin
															
 
																                      (log "running `~a'" command)
															
@@ -311,7 +311,7 @@ otherwise."
 
																                 file (length dependencies) dependencies))
															
 
																     (and (every (cut compile/match <> cc cflags ldflags)
															
 
																-                dependencies)
															
 
																+                (map (cut string-append %srcdir "/" <>) dependencies))
															
 
																          (let*-values (((goal)
															
 
																                         (if error-expected?
															
 
																                             'compile
															
@@ -421,6 +421,12 @@ otherwise."
 
																 ;;;
															
 
																 (define (build/run . file)
															
 
																-  (exit (every (cut compile/match <> %gcc %default-cflags %default-ldflags) file)))
															
 
																+  (exit (every (lambda (file)
															
 
																+                 ;; For each file, check that everything works both with and
															
 
																+                 ;; without optimizations.
															
 
																+                 (every (cut compile/match file %gcc <> %default-ldflags)
															
 
																+                        `((,"-O0" ,@%default-cflags)
															
 
																+                          (,"-O2" ,@%default-cflags))))
															
 
																+               file)))
															
 
																 ;;; run-test.in ends here
															
--- a/gcc-plugin/tests/unregister-errors.c
+++ b/gcc-plugin/tests/unregister-errors.c
@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
																 #pragma starpu register x
															
 
																-#pragma starpu unregister /* (error "unterminated") */
															
 
																-#pragma starpu unregister 123 /* (error "identifier expected") */
															
 
																+#pragma starpu unregister /* (error "parse error") */
															
 
																+#pragma starpu unregister 123 /* (error "neither a pointer nor an array") */
															
 
																 #pragma starpu unregister does_not_exit /* (error "unbound variable") */
															
 
																 #pragma starpu unregister argc /* (error "neither a pointer nor an array") */
															
--- a/gcc-plugin/tests/wait-errors.c
+++ b/gcc-plugin/tests/wait-errors.c
@@ -0,0 +1,27 @@
 
																+/* GCC-StarPU
															
 
																+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
															
 
																+
															
 
																+   GCC-StarPU is free software: you can redistribute it and/or modify
															
 
																+   it under the terms of the GNU General Public License as published by
															
 
																+   the Free Software Foundation, either version 3 of the License, or
															
 
																+   (at your option) any later version.
															
 
																+
															
 
																+   GCC-StarPU is distributed in the hope that it will be useful,
															
 
																+   but WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
															
 
																+   GNU General Public License for more details.
															
 
																+
															
 
																+   You should have received a copy of the GNU General Public License
															
 
																+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
															
 
																+
															
 
																+/* The task under test.  */
															
 
																+
															
 
																+void task (int x, char y, int z) __attribute__ ((task));
															
 
																+static void task_cpu (int x, char y, int z)
															
 
																+  __attribute__ ((task_implementation ("cpu", task)));
															
 
																+
															
 
																+static void
															
 
																+task_cpu (int x, char y, int z)
															
 
																+{
															
 
																+#pragma starpu wait /* (error "not allowed") */
															
 
																+}
															
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -46,6 +46,7 @@
 
																 #undef STARPU_MAXCUDADEVS
															
 
																 #undef STARPU_MAXOPENCLDEVS
															
 
																 #undef STARPU_NMAXWORKERS
															
 
																+#undef STARPU_MAXIMPLEMENTATIONS
															
 
																 #undef STARPU_HAVE_LIBNUMA
															
--- a/include/starpu_data_filters.h
+++ b/include/starpu_data_filters.h
@@ -19,6 +19,8 @@
 
																 #ifndef __STARPU_DATA_FILTERS_H__
															
 
																 #define __STARPU_DATA_FILTERS_H__
															
 
																+#include <stdarg.h>
															
 
																+
															
 
																 #include <starpu.h>
															
 
																 #include <starpu_config.h>
															
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -104,7 +105,7 @@ struct starpu_perfmodel_t {
 
																 	double (*cost_model)(struct starpu_buffer_descr_t *);
															
 
																 	/* per-architecture model */
															
 
																-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS];
															
 
																+	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
															
 
																 	const char *symbol;
															
@@ -126,12 +127,12 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
 
																  * performance model files */
															
 
																 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
															
 
																 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
															
 
																-		enum starpu_perf_archtype arch, char *path, size_t maxlen);
															
 
																-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,
															
 
																-		char *archname, size_t maxlen);
															
 
																+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
															
 
																+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
															
 
																 int starpu_list_models(void);
															
 
																 void starpu_force_bus_sampling(void);
															
 
																+void starpu_print_bus_bandwidth(FILE *f);
															
 
																 #ifdef __cplusplus
															
 
																 }
															
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -127,7 +128,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 
																 #endif
															
 
																 /* Check if the worker specified by workerid can execute the codelet. */
															
 
																-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
															
 
																+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																 /* The scheduling policy may put tasks directly into a worker's local queue so
															
 
																  * that it is not always necessary to create its own queue when the local queue
															
@@ -147,6 +148,7 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back);
 
																 /* By convention, the default priority level should be 0 so that we can
															
 
																  * statically allocate tasks with a default priority. */
															
 
																 #define STARPU_DEFAULT_PRIO	0
															
 
																+
															
 
																 int starpu_sched_get_min_priority(void);
															
 
																 int starpu_sched_get_max_priority(void);
															
@@ -164,7 +166,7 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 
																 /* Get the description of a combined worker */
															
 
																 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
															
 
																 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
															
 
																-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
															
 
																+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																 /*
															
 
																  *	Data prefetching
															
@@ -182,7 +184,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
 
																 /* Return the current date */
															
 
																 double starpu_timing_now(void);
															
 
																 /* Returns expected task duration in µs */
															
 
																-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch);
															
 
																+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
															
 
																 /* Returns an estimated speedup factor relative to CPU speed */
															
 
																 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
															
 
																 /* Returns expected data transfer time in µs */
															
@@ -190,7 +192,7 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 
																 /* Predict the transfer time (in µs) to move a handle to a memory node */
															
 
																 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
															
 
																 /* Returns expected power consumption in J */
															
 
																-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch);
															
 
																+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
															
 
																 /* Waits until all the tasks of a worker, already submitted, have been executed */
															
 
																 int starpu_wait_for_all_tasks_of_worker(int workerid);
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -23,7 +24,7 @@
 
																 #include <starpu_config.h>
															
 
																 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
															
 
																-#include <cuda.h>
															
 
																+# include <cuda.h>
															
 
																 #endif
															
 
																 #include <starpu_data.h>
															
@@ -56,6 +57,18 @@ extern "C" {
 
																 typedef uint64_t starpu_tag_t;
															
 
																+
															
 
																+typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
															
 
																+typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
															
 
																+typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
															
 
																+typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
															
 
																+
															
 
																+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    (starpu_cpu_func_t) -1
															
 
																+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   (starpu_cuda_func_t) -1
															
 
																+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS (starpu_opencl_func_t) -1
															
 
																+#define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
															
 
																+
															
 
																+
															
 
																 /*
															
 
																  * A codelet describes the various function 
															
 
																  * that may be called from a worker
															
@@ -72,6 +85,11 @@ typedef struct starpu_codelet_t {
 
																 	void (*opencl_func)(void **, void *);
															
 
																 	uint8_t gordon_func;
															
 
																+	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
															
 
																+	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
															
 
																+	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
															
 
																+	starpu_gordon_func_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS];
															
 
																+
															
 
																 	/* how many buffers do the codelet takes as argument ? */
															
 
																 	unsigned nbuffers;
															
--- a/include/starpu_task_bundle.h
+++ b/include/starpu_task_bundle.h
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -71,10 +72,10 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 
																 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
															
 
																 /* Return the expected duration of the entire task bundle in µs. */
															
 
																-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch);
															
 
																+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
															
 
																 /* Return the time (in µs) expected to transfer all data used within the bundle */
															
 
																 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
															
 
																 /* Return the expected power consumption of the entire task bundle in J. */
															
 
																-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch);
															
 
																+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
															
 
																 #endif // __STARPU_TASK_BUNDLE_H__
															
--- a/m4/.gitignore
+++ b/m4/.gitignore
@@ -0,0 +1,2 @@
 
																+/libtool.m4
															
 
																+/lt*.m4
															
--- a/m4/gcc.m4
+++ b/m4/gcc.m4
@@ -51,8 +51,16 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
 
																     dnl Reason:
															
 
																     dnl   build_call_expr_loc_array -- not in GCC 4.5.x; appears in 4.6
															
 
																     dnl   build_call_expr_loc_vec   -- likewise
															
 
																+    dnl   build_array_ref           -- present but undeclared in 4.6.1
															
 
																     _STARPU_WITH_GCC_PLUGIN_API([
															
 
																-      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec],
															
 
																+      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
															
 
																+                      build_array_ref],
															
 
																+        [], [], [#include <gcc-plugin.h>
															
 
																+	         #include <tree.h>])
															
 
																+
															
 
																+      dnl Work around header naming issues introduced upstream and in Debian
															
 
																+      dnl (see <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=631082>).
															
 
																+      AC_CHECK_HEADERS([c-common.h c-pragma.h c-family/c-common.h c-family/c-pragma.h],
															
 
																         [], [], [#include <gcc-plugin.h>
															
 
																 	         #include <tree.h>])
															
 
																     ])
															
--- a/mpi/.gitignore
+++ b/mpi/.gitignore
@@ -0,0 +1 @@
 
																+/.deps
															
--- a/mpi/tests/.gitignore
+++ b/mpi/tests/.gitignore
@@ -0,0 +1 @@
 
																+/.deps
															
--- a/socl/src/Makefile.am
+++ b/socl/src/Makefile.am
@@ -24,16 +24,18 @@ SUBDIRS =
 
																 lib_LTLIBRARIES = libsocl.la
															
 
																 libsocl_la_SOURCES = 						\
															
 
																-  socl.c \
															
 
																-  gc.c \
															
 
																-  graph.c \
															
 
																+  command.c \
															
 
																+  command_list.c \
															
 
																+  command_queue.c \
															
 
																+  debug.c \
															
 
																+  devices.c \
															
 
																   event.c \
															
 
																+  gc.c \
															
 
																   init.c \
															
 
																-  task.c \
															
 
																-  command_queue.c \
															
 
																   mem_objects.c \
															
 
																+  socl.c \
															
 
																+  task.c \
															
 
																   util.c \
															
 
																-  devices.c \
															
 
																   cl_getplatformids.c \
															
 
																   cl_getplatforminfo.c \
															
 
																   cl_getdeviceids.c \
															
--- a/socl/src/cl_createcommandqueue.c
+++ b/socl/src/cl_createcommandqueue.c
@@ -30,8 +30,7 @@ static void release_callback_command_queue(void * e) {
 
																   gc_entity_unstore(&cq->context);
															
 
																   /* Destruct object */
															
 
																-  pthread_spin_destroy(&cq->spin);
															
 
																-  free(cq->events);
															
 
																+  pthread_mutex_destroy(&cq->mutex);
															
 
																 }
															
@@ -68,9 +67,9 @@ soclCreateCommandQueue(cl_context                   context,
 
																       profiling_queue_count += 1;
															
 
																    }
															
 
																-   cq->events = NULL;
															
 
																+   cq->commands = NULL;
															
 
																    cq->barrier = NULL;
															
 
																-   pthread_spin_init(&cq->spin, 0);
															
 
																+   pthread_mutex_init(&cq->mutex, NULL);
															
 
																    if (errcode_ret != NULL)
															
 
																       *errcode_ret = CL_SUCCESS;
															
--- a/socl/src/cl_createkernel.c
+++ b/socl/src/cl_createkernel.c
@@ -32,23 +32,23 @@ static void soclCreateKernel_task(void *data) {
 
																    }
															
 
																    /* One worker creates argument structures */
															
 
																-   if (__sync_bool_compare_and_swap(&k->arg_count, 0, 666)) {
															
 
																+   if (__sync_bool_compare_and_swap(&k->num_args, 0, 666)) {
															
 
																       unsigned int i;
															
 
																-      cl_uint arg_count;
															
 
																+      cl_uint num_args;
															
 
																-      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(arg_count), &arg_count, NULL);
															
 
																+      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
															
 
																       if (err != CL_SUCCESS) {
															
 
																          DEBUG_CL("clGetKernelInfo", err);
															
 
																          ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
															
 
																       }
															
 
																-      k->arg_count = arg_count;
															
 
																-      DEBUG_MSG("Kernel has %d arguments\n", arg_count);
															
 
																+      k->num_args = num_args;
															
 
																+      DEBUG_MSG("Kernel has %d arguments\n", num_args);
															
 
																-      k->arg_size = (size_t*)malloc(sizeof(size_t) * arg_count);
															
 
																-      k->arg_value = (void**)malloc(sizeof(void*) * arg_count);
															
 
																-      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * arg_count);
															
 
																+      k->arg_size = (size_t*)malloc(sizeof(size_t) * num_args);
															
 
																+      k->arg_value = (void**)malloc(sizeof(void*) * num_args);
															
 
																+      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * num_args);
															
 
																       /* Settings default type to NULL */
															
 
																-      for (i=0; i<arg_count; i++) {
															
 
																+      for (i=0; i<num_args; i++) {
															
 
																          k->arg_value[i] = NULL;
															
 
																          k->arg_type[i] = Null;
															
 
																       }
															
@@ -70,7 +70,7 @@ static void release_callback_kernel(void * e) {
 
																   //Free args
															
 
																   unsigned int j;
															
 
																-  for (j=0; j<kernel->arg_count; j++) {
															
 
																+  for (j=0; j<kernel->num_args; j++) {
															
 
																     switch (kernel->arg_type[j]) {
															
 
																       case Null:
															
 
																         break;
															
@@ -125,7 +125,7 @@ soclCreateKernel(cl_program    program,
 
																    gc_entity_store(&k->program, program);
															
 
																    k->kernel_name = strdup(kernel_name);
															
 
																-   k->arg_count = 0;
															
 
																+   k->num_args = 0;
															
 
																    k->arg_value = NULL;
															
 
																    k->arg_size = NULL;
															
--- a/socl/src/cl_createprogramwithsource.c
+++ b/socl/src/cl_createprogramwithsource.c
@@ -40,11 +40,11 @@ static void soclCreateProgramWithSource_task(void *data) {
 
																 }
															
 
																 static void rp_task(void *data) {
															
 
																-   struct _cl_program *d = (struct _cl_program*)data;
															
 
																+   cl_program program = (cl_program)data;
															
 
																    int range = starpu_worker_get_range();
															
 
																-   cl_int err = clReleaseProgram(d->cl_programs[range]);
															
 
																+   cl_int err = clReleaseProgram(program->cl_programs[range]);
															
 
																    if (err != CL_SUCCESS)
															
 
																       DEBUG_CL("clReleaseProgram", err);
															
 
																 }
															
--- a/socl/src/cl_enqueuebarrier.c
+++ b/socl/src/cl_enqueuebarrier.c
@@ -19,8 +19,9 @@
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueBarrier(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-   cl_event ev = enqueueBarrier(cq);   
															
 
																-   gc_entity_release(ev);
															
 
																+	command_marker cmd = command_barrier_create();
															
 
																-   return CL_SUCCESS;
															
 
																+	command_queue_enqueue(cq, cmd, 0, NULL);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueuecopybuffer.c
+++ b/socl/src/cl_enqueuecopybuffer.c
@@ -66,6 +66,42 @@ static starpu_codelet codelet_copybuffer = {
 
																    .nbuffers = 2
															
 
																 };
															
 
																+cl_int command_copy_buffer_submit(command_copy_buffer cmd) {
															
 
																+	/* Aliases */
															
 
																+	cl_mem src_buffer = cmd->src_buffer;
															
 
																+	cl_mem dst_buffer = cmd->dst_buffer;
															
 
																+	size_t src_offset = cmd->src_offset;
															
 
																+	size_t dst_offset = cmd->dst_offset;
															
 
																+	size_t cb = cmd->cb;
															
 
																+
															
 
																+	struct starpu_task *task;
															
 
																+	struct arg_copybuffer *arg;
															
 
																+
															
 
																+	task = task_create(CL_COMMAND_COPY_BUFFER);
															
 
																+
															
 
																+	task->buffers[0].handle = src_buffer->handle;
															
 
																+	task->buffers[0].mode = STARPU_R;
															
 
																+	task->buffers[1].handle = dst_buffer->handle;
															
 
																+	task->buffers[1].mode = STARPU_RW;
															
 
																+	task->cl = &codelet_copybuffer;
															
 
																+
															
 
																+	arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
															
 
																+	arg->src_offset = src_offset;
															
 
																+	arg->dst_offset = dst_offset;
															
 
																+	arg->cb = cb;
															
 
																+	gc_entity_store(&arg->src_buffer, src_buffer);
															
 
																+	gc_entity_store(&arg->dst_buffer, dst_buffer);
															
 
																+	task->cl_arg = arg;
															
 
																+	task->cl_arg_size = sizeof(struct arg_copybuffer);
															
 
																+
															
 
																+	dst_buffer->scratch = 0;
															
 
																+
															
 
																+	task_submit(task, cmd);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueCopyBuffer(cl_command_queue  cq, 
															
 
																                     cl_mem              src_buffer,
															
@@ -77,35 +113,11 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
 
																                     const cl_event *    events,
															
 
																                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-   struct starpu_task *task;
															
 
																-   struct arg_copybuffer *arg;
															
 
																-   cl_event ev;
															
 
																-
															
 
																-   task = task_create(CL_COMMAND_COPY_BUFFER);
															
 
																-   ev = task_event(task);
															
 
																-
															
 
																-   task->buffers[0].handle = src_buffer->handle;
															
 
																-   task->buffers[0].mode = STARPU_R;
															
 
																-   task->buffers[1].handle = dst_buffer->handle;
															
 
																-   task->buffers[1].mode = STARPU_RW;
															
 
																-   task->cl = &codelet_copybuffer;
															
 
																-
															
 
																-   arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
															
 
																-   arg->src_offset = src_offset;
															
 
																-   arg->dst_offset = dst_offset;
															
 
																-   arg->cb = cb;
															
 
																-   gc_entity_store(&arg->src_buffer, src_buffer);
															
 
																-   gc_entity_store(&arg->dst_buffer, dst_buffer);
															
 
																-   task->cl_arg = arg;
															
 
																-   task->cl_arg_size = sizeof(struct arg_copybuffer);
															
 
																-
															
 
																-   dst_buffer->scratch = 0;
															
 
																-
															
 
																-   DEBUG_MSG("Submitting CopyBuffer task (event %d)\n", ev->id);
															
 
																+	command_copy_buffer cmd = command_copy_buffer_create(src_buffer, dst_buffer, src_offset, dst_offset, cb);
															
 
																-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-   RETURN_EVENT(ev, event);
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																-   return ret;
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueuemapbuffer.c
+++ b/socl/src/cl_enqueuemapbuffer.c
@@ -16,76 +16,53 @@
 
																 #include "socl.h"
															
 
																-struct mb_data {
															
 
																-  cl_event ev;
															
 
																-  cl_mem buffer;
															
 
																-  cl_map_flags map_flags;
															
 
																-};
															
 
																-
															
 
																 static void mapbuffer_callback(void *args) {
															
 
																-  struct mb_data *arg = (struct mb_data*)args;
															
 
																-
															
 
																-  starpu_tag_notify_from_apps(arg->ev->id);
															
 
																-  arg->ev->status = CL_COMPLETE;
															
 
																+	command_map_buffer cmd = (command_map_buffer)args;
															
 
																-  gc_entity_unstore(&arg->ev);
															
 
																-  gc_entity_unstore(&arg->buffer);
															
 
																-  free(args);
															
 
																+	starpu_tag_notify_from_apps(cmd->event->id);
															
 
																+	cmd->event->status = CL_COMPLETE;
															
 
																 }
															
 
																 static void mapbuffer_task(void *args) {
															
 
																-  struct mb_data *arg = (struct mb_data*)args;
															
 
																+	command_map_buffer cmd = (command_map_buffer)args;
															
 
																+
															
 
																+	starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
															
 
																+
															
 
																+	starpu_data_acquire_cb(cmd->buffer->handle, mode, mapbuffer_callback, cmd);
															
 
																+}
															
 
																-  starpu_access_mode mode = (arg->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
															
 
																+cl_int command_map_buffer_submit(command_map_buffer cmd) {
															
 
																+	starpu_task task = task_create_cpu(mapbuffer_task, cmd, 0);
															
 
																-  starpu_data_acquire_cb(arg->buffer->handle, mode, mapbuffer_callback, arg);
															
 
																+	task_submit(task, cmd);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
 
																 CL_API_ENTRY void * CL_API_CALL
															
 
																 soclEnqueueMapBuffer(cl_command_queue cq,
															
 
																                    cl_mem           buffer,
															
 
																-                   cl_bool          blocking_map, 
															
 
																+                   cl_bool          blocking, 
															
 
																                    cl_map_flags     map_flags,
															
 
																                    size_t           offset, 
															
 
																-                   size_t           UNUSED(cb),
															
 
																+                   size_t           cb,
															
 
																                    cl_uint          num_events,
															
 
																                    const cl_event * events,
															
 
																                    cl_event *       event,
															
 
																                    cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-   struct starpu_task *task;
															
 
																-   struct mb_data *arg;
															
 
																-   cl_event ev;
															
 
																-   cl_int err;
															
 
																-
															
 
																-   /* Create custom event that will be triggered when map is complete */
															
 
																-   ev = event_create();
															
 
																-
															
 
																-   /* Store arguments */
															
 
																-   arg = (struct mb_data*)malloc(sizeof(struct mb_data));
															
 
																-   arg->map_flags = map_flags;
															
 
																-   gc_entity_store(&arg->ev, ev);
															
 
																-   gc_entity_store(&arg->buffer, buffer);
															
 
																-
															
 
																-   /* Create StarPU task */
															
 
																-   task = task_create_cpu(CL_COMMAND_MAP_BUFFER, mapbuffer_task, arg, 0);
															
 
																-   cl_event map_event = task_event(task);
															
 
																-
															
 
																-   /* Enqueue task */
															
 
																-   DEBUG_MSG("Submitting MapBuffer task (event %d)\n", ev->id);
															
 
																-   err = command_queue_enqueue_fakeevent(cq, task, 0, num_events, events, ev);
															
 
																-   gc_entity_release(map_event);
															
 
																+	cl_event ev = event_create();
															
 
																-   if (errcode_ret != NULL)
															
 
																-      *errcode_ret = err;
															
 
																+	command_map_buffer cmd = command_map_buffer_create(buffer, map_flags, offset, cb, ev);
															
 
																-   if (err != CL_SUCCESS)
															
 
																-      return NULL;
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-   if (blocking_map == CL_TRUE)
															
 
																-      soclWaitForEvents(1, &ev);
															
 
																+	if (errcode_ret != NULL)
															
 
																+		*errcode_ret = CL_SUCCESS;
															
 
																-   RETURN_EVENT(ev, event);
															
 
																+	RETURN_CUSTOM_EVENT(ev,event);
															
 
																-   return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
															
 
																+	MAY_BLOCK_CUSTOM(blocking,ev);
															
 
																+	
															
 
																+	return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
															
 
																 }
															
--- a/socl/src/cl_enqueuemarker.c
+++ b/socl/src/cl_enqueuemarker.c
@@ -20,11 +20,21 @@ CL_API_ENTRY cl_int CL_API_CALL
 
																 soclEnqueueMarker(cl_command_queue  cq,
															
 
																                 cl_event *          event) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-   if (event == NULL)
															
 
																-      return CL_INVALID_VALUE;
															
 
																+	if (event == NULL)
															
 
																+		return CL_INVALID_VALUE;
															
 
																+	
															
 
																+	command_marker cmd = command_marker_create();
															
 
																-   starpu_task * task = task_create(CL_COMMAND_MARKER);
															
 
																-   *event = task_event(task);
															
 
																+	command_queue_enqueue(cq, cmd, 0, NULL);
															
 
																-   return command_queue_enqueue(cq, task, 0, 0, NULL);
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+cl_int command_marker_submit(command_marker cmd) {
															
 
																+	struct starpu_task *task;
															
 
																+	task = task_create(CL_COMMAND_MARKER);
															
 
																+
															
 
																+	task_submit(task, cmd);
															
 
																 }
															
--- a/socl/src/cl_enqueuendrangekernel.c
+++ b/socl/src/cl_enqueuendrangekernel.c
@@ -16,81 +16,64 @@
 
																 #include "socl.h"
															
 
																-typedef struct running_kernel * running_kernel;
															
 
																-
															
 
																-struct running_kernel {
															
 
																-  cl_kernel kernel;
															
 
																-  cl_mem *buffers;
															
 
																-  unsigned int buffer_count;
															
 
																-  starpu_codelet *codelet;
															
 
																-  cl_uint work_dim;
															
 
																-  size_t * global_work_offset;
															
 
																-  size_t * global_work_size;
															
 
																-  size_t * local_work_size;
															
 
																-  /* Arguments */
															
 
																-  unsigned int arg_count;
															
 
																-  size_t *arg_size;
															
 
																-  enum kernel_arg_type  *arg_type;
															
 
																-  void  **arg_value;
															
 
																-};
															
 
																-static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
															
 
																-   running_kernel d;
															
 
																+void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
															
 
																+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
															
 
																+
															
 
																    cl_command_queue cq;
															
 
																    int wid;
															
 
																    cl_int err;
															
 
																-   d = (running_kernel)args;
															
 
																    wid = starpu_worker_get_id();
															
 
																    starpu_opencl_get_queue(wid, &cq);
															
 
																-   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, d->kernel->id);
															
 
																+   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);
															
 
																    int range = starpu_worker_get_range();
															
 
																    /* Set arguments */
															
 
																    {
															
 
																-      unsigned int i;
															
 
																-      int buf = 0;
															
 
																-      for (i=0; i<d->arg_count; i++) {
															
 
																-         switch (d->arg_type[i]) {
															
 
																-            case Null:
															
 
																-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], NULL);
															
 
																-               break;
															
 
																-            case Buffer: {
															
 
																-                  cl_mem mem;  
															
 
																-                  mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
															
 
																-                  err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], &mem);
															
 
																-                  buf++;
															
 
																-               }
															
 
																-               break;
															
 
																-            case Immediate:
															
 
																-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], d->arg_value[i]);
															
 
																-               break;
															
 
																-         }
															
 
																-         if (err != CL_SUCCESS) {
															
 
																-            DEBUG_CL("clSetKernelArg", err);
															
 
																-            DEBUG_ERROR("Aborting\n");
															
 
																-         }
															
 
																-      }
															
 
																+	   unsigned int i;
															
 
																+	   int buf = 0;
															
 
																+	   for (i=0; i<cmd->num_args; i++) {
															
 
																+		   switch (cmd->arg_types[i]) {
															
 
																+			   case Null:
															
 
																+				   err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
															
 
																+				   break;
															
 
																+			   case Buffer: {
															
 
																+						cl_mem mem;  
															
 
																+						mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
															
 
																+						err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
															
 
																+						buf++;
															
 
																+					}
															
 
																+					break;
															
 
																+			   case Immediate:
															
 
																+					err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
															
 
																+					break;
															
 
																+		   }
															
 
																+		   if (err != CL_SUCCESS) {
															
 
																+			   DEBUG_CL("clSetKernelArg", err);
															
 
																+			   DEBUG_ERROR("Aborting\n");
															
 
																+		   }
															
 
																+	   }
															
 
																    }
															
 
																    /* Calling Kernel */
															
 
																    cl_event event;
															
 
																-   err = clEnqueueNDRangeKernel(cq, d->kernel->cl_kernels[range], d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size, 0, NULL, &event);
															
 
																+   err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);
															
 
																    if (err != CL_SUCCESS) {
															
 
																-      ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
															
 
																-      DEBUG_CL("clEnqueueNDRangeKernel", err);
															
 
																-      DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
															
 
																-                d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size);
															
 
																-      DEBUG_MSG("Global work size: %ld %ld %ld\n", d->global_work_size[0],
															
 
																-            (d->work_dim > 1 ? d->global_work_size[1] : 1), (d->work_dim > 2 ? d->global_work_size[2] : 1)); 
															
 
																-      if (d->local_work_size != NULL)
															
 
																-         DEBUG_MSG("Local work size: %ld %ld %ld\n", d->local_work_size[0],
															
 
																-               (d->work_dim > 1 ? d->local_work_size[1] : 1), (d->work_dim > 2 ? d->local_work_size[2] : 1)); 
															
 
																-      ERROR_MSG("Aborting.\n");
															
 
																-      exit(1);
															
 
																+	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
															
 
																+	   DEBUG_CL("clEnqueueNDRangeKernel", err);
															
 
																+	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
															
 
																+			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
															
 
																+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
															
 
																+			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
															
 
																+	   if (cmd->local_work_size != NULL)
															
 
																+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
															
 
																+				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
															
 
																+	   ERROR_MSG("Aborting.\n");
															
 
																+	   exit(1);
															
 
																    }
															
 
																    /* Waiting for kernel to terminate */
															
@@ -99,219 +82,110 @@ static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
 
																 }
															
 
																 static void cleaning_task_callback(void *args) {
															
 
																-   running_kernel arg = (running_kernel)args;
															
 
																+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
															
 
																-   free(arg->arg_size);
															
 
																-   free(arg->arg_type);
															
 
																-
															
 
																-   unsigned int i;
															
 
																-   for (i=0; i<arg->arg_count; i++) {
															
 
																-      free(arg->arg_value[i]);
															
 
																-   }
															
 
																-   free(arg->arg_value);
															
 
																+	free(cmd->arg_sizes);
															
 
																+	free(cmd->arg_types);
															
 
																-   for (i=0; i<arg->buffer_count; i++)
															
 
																-      gc_entity_unstore(&arg->buffers[i]);
															
 
																+	unsigned int i;
															
 
																+	for (i=0; i<cmd->num_args; i++) {
															
 
																+		free(cmd->args[i]);
															
 
																+	}
															
 
																+	free(cmd->args);
															
 
																-   gc_entity_unstore(&arg->kernel);
															
 
																+	for (i=0; i<cmd->num_buffers; i++)
															
 
																+		gc_entity_unstore(&cmd->buffers[i]);
															
 
																-   free(arg->buffers);
															
 
																-   free(arg->global_work_offset);
															
 
																-   free(arg->global_work_size);
															
 
																-   free(arg->local_work_size);
															
 
																-   void * co = arg->codelet;
															
 
																-   arg->codelet = NULL;
															
 
																-   free(co);
															
 
																+	free(cmd->buffers);
															
 
																+	void * co = cmd->codelet;
															
 
																+	cmd->codelet = NULL;
															
 
																+	free(co);
															
 
																 }
															
 
																 static struct starpu_perfmodel_t perf_model = {
															
 
																-  .type = STARPU_HISTORY_BASED,
															
 
																-  .symbol = "perf_model"
															
 
																+	.type = STARPU_HISTORY_BASED,
															
 
																+	.symbol = "perf_model"
															
 
																 };
															
 
																 /**
															
 
																  * Real kernel enqueuing command
															
 
																  */
															
 
																-cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
															
 
																-
															
 
																-   struct starpu_task *task;
															
 
																-   running_kernel arg;
															
 
																-   starpu_codelet *codelet;
															
 
																-   cl_event ev;
															
 
																-   
															
 
																-   /* Alias struc fields */
															
 
																-   cl_command_queue cq = n->cq;
															
 
																-   cl_kernel        kernel = n->kernel;
															
 
																-   cl_uint          work_dim = n->work_dim;
															
 
																-   const size_t *   global_work_offset = n->global_work_offset;
															
 
																-   const size_t *   global_work_size = n->global_work_size;
															
 
																-   const size_t *   local_work_size = n->local_work_size;
															
 
																-   cl_uint          num_events = n->num_events;
															
 
																-   const cl_event * events = n->events;
															
 
																-   cl_event *       event = n->event;
															
 
																-   char 	    is_task = n->is_task;
															
 
																-
															
 
																-
															
 
																-   /* Allocate structures */
															
 
																-
															
 
																-   /* Codelet */
															
 
																-   codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
															
 
																-   if (codelet == NULL)
															
 
																-      return CL_OUT_OF_HOST_MEMORY;
															
 
																-
															
 
																-   /* Codelet arguments */
															
 
																-   arg = (running_kernel)malloc(sizeof(struct running_kernel));
															
 
																-   if (arg == NULL) {
															
 
																-      free(codelet);
															
 
																-      return CL_OUT_OF_HOST_MEMORY;
															
 
																-   }
															
 
																-
															
 
																-   /* StarPU task */
															
 
																-   task = task_create(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL);
															
 
																-   ev = task_event(task);
															
 
																-
															
 
																-   /*******************
															
 
																-    * Initializations *
															
 
																-    *******************/
															
 
																-
															
 
																-   /* ------- *
															
 
																-    * Codelet *
															
 
																-    * ------- */
															
 
																-   codelet->where = STARPU_OPENCL;
															
 
																-   codelet->power_model = NULL;
															
 
																-   codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
															
 
																-   //codelet->model = NULL;
															
 
																-   codelet->model = &perf_model;
															
 
																-
															
 
																-   /* ---------------- *
															
 
																-    * Codelet argument *
															
 
																-    * ---------------- */
															
 
																-   gc_entity_store(&arg->kernel, kernel);
															
 
																-   arg->work_dim = work_dim;
															
 
																-   arg->codelet = codelet;
															
 
																-
															
 
																-   /* Global work offset */
															
 
																-   if (global_work_offset != NULL) {
															
 
																-      arg->global_work_offset = (size_t*)malloc(sizeof(size_t)*work_dim);
															
 
																-      memcpy(arg->global_work_offset, global_work_offset, work_dim*sizeof(size_t));
															
 
																-   }
															
 
																-   else arg->global_work_offset = NULL;
															
 
																-
															
 
																-   /* Global work size */
															
 
																-   arg->global_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
															
 
																-   memcpy(arg->global_work_size, global_work_size, work_dim*sizeof(size_t));
															
 
																-
															
 
																-   /* Local work size */
															
 
																-   if (local_work_size != NULL) {
															
 
																-      arg->local_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
															
 
																-      memcpy(arg->local_work_size, local_work_size, work_dim*sizeof(size_t));
															
 
																-   }
															
 
																-   else arg->local_work_size = NULL;
															
 
																-
															
 
																-   /* ----------- *
															
 
																-    * StarPU task *
															
 
																-    * ----------- */
															
 
																-   task->cl = codelet;
															
 
																-   task->cl_arg = arg;
															
 
																-   task->cl_arg_size = sizeof(struct running_kernel);
															
 
																-
															
 
																-   /* Convert OpenCL's memory objects to StarPU buffers */
															
 
																-   codelet->nbuffers = 0;
															
 
																-   {
															
 
																-      arg->buffers = malloc(sizeof(cl_mem) * kernel->arg_count);
															
 
																-      arg->buffer_count = 0;
															
 
																-
															
 
																-      unsigned int i;
															
 
																-      for (i=0; i<kernel->arg_count; i++) {
															
 
																-         if (kernel->arg_type[i] == Buffer) {
															
 
																-
															
 
																-            cl_mem buf = (cl_mem)kernel->arg_value[i];
															
 
																-
															
 
																-            /* We save cl_mem references in order to properly release them after kernel termination */
															
 
																-            gc_entity_store(&arg->buffers[arg->buffer_count], buf);
															
 
																-            arg->buffer_count += 1;
															
 
																-
															
 
																-            codelet->nbuffers++;
															
 
																-            task->buffers[codelet->nbuffers-1].handle = buf->handle;
															
 
																-
															
 
																-            /* Determine best StarPU buffer access mode */
															
 
																-            int mode;
															
 
																-            if (buf->mode == CL_MEM_READ_ONLY)
															
 
																-               mode = STARPU_R;
															
 
																-            else if (buf->mode == CL_MEM_WRITE_ONLY) {
															
 
																-               mode = STARPU_W;
															
 
																-               buf->scratch = 0;
															
 
																-            }
															
 
																-            else if (buf->scratch) { //RW but never accessed in RW or W mode
															
 
																-               mode = STARPU_W;
															
 
																-               buf->scratch = 0;
															
 
																-            }
															
 
																-            else {
															
 
																-               mode = STARPU_RW;
															
 
																-               buf->scratch = 0;
															
 
																-            }
															
 
																-            task->buffers[codelet->nbuffers-1].mode = mode; 
															
 
																-         }
															
 
																-      }
															
 
																-   }
															
 
																-
															
 
																-   /* Copy arguments as kernel args can be modified by the time we launch the kernel */
															
 
																-   {
															
 
																-      arg->arg_count = kernel->arg_count;
															
 
																-      arg->arg_size = malloc(sizeof(size_t) * kernel->arg_count);
															
 
																-      memcpy(arg->arg_size, kernel->arg_size, sizeof(size_t) * kernel->arg_count);
															
 
																-      arg->arg_type = malloc(sizeof(enum kernel_arg_type) * kernel->arg_count);
															
 
																-      memcpy(arg->arg_type, kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->arg_count);
															
 
																-      arg->arg_value = malloc(sizeof(void*) * kernel->arg_count);
															
 
																-      unsigned int i;
															
 
																-      for (i=0; i<kernel->arg_count; i++) {
															
 
																-         if (kernel->arg_value[i] != NULL) {
															
 
																-           arg->arg_value[i] = malloc(arg->arg_size[i]);
															
 
																-           memcpy(arg->arg_value[i], kernel->arg_value[i], arg->arg_size[i]);
															
 
																-         }
															
 
																-         else arg->arg_value[i] = NULL;
															
 
																-      }
															
 
																-   }
															
 
																-
															
 
																-   DEBUG_MSG("Submitting NDRange task (event %d)\n", ev->id);
															
 
																-
															
 
																-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
															
 
																-
															
 
																-   /* Enqueue a cleaning task */
															
 
																-   starpu_task * cleaning_task = task_create_cpu(0, cleaning_task_callback, arg,1);
															
 
																-   cl_event cleaning_event = task_event(cleaning_task);
															
 
																-   command_queue_enqueue(cq, cleaning_task, 0, 1, &ev);
															
 
																+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
															
 
																+
															
 
																+	starpu_task task = task_create();
															
 
																+	task->cl = cmd->codelet;
															
 
																+	task->cl_arg = cmd;
															
 
																+	task->cl_arg_size = sizeof(cmd);
															
 
																+
															
 
																+	starpu_codelet * codelet = cmd->codelet;
															
 
																+
															
 
																+	/* We need to detect which parameters are OpenCL's memory objects and
															
 
																+	 * we retrieve their corresponding StarPU buffers */
															
 
																+	cmd->num_buffers = 0;
															
 
																+	cmd->buffers = malloc(sizeof(cl_mem) * cmd->num_args);
															
 
																+
															
 
																+	unsigned int i;
															
 
																+	for (i=0; i<cmd->num_args; i++) {
															
 
																+		if (cmd->arg_types[i] == Buffer) {
															
 
																+
															
 
																+			cl_mem buf = *(cl_mem*)cmd->args[i];
															
 
																+
															
 
																+			gc_entity_store(&cmd->buffers[cmd->num_buffers], buf);
															
 
																+			task->buffers[cmd->num_buffers].handle = buf->handle;
															
 
																+
															
 
																+			/* Determine best StarPU buffer access mode */
															
 
																+			int mode;
															
 
																+			if (buf->mode == CL_MEM_READ_ONLY)
															
 
																+				mode = STARPU_R;
															
 
																+			else if (buf->mode == CL_MEM_WRITE_ONLY) {
															
 
																+				mode = STARPU_W;
															
 
																+				buf->scratch = 0;
															
 
																+			}
															
 
																+			else if (buf->scratch) { //RW but never accessed in RW or W mode
															
 
																+				mode = STARPU_W;
															
 
																+				buf->scratch = 0;
															
 
																+			}
															
 
																+			else {
															
 
																+				mode = STARPU_RW;
															
 
																+				buf->scratch = 0;
															
 
																+			}
															
 
																+			task->buffers[cmd->num_buffers].mode = mode; 
															
 
																+
															
 
																+			cmd->num_buffers += 1;
															
 
																+		}
															
 
																+	}
															
 
																+	codelet->nbuffers = cmd->num_buffers;
															
 
																+
															
 
																+	task_submit(task, cmd);
															
 
																+
															
 
																+	/* Enqueue a cleaning task */
															
 
																+	//FIXME: execute this in the callback?
															
 
																+	starpu_task cleaning_task = task_create_cpu(cleaning_task_callback, cmd,1);
															
 
																+	cl_event ev = command_event_get(cmd);
															
 
																+	task_depends_on(cleaning_task, 1, &ev);
															
 
																+	task_submit(cleaning_task, cmd);
															
 
																-   gc_entity_release(cleaning_event);
															
 
																-  
															
 
																-   RETURN_EVENT(ev, event);
															
 
																-
															
 
																-   return ret;
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
 
																-/**
															
 
																- * Virtual kernel enqueueing command
															
 
																- */
															
 
																+
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueNDRangeKernel(cl_command_queue cq,
															
 
																-                       cl_kernel        kernel,
															
 
																-                       cl_uint          work_dim,
															
 
																-                       const size_t *   global_work_offset,
															
 
																-                       const size_t *   global_work_size,
															
 
																-                       const size_t *   local_work_size,
															
 
																-                       cl_uint          num_events,
															
 
																-                       const cl_event * events,
															
 
																-                       cl_event *       event) CL_API_SUFFIX__VERSION_1_1
															
 
																+		cl_kernel        kernel,
															
 
																+		cl_uint          work_dim,
															
 
																+		const size_t *   global_work_offset,
															
 
																+		const size_t *   global_work_size,
															
 
																+		const size_t *   local_work_size,
															
 
																+		cl_uint          num_events,
															
 
																+		const cl_event * events,
															
 
																+		cl_event *       event) CL_API_SUFFIX__VERSION_1_1
															
 
																 {
															
 
																-	node_enqueue_kernel n;
															
 
																+	command_ndrange_kernel cmd = command_ndrange_kernel_create(kernel, work_dim,
															
 
																+			global_work_offset, global_work_size, local_work_size);
															
 
																+
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-	n = graph_create_enqueue_kernel(0, cq, kernel, work_dim, global_work_offset, global_work_size,
															
 
																-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
															
 
																-		kernel->arg_type, kernel->arg_value);
															
 
																-	
															
 
																-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
															
 
																-	node_play_enqueue_kernel(n);
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																-	//graph_store(n);
															
 
																 	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueuereadbuffer.c
+++ b/socl/src/cl_enqueuereadbuffer.c
@@ -66,6 +66,37 @@ static starpu_codelet codelet_readbuffer = {
 
																    .nbuffers = 1
															
 
																 };
															
 
																+cl_int command_read_buffer_submit(command_read_buffer cmd) {
															
 
																+	/* Aliases */
															
 
																+	cl_mem buffer = cmd->buffer;
															
 
																+	size_t offset = cmd->offset;
															
 
																+	size_t cb = cmd->cb;
															
 
																+	void * ptr = cmd->ptr;
															
 
																+
															
 
																+	struct starpu_task *task;
															
 
																+	struct arg_readbuffer *arg;
															
 
																+
															
 
																+	task = task_create(CL_COMMAND_READ_BUFFER);
															
 
																+
															
 
																+	task->buffers[0].handle = buffer->handle;
															
 
																+	task->buffers[0].mode = STARPU_R;
															
 
																+	task->cl = &codelet_readbuffer;
															
 
																+
															
 
																+	arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
															
 
																+	arg->offset = offset;
															
 
																+	arg->cb = cb;
															
 
																+	arg->ptr = ptr;
															
 
																+	task->cl_arg = arg;
															
 
																+	task->cl_arg_size = sizeof(struct arg_readbuffer);
															
 
																+
															
 
																+	gc_entity_store(&arg->buffer, buffer);
															
 
																+
															
 
																+	task_submit(task, cmd);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueReadBuffer(cl_command_queue  cq,
															
 
																                     cl_mem              buffer,
															
@@ -77,33 +108,14 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
 
																                     const cl_event *    events,
															
 
																                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
															
 
																 { 
															
 
																-   struct starpu_task *task;
															
 
																-   struct arg_readbuffer *arg;
															
 
																-   cl_event ev;
															
 
																-
															
 
																-   task = task_create(CL_COMMAND_READ_BUFFER);
															
 
																-   ev = task_event(task);
															
 
																-
															
 
																-   task->buffers[0].handle = buffer->handle;
															
 
																-   task->buffers[0].mode = STARPU_R;
															
 
																-   task->cl = &codelet_readbuffer;
															
 
																-
															
 
																-   arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
															
 
																-   arg->offset = offset;
															
 
																-   arg->cb = cb;
															
 
																-   arg->ptr = ptr;
															
 
																-   task->cl_arg = arg;
															
 
																-   task->cl_arg_size = sizeof(struct arg_readbuffer);
															
 
																-
															
 
																-   gc_entity_store(&arg->buffer, buffer);
															
 
																-   task->synchronous = (blocking == CL_TRUE);
															
 
																+	command_read_buffer cmd = command_read_buffer_create(buffer, offset, cb, ptr);
															
 
																-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																-   RETURN_EVENT(ev, event);
															
 
																+	MAY_BLOCK(blocking);
															
 
																-   return ret;
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueuetask.c
+++ b/socl/src/cl_enqueuetask.c
@@ -16,14 +16,6 @@
 
																 #include "socl.h"
															
 
																-static cl_uint work_dim = 3;
															
 
																-static const size_t global_work_offset[3] = {0,0,0};
															
 
																-static const size_t global_work_size[3] = {1,1,1};
															
 
																-static const size_t * local_work_size = NULL;
															
 
																-
															
 
																-CL_API_ENTRY cl_int CL_API_CALL
															
 
																-soclEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) CL_API_SUFFIX__VERSION_1_0;
															
 
																-
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueTask(cl_command_queue cq,
															
 
																               cl_kernel         kernel,
															
@@ -31,15 +23,11 @@ soclEnqueueTask(cl_command_queue cq,
 
																               const cl_event *  events,
															
 
																               cl_event *        event) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-	node_enqueue_kernel n;
															
 
																-
															
 
																-	n = graph_create_enqueue_kernel(1, cq, kernel, work_dim, global_work_offset, global_work_size,
															
 
																-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
															
 
																-		kernel->arg_type, kernel->arg_value);
															
 
																+	command_ndrange_kernel cmd = command_task_create(kernel);
															
 
																-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
															
 
																-	node_play_enqueue_kernel(n);
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																+
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																-	//graph_store(n);
															
 
																 	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueueunmapmemobject.c
+++ b/socl/src/cl_enqueueunmapmemobject.c
@@ -16,27 +16,31 @@
 
																 #include "socl.h"
															
 
																+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd) {
															
 
																+	/* Aliases */
															
 
																+	cl_mem buffer = cmd->buffer;
															
 
																+
															
 
																+	//FIXME: use a callback
															
 
																+	starpu_task task = task_create_cpu((void(*)(void*))starpu_data_release, buffer->handle, 0);
															
 
																+
															
 
																+	task_submit(task, cmd);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueUnmapMemObject(cl_command_queue cq,
															
 
																-                        cl_mem            memobj,
															
 
																-                        void *            UNUSED(mapped_ptr),
															
 
																+                        cl_mem            buffer,
															
 
																+                        void *            ptr,
															
 
																                         cl_uint           num_events,
															
 
																                         const cl_event *  events,
															
 
																                         cl_event *        event) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-   struct starpu_task *task;
															
 
																-   cl_int err;
															
 
																-   cl_event ev;
															
 
																-
															
 
																-   /* Create StarPU task */
															
 
																-   task = task_create_cpu(CL_COMMAND_UNMAP_MEM_OBJECT, (void(*)(void*))starpu_data_release, memobj->handle, 0);
															
 
																-   ev = task_event(task);
															
 
																-
															
 
																-   DEBUG_MSG("Submitting UnmapBuffer task (event %d)\n", task->tag_id);
															
 
																+	command_unmap_mem_object cmd = command_unmap_mem_object_create(buffer, ptr);
															
 
																-   err = command_queue_enqueue(cq, task, 0, num_events, events);
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-   RETURN_EVENT(ev, event);
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																-   return err;
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueuewaitforevents.c
+++ b/socl/src/cl_enqueuewaitforevents.c
@@ -22,10 +22,9 @@ soclEnqueueWaitForEvents(cl_command_queue cq,
 
																                        const cl_event * events) CL_API_SUFFIX__VERSION_1_0
															
 
																 {
															
 
																-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_WAIT_FOR_EVENTS doesn't exist
															
 
																-   starpu_task * task = task_create(CL_COMMAND_MARKER);
															
 
																+	command_marker cmd = command_marker_create();
															
 
																-   command_queue_enqueue(cq, task, 0, num_events, events);
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-   return CL_SUCCESS;
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_enqueuewritebuffer.c
+++ b/socl/src/cl_enqueuewritebuffer.c
@@ -68,6 +68,43 @@ static starpu_codelet codelet_writebuffer = {
 
																    .nbuffers = 1
															
 
																 };
															
 
																+cl_int command_write_buffer_submit(command_write_buffer cmd) {
															
 
																+	/* Aliases */
															
 
																+	cl_mem buffer = cmd->buffer;
															
 
																+	size_t offset = cmd->offset;
															
 
																+	size_t cb = cmd->cb;
															
 
																+	const void * ptr = cmd->ptr;
															
 
																+
															
 
																+	struct starpu_task *task;
															
 
																+	struct arg_writebuffer *arg;
															
 
																+
															
 
																+	task = task_create(CL_COMMAND_WRITE_BUFFER);
															
 
																+
															
 
																+	task->buffers[0].handle = buffer->handle;
															
 
																+	//If only a subpart of the buffer is written, RW access mode is required
															
 
																+	if (cb != buffer->size)
															
 
																+		task->buffers[0].mode = STARPU_RW;
															
 
																+	else 
															
 
																+		task->buffers[0].mode = STARPU_W;
															
 
																+	task->cl = &codelet_writebuffer;
															
 
																+
															
 
																+	arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
															
 
																+	arg->offset = offset;
															
 
																+	arg->cb = cb;
															
 
																+	arg->ptr = ptr;
															
 
																+	task->cl_arg = arg;
															
 
																+	task->cl_arg_size = sizeof(struct arg_writebuffer);
															
 
																+
															
 
																+	gc_entity_store(&arg->buffer, buffer);
															
 
																+
															
 
																+	//The buffer now contains meaningful data
															
 
																+	arg->buffer->scratch = 0;
															
 
																+
															
 
																+	task_submit(task, cmd);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclEnqueueWriteBuffer(cl_command_queue cq, 
															
 
																                      cl_mem             buffer, 
															
@@ -79,41 +116,13 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
 
																                      const cl_event *   events, 
															
 
																                      cl_event *         event) CL_API_SUFFIX__VERSION_1_0
															
 
																 { 
															
 
																-   struct starpu_task *task;
															
 
																-   struct arg_writebuffer *arg;
															
 
																-   cl_event ev;
															
 
																-
															
 
																-   task = task_create(CL_COMMAND_WRITE_BUFFER);
															
 
																-   ev = task_event(task);
															
 
																-
															
 
																-   task->buffers[0].handle = buffer->handle;
															
 
																-   //If only a subpart of the buffer is written, RW access mode is required
															
 
																-   if (cb != buffer->size)
															
 
																-      task->buffers[0].mode = STARPU_RW;
															
 
																-   else 
															
 
																-      task->buffers[0].mode = STARPU_W;
															
 
																-   task->cl = &codelet_writebuffer;
															
 
																-
															
 
																-   arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
															
 
																-   arg->offset = offset;
															
 
																-   arg->cb = cb;
															
 
																-   arg->ptr = ptr;
															
 
																-   task->cl_arg = arg;
															
 
																-   task->cl_arg_size = sizeof(struct arg_writebuffer);
															
 
																-
															
 
																-   gc_entity_store(&arg->buffer, buffer);
															
 
																-
															
 
																-   //The buffer now contains meaningful data
															
 
																-   arg->buffer->scratch = 0;
															
 
																-
															
 
																-   task->synchronous = (blocking == CL_TRUE);
															
 
																+	command_write_buffer cmd = command_write_buffer_create(buffer, offset, cb, ptr);
															
 
																-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
															
 
																+	command_queue_enqueue(cq, cmd, num_events, events);
															
 
																-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events,events);
															
 
																+	RETURN_EVENT(cmd, event);
															
 
																-   /* Return retained event if required by user */
															
 
																-   RETURN_EVENT(ev,event);
															
 
																+	MAY_BLOCK(blocking);
															
 
																-   return ret;
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_finish.c
+++ b/socl/src/cl_finish.c
@@ -17,11 +17,14 @@
 
																 #include "socl.h"
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																-soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
															
 
																-{
															
 
																-   cl_event ev = enqueueBarrier(cq);
															
 
																-   soclWaitForEvents(1, &ev);
															
 
																-   gc_entity_release(ev);
															
 
																+soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0 {
															
 
																-   return CL_SUCCESS;
															
 
																+	command_marker cmd = command_barrier_create();
															
 
																+
															
 
																+	command_queue_enqueue(cq, cmd, 0, NULL);
															
 
																+		cl_event ev = command_event_get(cmd);
															
 
																+
															
 
																+	MAY_BLOCK(CL_TRUE)
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
--- a/socl/src/cl_geteventinfo.c
+++ b/socl/src/cl_geteventinfo.c
@@ -17,6 +17,7 @@
 
																 #include "socl.h"
															
 
																 #include "getinfo.h"
															
 
																+
															
 
																 CL_API_ENTRY cl_int CL_API_CALL
															
 
																 soclGetEventInfo(cl_event       event,
															
 
																                cl_event_info    param_name,
															
@@ -33,7 +34,7 @@ soclGetEventInfo(cl_event       event,
 
																    switch (param_name) {
															
 
																       INFO_CASE(CL_EVENT_COMMAND_QUEUE, event->cq);
															
 
																-      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->type);
															
 
																+      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->command->typ);
															
 
																       INFO_CASE(CL_EVENT_COMMAND_EXECUTION_STATUS, event->status);
															
 
																       INFO_CASE(CL_EVENT_REFERENCE_COUNT, event->_entity.refs);
															
 
																       default:
															
--- a/socl/src/cl_getkernelinfo.c
+++ b/socl/src/cl_getkernelinfo.c
@@ -29,7 +29,7 @@ soclGetKernelInfo(cl_kernel       kernel,
 
																    switch (param_name) {
															
 
																       INFO_CASE_EX(CL_KERNEL_FUNCTION_NAME, kernel->kernel_name, strlen(kernel->kernel_name)+1)
															
 
																-      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->arg_count)
															
 
																+      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->num_args)
															
 
																       INFO_CASE(CL_KERNEL_REFERENCE_COUNT, kernel->_entity.refs)
															
 
																       INFO_CASE(CL_KERNEL_PROGRAM, kernel->program)
															
 
																       INFO_CASE(CL_KERNEL_CONTEXT, kernel->program->context)
															
--- a/socl/src/cl_setkernelarg.c
+++ b/socl/src/cl_setkernelarg.c
@@ -25,7 +25,7 @@ soclSetKernelArg(cl_kernel  kernel,
 
																    if (kernel == NULL)
															
 
																       return CL_INVALID_KERNEL;
															
 
																-   if (arg_index >= kernel->arg_count)
															
 
																+   if (arg_index >= kernel->num_args)
															
 
																       return CL_INVALID_ARG_INDEX;
															
 
																    //FIXME: we don't return CL_INVALID_ARG_VALUE if "arg_value is NULL for an argument that is not declared with __local qualifier or vice-versa"
															
@@ -38,7 +38,8 @@ soclSetKernelArg(cl_kernel  kernel,
 
																          break;
															
 
																       case Buffer:
															
 
																          kernel->arg_type[arg_index] = Null;
															
 
																-         gc_entity_unstore((cl_mem*)&kernel->arg_value[arg_index]);
															
 
																+         gc_entity_unstore((cl_mem*)kernel->arg_value[arg_index]);
															
 
																+	 free(kernel->arg_value[arg_index]);
															
 
																          kernel->arg_value[arg_index] = NULL;
															
 
																          break;
															
 
																       case Immediate:
															
@@ -60,7 +61,8 @@ soclSetKernelArg(cl_kernel  kernel,
 
																       if ((arg_size == sizeof(cl_mem)) && ((buf = mem_object_fetch(arg_value)) != NULL)) {
															
 
																          DEBUG_MSG("Found buffer %d \n", buf->id);
															
 
																          kernel->arg_type[arg_index] = Buffer;
															
 
																-         gc_entity_store(&kernel->arg_value[arg_index], buf);
															
 
																+         kernel->arg_value[arg_index] = malloc(sizeof(void*));
															
 
																+	 gc_entity_store((cl_mem*)kernel->arg_value[arg_index], buf);
															
 
																       }
															
 
																       else {
															
 
																          /* Argument must be an immediate buffer  */
															
--- a/socl/src/cl_waitforevents.c
+++ b/socl/src/cl_waitforevents.c
@@ -23,7 +23,12 @@ soclWaitForEvents(cl_uint           num_events,
 
																    unsigned int i;
															
 
																    DEBUG_MSG("Waiting for events: ");
															
 
																    for (i=0; i<num_events; i++) {
															
 
																-      DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
															
 
																+   	command_graph_dump(event_list[i]->command);
															
 
																+
															
 
																+   	/* We need to submit commands if it's not already done */
															
 
																+	command_submit_deep(event_list[i]->command);
															
 
																+
															
 
																+      	DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
															
 
																    }
															
 
																    DEBUG_MSG_NOHEAD("\n");
															
--- a/socl/src/command.c
+++ b/socl/src/command.c
@@ -0,0 +1,235 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "socl.h"
															
 
																+
															
 
																+void command_init_ex(cl_command cmd, cl_command_type typ) {
															
 
																+	cmd->typ = typ;
															
 
																+	cmd->num_events = 0;
															
 
																+	cmd->events = NULL;
															
 
																+	cmd->event = event_create();
															
 
																+	cmd->event->command = cmd;
															
 
																+	cmd->cq = NULL;
															
 
																+	cmd->task = NULL;
															
 
																+	cmd->submitted = 0;
															
 
																+}
															
 
																+
															
 
																+
															
 
																+void command_submit_ex(cl_command cmd) {
															
 
																+#define SUBMIT(typ,name) case typ:\
															
 
																+	name##_submit((name)cmd);\
															
 
																+	break;
															
 
																+
															
 
																+	assert(cmd->submitted == 0);
															
 
																+
															
 
																+	switch(cmd->typ) {
															
 
																+		SUBMIT(CL_COMMAND_NDRANGE_KERNEL, command_ndrange_kernel)
															
 
																+		SUBMIT(CL_COMMAND_TASK, command_ndrange_kernel)
															
 
																+		SUBMIT(CL_COMMAND_READ_BUFFER, command_read_buffer)
															
 
																+		SUBMIT(CL_COMMAND_WRITE_BUFFER, command_write_buffer)
															
 
																+		SUBMIT(CL_COMMAND_COPY_BUFFER, command_copy_buffer)
															
 
																+		SUBMIT(CL_COMMAND_MAP_BUFFER, command_map_buffer)
															
 
																+		SUBMIT(CL_COMMAND_UNMAP_MEM_OBJECT, command_unmap_mem_object)
															
 
																+		SUBMIT(CL_COMMAND_MARKER, command_marker)
															
 
																+		default:
															
 
																+			ERROR_STOP("Trying to submit unknown command (type %x)", cmd->typ);
															
 
																+	}
															
 
																+
															
 
																+	cmd->submitted = 1;
															
 
																+#undef SUBMIT
															
 
																+}
															
 
																+
															
 
																+cl_int command_submit_deep_ex(cl_command cmd) {
															
 
																+	if (cmd->submitted == 1)
															
 
																+		return CL_SUCCESS;
															
 
																+	
															
 
																+	/* We set this in order to avoid cyclic dependencies */
															
 
																+	cmd->submitted = 1;
															
 
																+
															
 
																+	unsigned int i;
															
 
																+	for (i=0; i<cmd->num_events; i++)
															
 
																+		command_submit_deep(cmd->events[i]->command);
															
 
																+	
															
 
																+	cmd->submitted = 0;
															
 
																+
															
 
																+	command_submit_ex(cmd);
															
 
																+
															
 
																+	return CL_SUCCESS;
															
 
																+}
															
 
																+
															
 
																+void command_graph_dump_ex(cl_command cmd) {
															
 
																+
															
 
																+	unsigned int i;
															
 
																+	for (i=0; i<cmd->num_events; i++)
															
 
																+		command_graph_dump_ex(cmd->events[i]->command);
															
 
																+
															
 
																+	printf("CMD %lx TYPE %d DEPS", cmd, cmd->typ);
															
 
																+	for (i=0; i<cmd->num_events; i++)
															
 
																+		printf(" %lx", cmd->events[i]->command);
															
 
																+	printf("\n");
															
 
																+
															
 
																+}
															
 
																+
															
 
																+#define nullOrDup(name,size) cmd->name = memdup_safe(name,size)
															
 
																+#define dup(name) cmd->name = name
															
 
																+#define dupEntity(name) do { cmd->name = name; gc_entity_retain(name); } while (0);
															
 
																+
															
 
																+void soclEnqueueNDRangeKernel_task(void *descr[], void *args);
															
 
																+
															
 
																+command_ndrange_kernel command_ndrange_kernel_create (
															
 
																+		cl_kernel        kernel,
															
 
																+		cl_uint          work_dim,
															
 
																+		const size_t *   global_work_offset,
															
 
																+		const size_t *   global_work_size,
															
 
																+		const size_t *   local_work_size)
															
 
																+{
															
 
																+	command_ndrange_kernel cmd = malloc(sizeof(struct command_ndrange_kernel_t));
															
 
																+	command_init(cmd, CL_COMMAND_NDRANGE_KERNEL);
															
 
																+
															
 
																+	dupEntity(kernel);
															
 
																+	dup(work_dim);
															
 
																+	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
															
 
																+	nullOrDup(global_work_size, work_dim*sizeof(size_t));
															
 
																+	nullOrDup(local_work_size, work_dim*sizeof(size_t));
															
 
																+
															
 
																+   	/* Codelet */
															
 
																+   	cmd->codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
															
 
																+	starpu_codelet * codelet = cmd->codelet;
															
 
																+	codelet->where = STARPU_OPENCL;
															
 
																+	codelet->power_model = NULL;
															
 
																+	codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
															
 
																+	codelet->model = NULL;
															
 
																+
															
 
																+   	/* Kernel is mutable, so we duplicate its parameters... */
															
 
																+	cmd->num_args = kernel->num_args;
															
 
																+	cmd->arg_sizes = memdup(kernel->arg_size, sizeof(size_t) * kernel->num_args);
															
 
																+	cmd->arg_types = memdup(kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->num_args);
															
 
																+	cmd->args = memdup_deep_varsize_safe(kernel->arg_value, kernel->num_args, kernel->arg_size);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+command_ndrange_kernel command_task_create (cl_kernel kernel) {
															
 
																+
															
 
																+	static cl_uint task_work_dim = 3;
															
 
																+	static const size_t task_global_work_offset[3] = {0,0,0};
															
 
																+	static const size_t task_global_work_size[3] = {1,1,1};
															
 
																+	static const size_t * task_local_work_size = NULL;
															
 
																+
															
 
																+	command_ndrange_kernel cmd = command_ndrange_kernel_create(
															
 
																+			kernel, task_work_dim, task_global_work_offset,
															
 
																+			task_global_work_size, task_local_work_size);
															
 
																+
															
 
																+	/* This is the only difference with command_ndrange_kernel_create */
															
 
																+	cmd->_command.typ = CL_COMMAND_TASK;
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+command_marker command_barrier_create () {
															
 
																+
															
 
																+	command_marker cmd = malloc(sizeof(struct command_marker_t));
															
 
																+	command_init(cmd, CL_COMMAND_BARRIER);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Create a marker command.
															
 
																+ *
															
 
																+ * Allocates a command_marker_t and initializes it with the
															
 
																+ * CL_COMMAND_MARKER type. The result of malloc() is not checked.
															
 
																+ */
															
 
																+command_marker command_marker_create (void) {
															
 
																+
															
 
																+	command_marker cmd = malloc(sizeof(struct command_marker_t));
															
 
																+	command_init(cmd, CL_COMMAND_MARKER);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Create a CL_COMMAND_MAP_BUFFER command from the given arguments.
															
 
																+ *
															
 
																+ * dup() and dupEntity() are macros defined earlier in this file (not
															
 
																+ * visible here); presumably they store each argument in the field of
															
 
																+ * cmd that has the same name -- confirm against their definitions.
															
 
																+ */
															
 
																+command_map_buffer command_map_buffer_create(
															
 
																+		cl_mem buffer,
															
 
																+		cl_map_flags map_flags,
															
 
																+		size_t offset,
															
 
																+		size_t cb,
															
 
																+		cl_event event
															
 
																+		) {
															
 
																+
															
 
																+	/* NOTE(review): the result of malloc() is not checked */
															
 
																+	command_map_buffer cmd = malloc(sizeof(struct command_map_buffer_t));
															
 
																+	command_init(cmd, CL_COMMAND_MAP_BUFFER);
															
 
																+
															
 
																+	dupEntity(buffer);
															
 
																+	dup(map_flags);
															
 
																+	dup(offset);
															
 
																+	dup(cb);
															
 
																+	dupEntity(event);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Create a CL_COMMAND_UNMAP_MEM_OBJECT command for buffer and ptr.
															
 
																+ *
															
 
																+ * dup() and dupEntity() are macros defined earlier in this file (not
															
 
																+ * visible here); presumably they store each argument in the field of
															
 
																+ * cmd that has the same name -- confirm against their definitions.
															
 
																+ */
															
 
																+command_unmap_mem_object command_unmap_mem_object_create(cl_mem buffer, void * ptr) {
															
 
																+	/* NOTE(review): the result of malloc() is not checked */
															
 
																+	command_unmap_mem_object cmd = malloc(sizeof(struct command_unmap_mem_object_t));
															
 
																+	command_init(cmd, CL_COMMAND_UNMAP_MEM_OBJECT);
															
 
																+
															
 
																+	dupEntity(buffer);
															
 
																+	dup(ptr);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Create a CL_COMMAND_READ_BUFFER command from the given arguments.
															
 
																+ *
															
 
																+ * dup() and dupEntity() are macros defined earlier in this file (not
															
 
																+ * visible here); presumably they store each argument in the field of
															
 
																+ * cmd that has the same name -- confirm against their definitions.
															
 
																+ */
															
 
																+command_read_buffer command_read_buffer_create(cl_mem buffer, size_t offset, size_t cb, void * ptr) {
															
 
																+
															
 
																+	/* NOTE(review): the result of malloc() is not checked */
															
 
																+	command_read_buffer cmd = malloc(sizeof(struct command_read_buffer_t));
															
 
																+	command_init(cmd, CL_COMMAND_READ_BUFFER);
															
 
																+
															
 
																+	dupEntity(buffer);
															
 
																+	dup(offset);
															
 
																+	dup(cb);
															
 
																+	dup(ptr);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Create a CL_COMMAND_WRITE_BUFFER command from the given arguments.
															
 
																+ *
															
 
																+ * dup() and dupEntity() are macros defined earlier in this file (not
															
 
																+ * visible here); presumably they store each argument in the field of
															
 
																+ * cmd that has the same name -- confirm against their definitions.
															
 
																+ */
															
 
																+command_write_buffer command_write_buffer_create(cl_mem buffer, size_t offset, size_t cb, const void * ptr) {
															
 
																+
															
 
																+	/* NOTE(review): the result of malloc() is not checked */
															
 
																+	command_write_buffer cmd = malloc(sizeof(struct command_write_buffer_t));
															
 
																+	command_init(cmd, CL_COMMAND_WRITE_BUFFER);
															
 
																+
															
 
																+	dupEntity(buffer);
															
 
																+	dup(offset);
															
 
																+	dup(cb);
															
 
																+	dup(ptr);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Create a CL_COMMAND_COPY_BUFFER command from the given arguments.
															
 
																+ *
															
 
																+ * dup() and dupEntity() are macros defined earlier in this file (not
															
 
																+ * visible here); presumably they store each argument in the field of
															
 
																+ * cmd that has the same name -- confirm against their definitions.
															
 
																+ */
															
 
																+command_copy_buffer command_copy_buffer_create( cl_mem src_buffer, cl_mem dst_buffer,
															
 
																+		size_t src_offset, size_t dst_offset, size_t cb)
															
 
																+{
															
 
																+	/* NOTE(review): the result of malloc() is not checked */
															
 
																+	command_copy_buffer cmd = malloc(sizeof(struct command_copy_buffer_t));
															
 
																+	command_init(cmd, CL_COMMAND_COPY_BUFFER);
															
 
																+
															
 
																+	dupEntity(src_buffer);
															
 
																+	dupEntity(dst_buffer);
															
 
																+	dup(src_offset);
															
 
																+	dup(dst_offset);
															
 
																+	dup(cb);
															
 
																+
															
 
																+	return cmd;
															
 
																+}
															
 
																+
															
 
																+#undef nullOrDup
															
 
																+#undef nodeNullOrDup
															
 
																+#undef dup
															
 
																+#undef dupEntity
															
 
																+#undef nodeDup
															
 
																+#undef memdup
															
 
																+
															
--- a/socl/src/command.h
+++ b/socl/src/command.h
@@ -0,0 +1,198 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef SOCL_COMMANDS_H
															
 
																+#define SOCL_COMMANDS_H
															
 
																+
															
 
																+typedef struct cl_command_t * cl_command;
															
 
																+
															
 
																+/**
															
 
																+ * Initialize a command structure
															
 
																+ *
															
 
																+ * Command constructors for each kind of command use this method.
															
 
																+ * It only takes the command and its type: dependencies are not passed
															
 
																+ * here (command_queue_enqueue_ex() sets num_events/events on enqueue).
															
 
																+ *
															
 
																+ * The command_init() wrapper accepts any specialized command pointer
															
 
																+ * and casts it to the generic cl_command.
															
 
																+ */
															
 
																+void command_init_ex(cl_command cmd, cl_command_type typ);
															
 
																+#define command_init(cmd,typ) \
															
 
																+	command_init_ex((cl_command)(cmd),(typ))
															
 
																+
															
 
																+/** Submit a command for execution */
															
 
																+void command_submit_ex(cl_command cmd);
															
 
																+#define command_submit(cmd) \
															
 
																+	command_submit_ex(&(cmd)->_command)
															
 
																+
															
 
																+/** Submit a command and its dependencies */
															
 
																+cl_int command_submit_deep_ex(cl_command cmd);
															
 
																+#define command_submit_deep(cmd) (command_submit_deep_ex((cl_command)(cmd)))
															
 
																+
															
 
																+/* The wrapper casts any specialized command pointer to cl_command */
															
 
																+void command_graph_dump_ex(cl_command cmd);
															
 
																+#define command_graph_dump(cmd) (command_graph_dump_ex((cl_command)(cmd)))
															
 
																+
															
 
																+/**************************
															
 
																+ * OpenCL Commands
															
 
																+ **************************/
															
 
																+struct cl_command_t {
															
 
																+	cl_command_type	typ;	 	/* Command type */
															
 
																+	cl_uint 	num_events;	/* Number of dependencies */
															
 
																+	cl_event * 	events;		/* Dependencies */
															
 
																+	cl_event  	event;		/* Event for this command */
															
 
																+	cl_command_queue cq;		/* Command queue the command is enqueued in */
															
 
																+	starpu_task	task;		/* Associated StarPU task, if any */
															
 
																+	char		submitted;	/* True if the command has been submitted to StarPU */
															
 
																+};
															
 
																+
															
 
																+/* Field accessors: cmd may be any specialized command pointer; it is
															
 
																+ * cast to the generic cl_command. The argument is parenthesized so that
															
 
																+ * the cast applies to the whole expression passed by the caller. */
															
 
																+#define command_type_get(cmd) (((cl_command)(cmd))->typ)
															
 
																+#define command_event_get(cmd) (((cl_command)(cmd))->event)
															
 
																+#define command_num_events_get(cmd) (((cl_command)(cmd))->num_events)
															
 
																+#define command_events_get(cmd) (((cl_command)(cmd))->events)
															
 
																+#define command_task_get(cmd) (((cl_command)(cmd))->task)
															
 
																+#define command_cq_get(cmd) (((cl_command)(cmd))->cq)
															
 
																+
															
 
																+#define CL_COMMAND struct cl_command_t _command;
															
 
																+
															
 
																+typedef struct command_ndrange_kernel_t {
															
 
																+	CL_COMMAND
															
 
																+
															
 
																+	cl_kernel        kernel;
															
 
																+	cl_uint          work_dim;
															
 
																+	const size_t *   global_work_offset;
															
 
																+	const size_t *   global_work_size;
															
 
																+	const size_t *   local_work_size;
															
 
																+	cl_uint 	 num_args;
															
 
																+	size_t *	 arg_sizes;
															
 
																+	enum kernel_arg_type * arg_types;
															
 
																+	void **		 args;
															
 
																+	starpu_codelet * codelet;
															
 
																+	cl_uint		 num_buffers;
															
 
																+	cl_mem *	 buffers;
															
 
																+} * command_ndrange_kernel;
															
 
																+
															
 
																+
															
 
																+typedef struct command_read_buffer_t {
															
 
																+	CL_COMMAND
															
 
																+	
															
 
																+	cl_mem buffer;
															
 
																+	size_t offset;
															
 
																+	size_t cb;
															
 
																+	void * ptr;
															
 
																+} * command_read_buffer;
															
 
																+
															
 
																+
															
 
																+typedef struct command_write_buffer_t {
															
 
																+	CL_COMMAND
															
 
																+
															
 
																+	cl_mem buffer;
															
 
																+	size_t offset;
															
 
																+	size_t cb;
															
 
																+	const void * ptr;
															
 
																+} * command_write_buffer;
															
 
																+
															
 
																+
															
 
																+typedef struct command_copy_buffer_t {
															
 
																+	CL_COMMAND
															
 
																+	
															
 
																+	cl_mem src_buffer;
															
 
																+	cl_mem dst_buffer;
															
 
																+	size_t src_offset;
															
 
																+	size_t dst_offset;
															
 
																+	size_t cb;
															
 
																+} * command_copy_buffer;
															
 
																+
															
 
																+
															
 
																+typedef struct command_map_buffer_t {
															
 
																+	CL_COMMAND
															
 
																+
															
 
																+	cl_mem buffer;
															
 
																+	cl_map_flags map_flags;
															
 
																+	size_t offset;
															
 
																+	size_t cb;
															
 
																+	cl_event event;
															
 
																+} * command_map_buffer;
															
 
																+
															
 
																+
															
 
																+typedef struct command_unmap_mem_object_t {
															
 
																+	CL_COMMAND
															
 
																+
															
 
																+	cl_mem buffer;
															
 
																+	void * ptr;
															
 
																+} * command_unmap_mem_object;
															
 
																+
															
 
																+
															
 
																+typedef struct command_marker_t {
															
 
																+	CL_COMMAND
															
 
																+} * command_marker;
															
 
																+
															
 
																+/*************************
															
 
																+ * Constructor functions
															
 
																+ *************************/
															
 
																+
															
 
																+command_ndrange_kernel command_ndrange_kernel_create (
															
 
																+		cl_kernel        kernel,
															
 
																+		cl_uint          work_dim,
															
 
																+		const size_t *   global_work_offset,
															
 
																+		const size_t *   global_work_size,
															
 
																+		const size_t *   local_work_size);
															
 
																+
															
 
																+command_ndrange_kernel command_task_create (cl_kernel kernel);
															
 
																+
															
 
																+/* Both constructors take no argument: declare them with (void) so that
															
 
																+ * the compiler rejects calls that pass arguments. */
															
 
																+command_marker command_barrier_create (void);
															
 
																+
															
 
																+command_marker command_marker_create (void);
															
 
																+
															
 
																+command_map_buffer command_map_buffer_create(
															
 
																+		cl_mem buffer,
															
 
																+		cl_map_flags map_flags,
															
 
																+		size_t offset,
															
 
																+		size_t cb,
															
 
																+		cl_event event);
															
 
																+
															
 
																+command_unmap_mem_object command_unmap_mem_object_create(
															
 
																+		cl_mem buffer,
															
 
																+		void * ptr);
															
 
																+
															
 
																+command_read_buffer command_read_buffer_create(
															
 
																+		cl_mem buffer,
															
 
																+		size_t offset,
															
 
																+		size_t cb,
															
 
																+		void * ptr);
															
 
																+
															
 
																+command_write_buffer command_write_buffer_create(
															
 
																+		cl_mem buffer,
															
 
																+		size_t offset,
															
 
																+		size_t cb,
															
 
																+		const void * ptr);
															
 
																+
															
 
																+command_copy_buffer command_copy_buffer_create(
															
 
																+		cl_mem src_buffer,
															
 
																+		cl_mem dst_buffer,
															
 
																+		size_t src_offset,
															
 
																+		size_t dst_offset,
															
 
																+		size_t cb);
															
 
																+
															
 
																+/*************************
															
 
																+ * Submit functions
															
 
																+ *************************/
															
 
																+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd);
															
 
																+cl_int command_read_buffer_submit(command_read_buffer cmd);
															
 
																+cl_int command_write_buffer_submit(command_write_buffer cmd);
															
 
																+cl_int command_copy_buffer_submit(command_copy_buffer cmd);
															
 
																+cl_int command_map_buffer_submit(command_map_buffer cmd);
															
 
																+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd);
															
 
																+cl_int command_marker_submit(command_marker cmd);
															
 
																+
															
 
																+
															
 
																+#endif /* SOCL_COMMANDS_H */
															
 
																+
															
--- a/socl/src/command_list.c
+++ b/socl/src/command_list.c
@@ -0,0 +1,40 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "socl.h"
															
 
																+
															
 
																+/**
															
 
																+ * Prepend a command to a command list.
															
 
																+ *
															
 
																+ * A new node is allocated for cmd and linked in front of ls (NULL for
															
 
																+ * an empty list). Returns the new head of the list.
															
 
																+ */
															
 
																+command_list command_list_cons(cl_command cmd, command_list ls) {
															
 
																+	command_list e = malloc(sizeof(struct command_list_t));
															
 
																+	e->cmd = cmd;
															
 
																+	e->next = ls;
															
 
																+	/* The new head has no predecessor. malloc() does not zero the
															
 
																+	 * node and command_list_remove() reads this field. */
															
 
																+	e->prev = NULL;
															
 
																+	if (ls != NULL)
															
 
																+		ls->prev = e;
															
 
																+	return e;
															
 
																+}
															
 
																+
															
 
																+/**
															
 
																+ * Remove and free every node of the list l that holds cmd.
															
 
																+ *
															
 
																+ * Returns the head of the resulting list (NULL if it became empty).
															
 
																+ */
															
 
																+command_list command_list_remove(command_list l, cl_command cmd) {
															
 
																+	command_list e = l;
															
 
																+	while (e != NULL) {
															
 
																+		/* Read the successor first: e may be freed below */
															
 
																+		command_list next = e->next;
															
 
																+		if (e->cmd == cmd) {
															
 
																+			if (e->prev != NULL) e->prev->next = next;
															
 
																+			if (next != NULL) next->prev = e->prev;
															
 
																+			/* Removing the head: the list now starts at its successor */
															
 
																+			if (e == l) l = next;
															
 
																+			free(e);
															
 
																+		}
															
 
																+		/* Always advance, whether or not the node matched */
															
 
																+		e = next;
															
 
																+	}
															
 
																+	return l;
															
 
																+}
															
--- a/socl/src/command_list.h
+++ b/socl/src/command_list.h
@@ -0,0 +1,28 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef SOCL_COMMAND_LIST_H
															
 
																+#define SOCL_COMMAND_LIST_H
															
 
																+
															
 
																+#include "socl.h"
															
 
																+
															
 
																+typedef struct command_list_t * command_list;
															
 
																+
															
 
																+/* Node of a doubly-linked list of commands */
															
 
																+struct command_list_t {
															
 
																+	cl_command cmd;		/* Command held by this node */
															
 
																+	command_list next;	/* Next node, NULL at the end of the list */
															
 
																+	command_list prev;	/* Previous node */
															
 
																+};
															
 
																+
															
 
																+/* Prepend cmd to the list ls; returns the new head */
															
 
																+command_list command_list_cons(cl_command cmd, command_list ls);
															
 
																+/* Remove cmd from the list l; returns the resulting head */
															
 
																+command_list command_list_remove(command_list l, cl_command cmd);
															
 
																+
															
 
																+#endif /* SOCL_COMMAND_LIST_H */
															
--- a/socl/src/command_queue.c
+++ b/socl/src/command_queue.c
@@ -24,74 +24,145 @@
 
																  * its command queue.
															
 
																  */
															
 
																+
															
 
																 /**
															
 
																- * Enqueue the given task but put fake_event into the command queue.
															
 
																- * This is used when a tag notified by application is used (cf clEnqueueMapBuffer, etc.)
															
 
																+ * Return the implicit dependencies for a task
															
 
																+ * Command queue must be locked!
															
 
																  */
															
 
																-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event) {
															
 
																-
															
 
																-  int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
															
 
																-
															
 
																-  /* Set explicit task dependencies */
															
 
																-  task_dependency_add(task, num_events, events);
															
 
																-
															
 
																-  /* Lock command queue */
															
 
																-  pthread_spin_lock(&cq->spin);
															
 
																+void command_queue_dependencies_implicit(
															
 
																+	cl_command_queue cq, 	/* Command queue */
															
 
																+	char is_barrier,	/* Is the task a barrier */
															
 
																+	cl_int * ret_num_events,	/* Returned number of dependencies */
															
 
																+	cl_event ** ret_events	/* Returned dependencies */
															
 
																+) {
															
 
																+	command_list it;
															
 
																+	int total = 0;
															
 
																+	int filled = 0;
															
 
																+
															
 
																+	/* First pass: size the result. The last barrier (if any) counts
															
 
																+	 * for one; a barrier also depends on every entry of cq->commands. */
															
 
																+	if (cq->barrier != NULL)
															
 
																+		total += 1;
															
 
																+	if (is_barrier) {
															
 
																+		for (it = cq->commands; it != NULL; it = it->next)
															
 
																+			total += 1;
															
 
																+	}
															
 
																+
															
 
																+	/* Second pass: fill the array in the order it was counted */
															
 
																+	cl_event * deps = malloc(total * sizeof(cl_event));
															
 
																+	if (cq->barrier != NULL)
															
 
																+		deps[filled++] = cq->barrier->event;
															
 
																+	if (is_barrier) {
															
 
																+		for (it = cq->commands; it != NULL; it = it->next)
															
 
																+			deps[filled++] = it->cmd->event;
															
 
																+	}
															
 
																+
															
 
																+	/* Hand the malloc'ed array and its length back to the caller */
															
 
																+	*ret_num_events = total;
															
 
																+	*ret_events = deps;
															
 
																+}
															
 
																+	
															
 
																+/**
															
 
																+ * Insert a command in the command queue
															
 
																+ * The command queue must be locked!
															
 
																+ */
															
 
																+void command_queue_insert(
															
 
																+	cl_command_queue 	cq, 	/* Command queue */
															
 
																+	cl_command 		cmd,	/* Command */
															
 
																+	int 			is_barrier		/* Is the task a barrier */
															
 
																+) {
															
 
																-  /* Add dependency to last barrier if applicable */
															
 
																-  if (cq->barrier != NULL)
															
 
																-    task_dependency_add(task, 1, &cq->barrier);
															
 
																+	int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
															
 
																-  /* Add dependencies to out-of-order events (if any) */
															
 
																-  if (barrier) {
															
 
																-    while (cq->events != NULL) {
															
 
																-      task_dependency_add(task, 1, &cq->events);
															
 
																-      cq->events = cq->events->next;
															
 
																-    }
															
 
																-  }
															
 
																+	if (is_barrier)
															
 
																+		cq->commands = NULL;
															
 
																-  cl_event ev = (fake_event == NULL ? task_event(task) : fake_event);
															
 
																+	/* Add command to the list of out-of-order commands */
															
 
																+	if (!in_order)
															
 
																+		cq->commands = command_list_cons(cmd, cq->commands);
															
 
																-  /* Add event to the list of out-of-order events */
															
 
																-  if (!in_order) {
															
 
																-    ev->next = cq->events;
															
 
																-    ev->prev = NULL;
															
 
																-    if (cq->events != NULL)
															
 
																-      cq->events->prev = ev;
															
 
																-    cq->events = ev;
															
 
																-  }
															
 
																+	/* Register this event as last barrier */
															
 
																+	if (is_barrier || in_order)
															
 
																+		cq->barrier = cmd;
															
 
																-  /* Register this event as last barrier */
															
 
																-  if (barrier || in_order)
															
 
																-    cq->barrier = ev;
															
 
																+	/* Add reference to the command queue */
															
 
																+	gc_entity_store(&cmd->event->cq, cq);
															
 
																+}
															
 
																-   /* Unlock command queue */
															
 
																-   pthread_spin_unlock(&cq->spin);
															
 
																+/**
															
 
																+ * Return implicit and explicit dependencies for a task
															
 
																+ * The command queue must be locked!
															
 
																+ */
															
 
																+void command_queue_dependencies(
															
 
																+	cl_command_queue 	cq,		/* Command queue */
															
 
																+	int 			is_barrier,	/* Is the task a barrier */
															
 
																+	cl_int 			num_events,	/* Number of explicit dependencies */
															
 
																+	const cl_event *	events,		/* Explicit dependencies */
															
 
																+	cl_int * 		ret_num_events,	/* Returned number of dependencies */
															
 
																+	cl_event ** 		ret_events	/* Returned dependencies */
															
 
																+) {
															
 
																+	cl_int implicit_num_events;
															
 
																+	cl_event * implicit_events;
															
 
																+
															
 
																+	/* Implicit dependencies */
															
 
																+	command_queue_dependencies_implicit(cq, is_barrier, &implicit_num_events, &implicit_events);
															
 
																+
															
 
																+	/* Explicit dependencies: the result holds the implicit ones first,
															
 
																+	 * followed by the explicit ones */
															
 
																+	cl_int ndeps = implicit_num_events + num_events;
															
 
																+	cl_event * evs = malloc(sizeof(cl_event) * ndeps);
															
 
																+	/* Skip empty copies: memcpy() must not be given a NULL pointer,
															
 
																+	 * even for a size of 0 */
															
 
																+	if (implicit_num_events > 0)
															
 
																+		memcpy(evs, implicit_events, sizeof(cl_event) * implicit_num_events);
															
 
																+	if (num_events > 0)
															
 
																+		memcpy(&evs[implicit_num_events], events, sizeof(cl_event) * num_events);
															
 
																+
															
 
																+	/* The implicit array was malloc'ed for us and has been copied:
															
 
																+	 * release it so that it does not leak on every enqueue */
															
 
																+	free(implicit_events);
															
 
																+
															
 
																+	*ret_num_events = ndeps;
															
 
																+	*ret_events = evs;
															
 
																+}
															
 
																-   /* Add reference to the command queue */
															
 
																-   gc_entity_store(&ev->cq, cq);
															
 
																+void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_events, const cl_event * events) {
															
 
																-   /* Submit task */
															
 
																-   gc_entity_retain(task_event(task));
															
 
																-   int ret = starpu_task_submit(task);
															
 
																-   if (ret != 0)
															
 
																-      DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
															
 
																+	/* Check if the command is a barrier */
															
 
																+	int is_barrier = 0;
															
 
																+	if (cmd->typ == CL_COMMAND_BARRIER) {
															
 
																+		is_barrier = 1;
															
 
																+		/* OpenCL has no CL_COMMAND_BARRIER type, so we fall back on CL_COMMAND_MARKER */
															
 
																+		cmd->typ = CL_COMMAND_MARKER;
															
 
																+	}
															
 
																-   return CL_SUCCESS;
															
 
																-}
															
 
																+	/* Set command queue field */
															
 
																+	cmd->cq = cq;
															
 
																-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events) {
															
 
																-  return command_queue_enqueue_fakeevent(cq, task, barrier, num_events, events, NULL);
															
 
																-}
															
 
																+	/* Lock command queue */
															
 
																+	pthread_mutex_lock(&cq->mutex);
															
 
																+	//FIXME: crappy separation (command_queue_dependencies + command_queue_insert)
															
 
																-cl_event enqueueBarrier(cl_command_queue cq) {
															
 
																+	/* Get all (explicit + implicit) dependencies */
															
 
																+	cl_int all_num_events;
															
 
																+	cl_event * all_events;
															
 
																+	command_queue_dependencies(cq, is_barrier, num_events, events, &all_num_events, &all_events);
															
 
																-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_BARRIER doesn't exist
															
 
																-   starpu_task * task = task_create(CL_COMMAND_MARKER);
															
 
																+	/* Make all dependencies explicit for the command */
															
 
																+	cmd->num_events = all_num_events;
															
 
																+	cmd->events = all_events;
															
 
																-   DEBUG_MSG("Submitting barrier task (event %d)\n", task->tag_id);
															
 
																-   command_queue_enqueue(cq, task, 1, 0, NULL);
															
 
																+	/* Insert command in the queue */
															
 
																+	command_queue_insert(cq, cmd, is_barrier);
															
 
																-   return task_event(task);
															
 
																+	/* Unlock command queue */
															
 
																+	pthread_mutex_unlock(&cq->mutex);
															
 
																 }
															
--- a/socl/src/command_queue.h
+++ b/socl/src/command_queue.h
@@ -17,10 +17,14 @@
 
																 #ifndef SOCL_COMMAND_QUEUE_H
															
 
																 #define SOCl_COMMAND_QUEUE_H
															
 
																-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events);
															
 
																+void command_queue_enqueue_ex(
															
 
																+	cl_command_queue 	cq,		/* Command queue */
															
 
																+	cl_command		cmd,		/* Command to enqueue */
															
 
																+	cl_uint			num_events,	/* Number of explicit dependencies */
															
 
																+	const cl_event *	events		/* Explicit dependencies */
															
 
																+	);
															
 
																-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event);
															
 
																-
															
 
																-cl_event enqueueBarrier(cl_command_queue cq);
															
 
																+#define command_queue_enqueue(cq, cmd, num_events, events)\
															
 
																+	command_queue_enqueue_ex(cq, (cl_command)cmd, num_events, events)
															
 
																 #endif /* SOCl_COMMAND_QUEUE_H */
															
--- a/socl/src/debug.c
+++ b/socl/src/debug.c
@@ -0,0 +1,76 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "socl.h"
															
 
																+
															
 
																+#ifdef STARPU_VERBOSE
															
 
																+void DEBUG_CL(char *s, cl_int err) {
															
 
																+   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
															
 
																+   switch(err) {
															
 
																+      case CL_SUCCESS:
															
 
																+         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
															
 
																+         break;
															
 
																+      ERR_CASE(CL_DEVICE_NOT_FOUND)
															
 
																+      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
															
 
																+      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
															
 
																+      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
															
 
																+      ERR_CASE(CL_OUT_OF_RESOURCES)
															
 
																+      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
															
 
																+      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
															
 
																+      ERR_CASE(CL_MEM_COPY_OVERLAP)
															
 
																+      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
															
 
																+      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
															
 
																+      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
															
 
																+      ERR_CASE(CL_MAP_FAILURE)
															
 
																+      ERR_CASE(CL_INVALID_VALUE)
															
 
																+      ERR_CASE(CL_INVALID_DEVICE_TYPE)
															
 
																+      ERR_CASE(CL_INVALID_PLATFORM)
															
 
																+      ERR_CASE(CL_INVALID_DEVICE)
															
 
																+      ERR_CASE(CL_INVALID_CONTEXT)
															
 
																+      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
															
 
																+      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
															
 
																+      ERR_CASE(CL_INVALID_HOST_PTR)
															
 
																+      ERR_CASE(CL_INVALID_MEM_OBJECT)
															
 
																+      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
															
 
																+      ERR_CASE(CL_INVALID_IMAGE_SIZE)
															
 
																+      ERR_CASE(CL_INVALID_SAMPLER)
															
 
																+      ERR_CASE(CL_INVALID_BINARY)
															
 
																+      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
															
 
																+      ERR_CASE(CL_INVALID_PROGRAM)
															
 
																+      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
															
 
																+      ERR_CASE(CL_INVALID_KERNEL_NAME)
															
 
																+      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
															
 
																+      ERR_CASE(CL_INVALID_KERNEL)
															
 
																+      ERR_CASE(CL_INVALID_ARG_INDEX)
															
 
																+      ERR_CASE(CL_INVALID_ARG_VALUE)
															
 
																+      ERR_CASE(CL_INVALID_ARG_SIZE)
															
 
																+      ERR_CASE(CL_INVALID_KERNEL_ARGS)
															
 
																+      ERR_CASE(CL_INVALID_WORK_DIMENSION)
															
 
																+      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
															
 
																+      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
															
 
																+      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
															
 
																+      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
															
 
																+      ERR_CASE(CL_INVALID_EVENT)
															
 
																+      ERR_CASE(CL_INVALID_OPERATION)
															
 
																+      ERR_CASE(CL_INVALID_GL_OBJECT)
															
 
																+      ERR_CASE(CL_INVALID_BUFFER_SIZE)
															
 
																+      ERR_CASE(CL_INVALID_MIP_LEVEL)
															
 
																+      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
															
 
																+      default:
															
 
																+         DEBUG_MSG("%s CL error: Error message not supported by DEBUG_CL macro (%d).\n", s, err);
															
 
																+   }
															
 
																+}
															
 
																+#endif
															
--- a/socl/src/debug.h
+++ b/socl/src/debug.h
@@ -17,10 +17,12 @@
 
																 #ifndef SOCL_DEBUG_H
															
 
																 #define SOCL_DEBUG_H
															
 
																+#include <../src/common/config.h>
															
 
																+
															
 
																 #ifdef STARPU_VERBOSE
															
 
																 #define DEBUG
															
 
																 #include <stdio.h>
															
 
																-   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__); } while (0);
															
 
																+   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__);} while (0);
															
 
																    #define DEBUG_MSG_NOHEAD(...) fprintf(stderr, __VA_ARGS__)
															
 
																    #define DEBUG_ERROR(...) do { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); exit(1); } while (0);
															
 
																 #else
															
@@ -35,62 +37,7 @@
 
																 #define ERROR_STOP(...) do { ERROR_MSG(__VA_ARGS__); exit(1); } while(0);
															
 
																 #ifdef STARPU_VERBOSE
															
 
																-void DEBUG_CL(char *s, cl_int err) {
															
 
																-   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
															
 
																-   switch(err) {
															
 
																-      case CL_SUCCESS:
															
 
																-         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
															
 
																-         break;
															
 
																-      ERR_CASE(CL_DEVICE_NOT_FOUND)
															
 
																-      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
															
 
																-      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
															
 
																-      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
															
 
																-      ERR_CASE(CL_OUT_OF_RESOURCES)
															
 
																-      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
															
 
																-      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
															
 
																-      ERR_CASE(CL_MEM_COPY_OVERLAP)
															
 
																-      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
															
 
																-      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
															
 
																-      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
															
 
																-      ERR_CASE(CL_MAP_FAILURE)
															
 
																-      ERR_CASE(CL_INVALID_VALUE)
															
 
																-      ERR_CASE(CL_INVALID_DEVICE_TYPE)
															
 
																-      ERR_CASE(CL_INVALID_PLATFORM)
															
 
																-      ERR_CASE(CL_INVALID_DEVICE)
															
 
																-      ERR_CASE(CL_INVALID_CONTEXT)
															
 
																-      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
															
 
																-      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
															
 
																-      ERR_CASE(CL_INVALID_HOST_PTR)
															
 
																-      ERR_CASE(CL_INVALID_MEM_OBJECT)
															
 
																-      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
															
 
																-      ERR_CASE(CL_INVALID_IMAGE_SIZE)
															
 
																-      ERR_CASE(CL_INVALID_SAMPLER)
															
 
																-      ERR_CASE(CL_INVALID_BINARY)
															
 
																-      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
															
 
																-      ERR_CASE(CL_INVALID_PROGRAM)
															
 
																-      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
															
 
																-      ERR_CASE(CL_INVALID_KERNEL_NAME)
															
 
																-      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
															
 
																-      ERR_CASE(CL_INVALID_KERNEL)
															
 
																-      ERR_CASE(CL_INVALID_ARG_INDEX)
															
 
																-      ERR_CASE(CL_INVALID_ARG_VALUE)
															
 
																-      ERR_CASE(CL_INVALID_ARG_SIZE)
															
 
																-      ERR_CASE(CL_INVALID_KERNEL_ARGS)
															
 
																-      ERR_CASE(CL_INVALID_WORK_DIMENSION)
															
 
																-      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
															
 
																-      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
															
 
																-      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
															
 
																-      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
															
 
																-      ERR_CASE(CL_INVALID_EVENT)
															
 
																-      ERR_CASE(CL_INVALID_OPERATION)
															
 
																-      ERR_CASE(CL_INVALID_GL_OBJECT)
															
 
																-      ERR_CASE(CL_INVALID_BUFFER_SIZE)
															
 
																-      ERR_CASE(CL_INVALID_MIP_LEVEL)
															
 
																-      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
															
 
																-      default:
															
 
																-         DEBUG_MSG("%s CL error: Error message not supported by print_cl_error (%d).\n", s, err);
															
 
																-   }
															
 
																-}
															
 
																+void DEBUG_CL(char *s, cl_int err);
															
 
																 #else
															
 
																    #define DEBUG_CL(...) while(0);
															
 
																 #endif
															
--- a/socl/src/event.c
+++ b/socl/src/event.c
@@ -20,21 +20,24 @@
 
																 static void release_callback_event(void * e);
															
 
																+int event_unique_id() {
															
 
																+   static int id = 1;
															
 
																+
															
 
																+   return __sync_fetch_and_add(&id,1);
															
 
																+}
															
 
																+
															
 
																 /**
															
 
																  * Create a new event
															
 
																  *
															
 
																  * Events have one-to-one relation with tag. Tag number is event ID
															
 
																  */
															
 
																 cl_event event_create(void) {
															
 
																-   static int id = 1;
															
 
																    cl_event ev;
															
 
																    ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event);
															
 
																-   ev->next = NULL;
															
 
																-   ev->prev = NULL;
															
 
																-   ev->id = __sync_fetch_and_add(&id,1);
															
 
																+   ev->id = event_unique_id();
															
 
																    ev->status = CL_SUBMITTED;
															
 
																-   ev->type = 0;
															
 
																+   ev->command = NULL;
															
 
																    ev->profiling_info = NULL;
															
 
																    ev->cq = NULL;
															
@@ -49,22 +52,17 @@ static void release_callback_event(void * e) {
 
																   /* Remove from command queue */
															
 
																   if (cq != NULL) {
															
 
																     /* Lock command queue */
															
 
																-    pthread_spin_lock(&cq->spin);
															
 
																+    pthread_mutex_lock(&cq->mutex);
															
 
																     /* Remove barrier if applicable */
															
 
																-    if (cq->barrier == event)
															
 
																+    if (cq->barrier == event->command)
															
 
																       cq->barrier = NULL;
															
 
																-    /* Remove from the list of out-of-order events */
															
 
																-    if (event->prev != NULL)
															
 
																-      event->prev->next = event->next;
															
 
																-    if (event->next != NULL)
															
 
																-      event->next->prev = event->prev;
															
 
																-    if (cq->events == event)
															
 
																-      cq->events = event->next;
															
 
																+    /* Remove from the list of out-of-order commands */
															
 
																+    cq->commands = command_list_remove(cq->commands, event->command);
															
 
																     /* Unlock command queue */
															
 
																-    pthread_spin_unlock(&cq->spin);
															
 
																+    pthread_mutex_unlock(&cq->mutex);
															
 
																     gc_entity_unstore(&cq);
															
 
																   }
															
--- a/socl/src/event.h
+++ b/socl/src/event.h
@@ -26,4 +26,9 @@
 
																  */
															
 
																 cl_event event_create(void);
															
 
																+/**
															
 
																+ * Generate a unique tag id
															
 
																+ */
															
 
																+int event_unique_id();
															
 
																+
															
 
																 #endif /* SOCL_EVENT_H */
															
--- a/socl/src/gc.c
+++ b/socl/src/gc.c
@@ -104,13 +104,15 @@ void gc_stop(void) {
 
																   pthread_join(gc_thread, NULL);
															
 
																 }
															
 
																-void gc_entity_release_ex(entity e) {
															
 
																+int gc_entity_release_ex(entity e) {
															
 
																   /* Decrement reference count */
															
 
																   int refs = __sync_sub_and_fetch(&e->refs, 1);
															
 
																   if (refs != 0)
															
 
																-    return;
															
 
																+    return 0;
															
 
																+
															
 
																+  DEBUG_MSG("Releasing entity %lx\n", e);
															
 
																   GC_LOCK;
															
@@ -127,6 +129,8 @@ void gc_entity_release_ex(entity e) {
 
																   gc_list = e;
															
 
																   GC_UNLOCK;
															
 
																+
															
 
																+  return 1;
															
 
																 }
															
--- a/socl/src/gc.h
+++ b/socl/src/gc.h
@@ -27,7 +27,7 @@ void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*));
 
																 void gc_entity_retain(void *arg);
															
 
																 /** Decrement reference counter and release entity if applicable */
															
 
																-void gc_entity_release_ex(entity e);
															
 
																+int gc_entity_release_ex(entity e);
															
 
																 int gc_active_entity_count(void);
															
--- a/socl/src/graph.c
+++ b/socl/src/graph.c
@@ -1,123 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2010,2011 University of Bordeaux
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#include "socl.h"
															
 
																-#include "graph.h"
															
 
																-#include "event.h"
															
 
																-
															
 
																-static pthread_spinlock_t graph_lock;
															
 
																-static graph_node graph_nodes = NULL;
															
 
																-
															
 
																-
															
 
																-/**
															
 
																- * Initialize graph structure
															
 
																- */
															
 
																-void graph_init(void) {
															
 
																-	pthread_spin_init(&graph_lock, PTHREAD_PROCESS_PRIVATE);
															
 
																-}
															
 
																-
															
 
																-/**
															
 
																- * Release graph structure
															
 
																- */
															
 
																-void graph_destroy(void) {
															
 
																-	pthread_spin_destroy(&graph_lock);
															
 
																-}
															
 
																-
															
 
																-/**
															
 
																- * Initialize a graph node
															
 
																- */
															
 
																-void graph_node_init(graph_node node) {
															
 
																-	node->id = -1;
															
 
																-	node->next = NULL;
															
 
																-}
															
 
																-
															
 
																-/**
															
 
																- * Store a node in the graph
															
 
																- */
															
 
																-void graph_store(void * node) {
															
 
																-	pthread_spin_lock(&graph_lock);
															
 
																-
															
 
																-	graph_node n = (graph_node)node;
															
 
																-	n->next = graph_nodes;
															
 
																-	graph_nodes = n;
															
 
																-
															
 
																-	pthread_spin_unlock(&graph_lock);
															
 
																-}
															
 
																-
															
 
																-
															
 
																-
															
 
																-/**
															
 
																- * Duplicate a memory area into a fresh allocated buffer
															
 
																- */
															
 
																-static void * memdupa(const void *p, size_t size) {
															
 
																-	void * s = malloc(size);
															
 
																-	memcpy(s,p,size);
															
 
																-	return s;
															
 
																-}
															
 
																-
															
 
																-#define memdup(p, size) ((typeof(p))memdupa(p,size))
															
 
																-#define nullOrDup(name,size) s->name = (name == NULL ? NULL : memdup(name,size))
															
 
																-#define dup(name) s->name = name
															
 
																-
															
 
																-
															
 
																-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
															
 
																-		cl_command_queue cq,
															
 
																-		cl_kernel        kernel,
															
 
																-		cl_uint          work_dim,
															
 
																-		const size_t *   global_work_offset,
															
 
																-		const size_t *   global_work_size,
															
 
																-		const size_t *   local_work_size,
															
 
																-		cl_uint          num_events,
															
 
																-		const cl_event * events,
															
 
																-		cl_event *       event,
															
 
																-		cl_uint 		num_args,
															
 
																-		size_t *		arg_sizes,
															
 
																-		enum kernel_arg_type * arg_types,
															
 
																-		void **		args)
															
 
																-{
															
 
																-	node_enqueue_kernel s = malloc(sizeof(struct node_enqueue_kernel_t));
															
 
																-	graph_node_init(&s->node);
															
 
																-	s->node.id = NODE_ENQUEUE_KERNEL;
															
 
																-
															
 
																-	dup(is_task);
															
 
																-	dup(cq);
															
 
																-	dup(kernel);
															
 
																-	dup(work_dim);
															
 
																-	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
															
 
																-	nullOrDup(global_work_size, work_dim*sizeof(size_t));
															
 
																-	nullOrDup(local_work_size, work_dim*sizeof(size_t));
															
 
																-	dup(num_events);
															
 
																-	nullOrDup(events, num_events * sizeof(cl_event));
															
 
																-	dup(num_args);
															
 
																-	nullOrDup(arg_sizes, num_args * sizeof(size_t));
															
 
																-	nullOrDup(arg_types, num_args * sizeof(enum kernel_arg_type));
															
 
																-	nullOrDup(args, num_args * sizeof(void*));
															
 
																-
															
 
																-	
															
 
																-	if (event != NULL) {
															
 
																-		*event = event_create();
															
 
																-		s->event = event;
															
 
																-	}
															
 
																-	else {
															
 
																-		s->event = NULL;
															
 
																-	}
															
 
																-
															
 
																-	return s;
															
 
																-}
															
 
																-
															
 
																-#undef nullOrDup
															
 
																-#undef memdup
															
 
																-#undef dup
															
--- a/socl/src/graph.h
+++ b/socl/src/graph.h
@@ -1,73 +0,0 @@
 
																-/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																- *
															
 
																- * Copyright (C) 2010,2011 University of Bordeaux
															
 
																- *
															
 
																- * StarPU is free software; you can redistribute it and/or modify
															
 
																- * it under the terms of the GNU Lesser General Public License as published by
															
 
																- * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																- * your option) any later version.
															
 
																- *
															
 
																- * StarPU is distributed in the hope that it will be useful, but
															
 
																- * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																- *
															
 
																- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																- */
															
 
																-
															
 
																-#ifndef SOCL_GRAPH_H
															
 
																-#define SOCL_GRAPH_H
															
 
																-
															
 
																-#include "socl.h"
															
 
																-
															
 
																-typedef struct graph_node_t * graph_node;
															
 
																-
															
 
																-struct graph_node_t {
															
 
																-	int id; /* Kind of node */
															
 
																-	graph_node next; /* Linked-list of nodes... */
															
 
																-};
															
 
																-
															
 
																-void graph_init(void);
															
 
																-void graph_destroy(void);
															
 
																-void graph_node_init(graph_node node);
															
 
																-void graph_store(void * node);
															
 
																-
															
 
																-#define NODE_ENQUEUE_KERNEL 1
															
 
																-
															
 
																-
															
 
																-typedef struct node_enqueue_kernel_t {
															
 
																-	struct graph_node_t node;
															
 
																-
															
 
																-	char 		 is_task; /* Set if clEnqueueTask is used */
															
 
																-	cl_command_queue cq;
															
 
																-	cl_kernel        kernel;
															
 
																-	cl_uint          work_dim;
															
 
																-	const size_t *   global_work_offset;
															
 
																-	const size_t *   global_work_size;
															
 
																-	const size_t *   local_work_size;
															
 
																-	cl_uint          num_events;
															
 
																-	const cl_event * events;
															
 
																-	cl_event * 	 event;
															
 
																-	cl_uint 	 num_args;
															
 
																-	size_t *	 arg_sizes;
															
 
																-	enum kernel_arg_type * arg_types;
															
 
																-	void **		 args;
															
 
																-} * node_enqueue_kernel;
															
 
																-
															
 
																-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
															
 
																-		cl_command_queue cq,
															
 
																-		cl_kernel        kernel,
															
 
																-		cl_uint          work_dim,
															
 
																-		const size_t *   global_work_offset,
															
 
																-		const size_t *   global_work_size,
															
 
																-		const size_t *   local_work_size,
															
 
																-		cl_uint          num_events,
															
 
																-		const cl_event * events,
															
 
																-		cl_event *       event,
															
 
																-		cl_uint 		num_args,
															
 
																-		size_t *		arg_sizes,
															
 
																-		enum kernel_arg_type * arg_types,
															
 
																-		void **		args);
															
 
																-
															
 
																-cl_int node_play_enqueue_kernel(node_enqueue_kernel n);
															
 
																-
															
 
																-#endif /* SOCL_GRAPH_H */
															
--- a/socl/src/init.c
+++ b/socl/src/init.c
@@ -15,7 +15,6 @@
 
																  */
															
 
																 #include "socl.h"
															
 
																-#include "graph.h"
															
 
																 #include "gc.h"
															
 
																 #include "mem_objects.h"
															
@@ -25,7 +24,6 @@
 
																 __attribute__((constructor)) static void socl_init() {
															
 
																   mem_object_init();
															
 
																-  graph_init();
															
 
																   starpu_init(NULL);
															
@@ -51,7 +49,5 @@ __attribute__((destructor)) static void socl_shutdown() {
 
																   if (active_entities != 0)
															
 
																     fprintf(stderr, "Unreleased entities: %d\n", active_entities);
															
 
																-  graph_destroy();
															
 
																-
															
 
																   starpu_shutdown();
															
 
																 }
															
--- a/socl/src/socl.h
+++ b/socl/src/socl.h
@@ -23,6 +23,9 @@
 
																 #include CL_HEADERS "CL/cl.h"
															
 
																 #endif
															
 
																+/* Additional command type */
															
 
																+#define CL_COMMAND_BARRIER 0x99987
															
 
																+
															
 
																 #include <string.h>
															
 
																 #include <stdlib.h>
															
 
																 #include <stdint.h>
															
@@ -35,7 +38,7 @@
 
																 #include <starpu_profiling.h>
															
 
																 #include <starpu_task.h>
															
 
																-typedef struct starpu_task starpu_task;
															
 
																+typedef struct starpu_task * starpu_task;
															
 
																 #ifdef UNUSED
															
 
																 #elif defined(__GNUC__)
															
@@ -50,12 +53,13 @@ typedef struct starpu_task starpu_task;
 
																  */
															
 
																 typedef struct entity * entity;
															
 
																+#include "command.h"
															
 
																+#include "command_list.h"
															
 
																 #include "command_queue.h"
															
 
																 #include "debug.h"
															
 
																 #include "devices.h"
															
 
																 #include "event.h"
															
 
																 #include "gc.h"
															
 
																-#include "graph.h"
															
 
																 #include "mem_objects.h"
															
 
																 #include "task.h"
															
 
																 #include "util.h"
															
@@ -79,11 +83,32 @@ struct entity {
 
																 struct _cl_platform_id {};
															
 
																-#define RETURN_EVENT(ev, event) \
															
 
																-   if (event != NULL) \
															
 
																-      *event = ev; \
															
 
																-   else\
															
 
																-      gc_entity_release(ev);
															
 
																+#define RETURN_EVENT(cmd, event) \
															
 
																+	if (event != NULL) { \
															
 
																+		cl_event ev = command_event_get(cmd);\
															
 
																+		gc_entity_retain(ev);\
															
 
																+		*event = ev; \
															
 
																+	}
															
 
																+
															
 
																+#define RETURN_CUSTOM_EVENT(src, tgt) \
															
 
																+	if (tgt != NULL) { \
															
 
																+		gc_entity_retain(src); \
															
 
																+		*tgt = src; \
															
 
																+	}
															
 
																+
															
 
																+#define MAY_BLOCK(blocking) \
															
 
																+	if ((blocking) == CL_TRUE) {\
															
 
																+		cl_event ev = command_event_get(cmd);\
															
 
																+		soclWaitForEvents(1, &ev);\
															
 
																+		gc_entity_release(ev);\
															
 
																+	}
															
 
																+
															
 
																+#define MAY_BLOCK_CUSTOM(blocking,event) \
															
 
																+	if ((blocking) == CL_TRUE) {\
															
 
																+		cl_event ev = (event);\
															
 
																+		soclWaitForEvents(1, &ev);\
															
 
																+		gc_entity_release(ev);\
															
 
																+	}
															
 
																 /* Constants */
															
 
																 struct _cl_platform_id socl_platform;
															
@@ -121,14 +146,14 @@ struct _cl_command_queue {
 
																   cl_device_id device;
															
 
																   cl_context context;
															
 
																-  /* Stored command events */
															
 
																-  cl_event events;
															
 
																+  /* Stored commands */
															
 
																+  command_list commands;
															
 
																   /* Last enqueued barrier-like event */
															
 
																-  cl_event barrier;
															
 
																+  cl_command barrier;
															
 
																   /* Mutex */
															
 
																-  pthread_spinlock_t spin;
															
 
																+  pthread_mutex_t mutex;
															
 
																   /* ID  */
															
 
																 #ifdef DEBUG
															
@@ -142,12 +167,8 @@ struct _cl_event {
 
																   /* Command queue */
															
 
																   cl_command_queue cq;
															
 
																-  /* Command type */
															
 
																-  cl_command_type type;
															
 
																-
															
 
																-  /* Command queue list */
															
 
																-  cl_event prev;
															
 
																-  cl_event next;
															
 
																+  /* Command */
															
 
																+  cl_command command;
															
 
																   /* Event status */
															
 
																   cl_int status;
															
@@ -244,7 +265,7 @@ struct _cl_kernel {
 
																   cl_int *errcodes;
															
 
																   /* Arguments */
															
 
																-  unsigned int arg_count;
															
 
																+  unsigned int num_args;
															
 
																   size_t *arg_size;
															
 
																   enum kernel_arg_type  *arg_type;
															
 
																   void  **arg_value;
															
--- a/socl/src/task.c
+++ b/socl/src/task.c
@@ -18,64 +18,90 @@
 
																 #include "gc.h"
															
 
																 #include "event.h"
															
 
																-cl_event task_event(starpu_task *task) {
															
 
																-  return (cl_event)task->callback_arg;
															
 
																-}
															
 
																-
															
 
																 static void task_release_callback(void *arg) {
															
 
																-  starpu_task *task = starpu_get_current_task();
															
 
																-  cl_event ev = (cl_event)arg;
															
 
																+  starpu_task task = starpu_get_current_task();
															
 
																+  cl_command cmd = (cl_command)arg;
															
 
																+  cl_event ev = command_event_get(cmd);
															
 
																   ev->status = CL_COMPLETE;
															
 
																+  DEBUG_MSG("notifying tag %x as well as task tag %x\n", ev->id, task->tag_id);
															
 
																+
															
 
																+  /* Trigger the tag associated to the command event */
															
 
																+  starpu_tag_notify_from_apps(ev->id);
															
 
																+
															
 
																   if (task->profiling_info != NULL && (intptr_t)task->profiling_info != -ENOSYS) {
															
 
																     ev->profiling_info = malloc(sizeof(*task->profiling_info));
															
 
																     memcpy(ev->profiling_info, task->profiling_info, sizeof(*task->profiling_info));
															
 
																   }
															
 
																   gc_entity_release(ev);
															
 
																+
															
 
																+  /* Release the command */
															
 
																+  //TODO
															
 
																 }
															
 
																 /*
															
 
																  * Create a StarPU task
															
 
																- *
															
 
																- * Task's callback_arg is event
															
 
																- * Task's tag is set to event ID
															
 
																  */
															
 
																-starpu_task * task_create(cl_command_type type) {
															
 
																-   cl_event event;
															
 
																-   struct starpu_task * task;
															
 
																+starpu_task task_create() {
															
 
																+	struct starpu_task * task;
															
 
																+
															
 
																+	/* Create StarPU task */
															
 
																+	task = starpu_task_create();
															
 
																+
															
 
																+	/* Set task common settings */
															
 
																+	task->destroy = 1;
															
 
																+	task->detach = 1;
															
 
																+
															
 
																+	task->use_tag = 1;
															
 
																+	task->tag_id = event_unique_id();
															
 
																-   /* Create event */
															
 
																-   event = event_create();
															
 
																-   event->type = type;
															
 
																+	DEBUG_MSG("creating task with tag %x\n", task->tag_id);
															
 
																-   /* Create StarPU task */
															
 
																-   task = starpu_task_create();
															
 
																+	return task;
															
 
																+}
															
 
																+
															
 
																+
															
 
																+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
															
 
																+
															
 
																+	if (num_events != 0) {
															
 
																+		cl_uint i;
															
 
																+
															
 
																+		starpu_tag_t * tags = malloc(num_events * sizeof(starpu_tag_t));	
															
 
																-   /* Task tag is set to event id */
															
 
																-   task->use_tag = 1;
															
 
																-   task->tag_id = event->id;
															
 
																+		if (num_events != 0)
															
 
																+			DEBUG_MSG("Tag %d depends on %u tags:", task->tag_id, num_events);
															
 
																-   /* Set task common settings */
															
 
																-   task->destroy = 1;
															
 
																-   task->detach = 1;
															
 
																-   task->callback_func = task_release_callback;
															
 
																-   task->callback_arg = event;
															
 
																+		for (i=0; i<num_events; i++) {
															
 
																+			tags[i] = events[i]->id;
															
 
																+			DEBUG_MSG_NOHEAD(" %u", events[i]->id);
															
 
																+		}
															
 
																+		DEBUG_MSG_NOHEAD("\n");
															
 
																-   return task;
															
 
																+		starpu_tag_declare_deps_array(task->tag_id, num_events, tags);
															
 
																+
															
 
																+		free(tags);
															
 
																+	}
															
 
																 }
															
 
																+cl_int task_submit_ex(starpu_task task, cl_command cmd) {
															
 
																+
															
 
																+	/* Associate the task with the command */
															
 
																+	cmd->task = task;
															
 
																+
															
 
																+	task_depends_on(task, command_num_events_get(cmd), command_events_get(cmd));
															
 
																+
															
 
																+	task->callback_func = task_release_callback;
															
 
																+	task->callback_arg = cmd;
															
 
																-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events) {
															
 
																-   unsigned int i;
															
 
																+	/* Submit task */
															
 
																+	int ret = starpu_task_submit(task);
															
 
																+	if (ret != 0)
															
 
																+		DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
															
 
																-   for (i=0; i<num; i++) {
															
 
																-      starpu_tag_t tag = events[i]->id;
															
 
																-      DEBUG_MSG("Event %d depends on event %d\n", task->tag_id, events[i]->id);
															
 
																-      starpu_tag_declare_deps_array(task->tag_id, 1, &tag);
															
 
																-   }
															
 
																+	return CL_SUCCESS;
															
 
																 }
															
@@ -106,14 +132,14 @@ static starpu_codelet cputask_codelet = {
 
																    .cpu_func = &cputask_task
															
 
																 };
															
 
																-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg) {
															
 
																+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg) {
															
 
																   struct cputask_arg * a = malloc(sizeof(struct cputask_arg));
															
 
																   a->callback = callback;
															
 
																   a->arg = arg;
															
 
																   a->free_arg = free_arg;
															
 
																-  starpu_task *task = task_create(type);
															
 
																+  starpu_task task = task_create();
															
 
																   task->cl = &cputask_codelet;
															
 
																   task->cl_arg = a;
															
--- a/socl/src/task.h
+++ b/socl/src/task.h
@@ -19,9 +19,22 @@
 
																 #include "socl.h"
															
 
																-starpu_task * task_create(cl_command_type type);
															
 
																-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events);
															
 
																-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg);
															
 
																-cl_event task_event(starpu_task *task);
															
 
																+starpu_task task_create();
															
 
																+void task_dependency_add(starpu_task task, cl_uint num_events, cl_event *events);
															
 
																+
															
 
																+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg);
															
 
																+
															
 
																+/**
															
 
																+ * Associate a StarPU task to a command and submit it
															
 
																+ *
															
 
																+ * When the task terminates, the command is set as terminated too
															
 
																+ */
															
 
																+cl_int task_submit_ex(starpu_task task, cl_command cmd);
															
 
																+#define task_submit(task,cmd) task_submit_ex(task, (cl_command)cmd)
															
 
																+
															
 
																+/**
															
 
																+ * Add task dependencies
															
 
																+ */
															
 
																+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events);
															
 
																 #endif /* SOCL_TASK_H */
															
--- a/socl/src/util.c
+++ b/socl/src/util.c
@@ -24,3 +24,27 @@ int starpu_worker_get_range() {
 
																    return oid;
															
 
																 }
															
 
																+
															
 
																+void * memdupa(const void *p, size_t size) {
															
 
																+	void * s = malloc(size);
															
 
																+	memcpy(s,p,size);
															
 
																+	return s;
															
 
																+}
															
 
																+
															
 
																+void ** memdup_deep_safea(const void **p, unsigned n, size_t size) {
															
 
																+	void ** s = (void**)malloc(sizeof(void*) * n);
															
 
																+	unsigned i;
															
 
																+	for (i=0; i<n; i++) {
															
 
																+		s[i] = memdup_safe((void*)p[i], size);
															
 
																+	}
															
 
																+	return s;
															
 
																+}
															
 
																+
															
 
																+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size) {
															
 
																+	void ** s = (void**)malloc(sizeof(void*) * n);
															
 
																+	unsigned i;
															
 
																+	for (i=0; i<n; i++) {
															
 
																+		s[i] = memdup_safe((void*)p[i], size[i]);
															
 
																+	}
															
 
																+	return s;
															
 
																+}
															
--- a/socl/src/util.h
+++ b/socl/src/util.h
@@ -19,4 +19,28 @@
 
																 int starpu_worker_get_range();
															
 
																+/**
															
 
																+ * Duplicate a memory area into a freshly allocated buffer
															
 
																+ * Consider using memdup or memdup_safe instead
															
 
																+ */
															
 
																+void * memdupa(const void *p, size_t size);
															
 
																+
															
 
																+#define memdup(p, size) ((typeof(p))memdupa((const void*)p,size))
															
 
																+#define memdup_safe(p,size) (p == NULL ? NULL : memdup(p,size))
															
 
																+
															
 
																+/**
															
 
																+ * Duplicate an array of pointers by performing a deep copy
															
 
																+ */
															
 
																+void ** memdup_deep_safea(const void **p, unsigned n, size_t size);
															
 
																+
															
 
																+#define memdup_deep_safe(p,n,size) ((typeof(p))memdup_deep_safea((const void **)p,n,size))
															
 
																+
															
 
																+/**
															
 
																+ * Duplicate an array of pointers by performing a deep copy
															
 
																+ * Sizes are different for each cell
															
 
																+ */
															
 
																+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size);
															
 
																+
															
 
																+#define memdup_deep_varsize_safe(p,n,size) ((typeof(p))memdup_deep_varsize_safea((const void **)p,n,size))
															
 
																+
															
 
																 #endif /* SOCL_UTIL_H */
															
--- a/src/common/htable32.c
+++ b/src/common/htable32.c