Explorar o código

new merge (sched_policies not yet done)

Andra Hugo %!s(int64=14) %!d(string=hai) anos
pai
achega
64ee8754cc
Modificáronse 100 ficheiros con 2572 adicións e 1138 borrados
  1. 1 0
      .gitignore
  2. 2 0
      AUTHORS
  3. 7 1
      Makefile.am
  4. 42 23
      configure.ac
  5. 6 1
      doc/Makefile.am
  6. 160 0
      doc/c-extensions.texi
  7. 48 11
      doc/starpu.texi
  8. 2 0
      examples/Makefile.am
  9. 2 2
      examples/basic_examples/hello_world.c
  10. 3 3
      examples/basic_examples/mult.c
  11. 384 0
      examples/basic_examples/mult_impl.c
  12. 2 2
      examples/basic_examples/vector_scal_cpu.c
  13. 10 9
      examples/cholesky/cholesky_models.c
  14. 1 1
      examples/filters/fblock_cpu.c
  15. 1 1
      examples/filters/fmatrix.c
  16. 1 1
      examples/filters/fvector.c
  17. 13 12
      examples/heat/lu_kernels_model.c
  18. 1 1
      examples/mandelbrot/mandelbrot.c
  19. 6 6
      examples/ppm_downscaler/ppm_downscaler.c
  20. 8 8
      examples/ppm_downscaler/yuv_downscaler.c
  21. 1 1
      examples/profiling/profiling.c
  22. 4 4
      examples/reductions/dot_product.c
  23. 2 2
      examples/reductions/minmax_reduction.c
  24. 1 1
      examples/socl/mandelbrot/mandelbrot.c
  25. 2 2
      examples/spmv/spmv.c
  26. 1 0
      examples/starpufft/.gitignore
  27. 1 0
      examples/stencil/.gitignore
  28. 2 2
      examples/stencil/stencil-blocks.c
  29. 33 25
      examples/stencil/stencil-kernels.c
  30. 3 3
      examples/stencil/stencil.c
  31. 1 1
      examples/tag_example/tag_example.c
  32. 1 1
      examples/tag_example/tag_example2.c
  33. 1 1
      examples/tag_example/tag_example3.c
  34. 2 2
      examples/tag_example/tag_restartable.c
  35. 3 3
      examples/top/hello_world_top.c
  36. 2 0
      gcc-plugin/README
  37. 5 0
      gcc-plugin/examples/Makefile.am
  38. 8 36
      gcc-plugin/examples/matrix-mult.c
  39. 5 2
      gcc-plugin/src/Makefile.am
  40. 234 0
      gcc-plugin/src/c-expr.y
  41. 8 0
      gcc-plugin/src/starpu-gcc-config.h.in
  42. 148 96
      gcc-plugin/src/starpu.c
  43. 1 0
      gcc-plugin/tests/Makefile.am
  44. 2 2
      gcc-plugin/tests/acquire-errors.c
  45. 4 1
      gcc-plugin/tests/lib-user.c
  46. 8 2
      gcc-plugin/tests/register-errors.c
  47. 39 1
      gcc-plugin/tests/register.c
  48. 11 5
      gcc-plugin/tests/run-test.in
  49. 2 2
      gcc-plugin/tests/unregister-errors.c
  50. 27 0
      gcc-plugin/tests/wait-errors.c
  51. 1 0
      include/starpu_config.h.in
  52. 2 0
      include/starpu_data_filters.h
  53. 5 4
      include/starpu_perfmodel.h
  54. 6 4
      include/starpu_scheduler.h
  55. 19 1
      include/starpu_task.h
  56. 3 2
      include/starpu_task_bundle.h
  57. 2 0
      m4/.gitignore
  58. 9 1
      m4/gcc.m4
  59. 1 0
      mpi/.gitignore
  60. 1 0
      mpi/tests/.gitignore
  61. 8 6
      socl/src/Makefile.am
  62. 3 4
      socl/src/cl_createcommandqueue.c
  63. 11 11
      socl/src/cl_createkernel.c
  64. 2 2
      socl/src/cl_createprogramwithsource.c
  65. 4 3
      socl/src/cl_enqueuebarrier.c
  66. 40 28
      socl/src/cl_enqueuecopybuffer.c
  67. 25 48
      socl/src/cl_enqueuemapbuffer.c
  68. 15 5
      socl/src/cl_enqueuemarker.c
  69. 124 250
      socl/src/cl_enqueuendrangekernel.c
  70. 36 24
      socl/src/cl_enqueuereadbuffer.c
  71. 4 16
      socl/src/cl_enqueuetask.c
  72. 18 14
      socl/src/cl_enqueueunmapmemobject.c
  73. 3 4
      socl/src/cl_enqueuewaitforevents.c
  74. 42 33
      socl/src/cl_enqueuewritebuffer.c
  75. 9 6
      socl/src/cl_finish.c
  76. 2 1
      socl/src/cl_geteventinfo.c
  77. 1 1
      socl/src/cl_getkernelinfo.c
  78. 5 3
      socl/src/cl_setkernelarg.c
  79. 6 1
      socl/src/cl_waitforevents.c
  80. 235 0
      socl/src/command.c
  81. 198 0
      socl/src/command.h
  82. 40 0
      socl/src/command_list.c
  83. 28 0
      socl/src/command_list.h
  84. 124 53
      socl/src/command_queue.c
  85. 8 4
      socl/src/command_queue.h
  86. 76 0
      socl/src/debug.c
  87. 4 57
      socl/src/debug.h
  88. 13 15
      socl/src/event.c
  89. 5 0
      socl/src/event.h
  90. 6 2
      socl/src/gc.c
  91. 1 1
      socl/src/gc.h
  92. 0 123
      socl/src/graph.c
  93. 0 73
      socl/src/graph.h
  94. 0 4
      socl/src/init.c
  95. 39 18
      socl/src/socl.h
  96. 61 35
      socl/src/task.c
  97. 17 4
      socl/src/task.h
  98. 24 0
      socl/src/util.c
  99. 24 0
      socl/src/util.h
  100. 0 0
      src/common/htable32.c

+ 1 - 0
.gitignore

@@ -185,3 +185,4 @@ starpu.log
 /gcc-plugin/tests/unregister
 /gcc-plugin/tests/lib-user
 /gcc-plugin/examples/matrix-mult
+/gcc-plugin/src/c-expr.c

+ 2 - 0
AUTHORS

@@ -10,3 +10,5 @@ William Braik <wbraik@gmail.com>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
+David Gómez <david_gomez1380@yahoo.com.mx>
+Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>

+ 7 - 1
Makefile.am

@@ -18,6 +18,8 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 SUBDIRS = src
+SUBDIRS += tools tests doc
+
 if USE_MPI
 SUBDIRS += mpi
 endif
@@ -26,7 +28,7 @@ if BUILD_SOCL
 SUBDIRS += socl
 endif
 
-SUBDIRS += tools examples tests doc
+SUBDIRS += examples
 
 if COND_OPT
 SUBDIRS += tests/opt examples/opt
@@ -68,6 +70,8 @@ clean-local:
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
+uninstall-local:
+	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
 endif
 
 if STARPU_HAVE_WINDOWS
@@ -77,3 +81,5 @@ txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
 EXTRA_DIST = AUTHORS COPYING.LGPL README
+
+include starpu-top/extradist

+ 42 - 23
configure.ac

@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  Télécom-SudParis
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -33,8 +34,8 @@ AC_PROG_SED
 AC_PROG_LN_S
 AC_PROG_F77
 
-AC_LIBTOOL_WIN32_DLL
-AC_PROG_LIBTOOL
+LT_PREREQ([2.2])
+LT_INIT([win32-dll])
 
 AC_PROG_INSTALL
 AC_PROG_MKDIR_P
@@ -158,18 +159,6 @@ if test x$enable_libnuma = xyes; then
 fi
 
 ###############################################################################
-#									      #
-#				SCED_CTX settings			      #
-#									      #
-###############################################################################
-AC_MSG_CHECKING(maximum number of sched_ctxs)
-AC_ARG_ENABLE(max_sched_ctxs, [AS_HELP_STRING([--enable-max-sched-ctxs=<number>],
-			[maximum number of sched_ctxs])],
-			max_sched_ctxs=$enableval, max_sched_ctxs=10)
-AC_MSG_RESULT($max_sched_ctxs)
-AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of sched_ctxs supported])
-
-###############################################################################
 #                                                                             #
 #                                 CPUs settings                               #
 #                                                                             #
@@ -282,6 +271,7 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 
     if test "$have_valid_cuda" = "no" ; then
         LDFLAGS="${SAVED_LDFLAGS}"
+	unset STARPU_CUDA_LDFLAGS
     fi
 ])
 
@@ -460,8 +450,10 @@ if test x$have_curand = xyes; then
 fi
 
 # Peer transfers are only supported since CUDA 4.0
+# Disable them if user explicity wants to disable them
+AC_ARG_ENABLE(cuda_memcpy_peer, [AS_HELP_STRING([--disable-cuda-memcpy-peer], [do not allow peer transfers when using CUDA 4.0])],, [enable_cuda_memcpy_peer=yes])
 have_cuda_memcpy_peer=no
-if test x$enable_cuda = xyes; then
+if test x$enable_cuda_memcpy_peer = xyes -a x$enable_cuda = xyes ; then
     SAVED_LDFLAGS="${LDFLAGS}"
     LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
     AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
@@ -898,6 +890,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
+# Computes the maximum number of implementations per arch
+AC_MSG_CHECKING(maximum number of implementations)
+AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
+		[maximum number of implementations])],
+		maximplementations=$enableval, maximplementations=1)
+AC_MSG_RESULT($maximplementations)
+AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
+		[maximum number of implementations])
+
 ###############################################################################
 #                                                                             #
 #                                    MPI                                      #
@@ -1116,6 +1117,10 @@ else
    run_gcc_plugin_test_suite="no"
 fi
 
+# Bison is used to generate the C expression parser.  The generated
+# parser is part of the distribution, though.
+AC_PROG_YACC
+
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 
@@ -1285,6 +1290,10 @@ AC_MSG_CHECKING(which BLAS lib should be used)
 AC_MSG_RESULT($blas_lib)
 AC_SUBST(BLAS_LIB,$blas_lib)
 
+##########################################
+# FFT                                    #
+##########################################
+
 have_fftw=no
 have_fftwf=no
 have_fftwl=no
@@ -1295,7 +1304,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   have_fftw=yes
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
-
+ 
 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
   AC_SUBST([STARPU_HAVE_FFTWF], [1])
@@ -1310,6 +1319,10 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
 
+##########################################
+# hwloc                                  #
+##########################################
+
 AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
 SAVED_LDFLAGS="${LDFLAGS}"
 SAVED_CPPFLAGS="${CPPFLAGS}"
@@ -1364,23 +1377,29 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
-  chmod +x gcc-plugin/tests/run-test
 ])
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
 
-AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
+if test $build_gcc_plugin == "yes" ; then
+    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
+    AC_OUTPUT([
+	    gcc-plugin/Makefile
+	    gcc-plugin/src/Makefile
+	    gcc-plugin/tests/Makefile
+	    gcc-plugin/tests/run-test
+	    gcc-plugin/examples/Makefile
+            ])
+    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
+            chmod +x gcc-plugin/tests/run-test
+            ])
+fi 
 
 AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
-	gcc-plugin/Makefile
-	gcc-plugin/src/Makefile
-	gcc-plugin/tests/Makefile
-	gcc-plugin/tests/run-test
-	gcc-plugin/examples/Makefile
 	socl/Makefile
 	socl/src/Makefile
 	libstarpu.pc

+ 6 - 1
doc/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009  Université de Bordeaux 1
-# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,8 @@
 
 info_TEXINFOS = starpu.texi
 
+starpu_TEXINFOS = c-extensions.texi
+
 MAINTAINERCLEANFILES = starpu.pdf
 
 EXTRA_DIST = starpu.pdf \
@@ -28,6 +30,9 @@ EXTRA_DIST = starpu.pdf \
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
+uninstall-local:
+	$(RM) $(DESTDIR)$(infodir)/dir
+
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@

+ 160 - 0
doc/c-extensions.texi

@@ -0,0 +1,160 @@
+@c This is part of the StarPU Handbook.
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+
+@node C Extensions
+@chapter C Extensions
+
+@cindex C extensions
+@cindex GCC plug-in
+
+When configured with @code{--enable-gcc-extensions}, StarPU builds a
+plug-in for the GNU Compiler Collection (GCC), which defines extensions
+to the C language that make it easier to write StarPU code@footnote{This
+feature is only available for GCC 4.5 and later.}.  Those extensions
+include syntactic sugar for defining tasks and their implementations,
+invoking a task, and manipulating data buffers.
+
+This section does not require detailed knowledge of the StarPU library.
+
+Note: as of StarPU @value{VERSION}, this is still an area under
+development and subject to change.
+
+@menu
+* Defining Tasks::              Defining StarPU tasks
+* Registered Data Buffers::     Manipulating data buffers
+@end menu
+
+@node Defining Tasks
+@section Defining Tasks
+
+@cindex task
+@cindex task implementation
+
+The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
+
+@enumerate
+@item
+tasks may have several implementations---e.g., one for CPUs, one written
+in OpenCL, one written in CUDA;
+@item
+when a task is invoked, it may run in parallel, and StarPU is free to
+choose any of its implementations.
+@end enumerate
+
+Tasks and their implementations must be @emph{declared}.  These
+declarations are annotated with @dfn{attributes} (@pxref{Attribute
+Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
+(GCC)}): the declaration of a task is a regular C function declaration
+with an additional @code{task} attribute, and task implementations are
+declared with a @code{task_implementation} attribute.
+
+The following function attributes are provided:
+
+@table @code
+
+@item task
+@cindex @code{task} attribute
+Declare the given function as a StarPU task.  Its return type must be
+@code{void}, and it must not be defined---instead, a definition will
+automatically be provided by the compiler.
+
+Under the hood, declaring a task leads to the declaration of the
+corresponding @code{codelet} (@pxref{Codelet and Tasks}).  If one or
+more task implementations are declared in the same compilation unit,
+then the codelet and the function itself are also defined; they inherit
+the scope of the task.
+
+Scalar arguments to the task are passed by value and copied to the
+target device if need be---technically, they are passed as the
+@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
+
+Pointer arguments are assumed to be registered data buffers---the
+@code{buffers} argument of a task (@pxref{Codelets and Tasks,
+@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
+read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
+buffers are assumed to be used read-write (@code{STARPU_RW}).
+
+@item task_implementation (@var{target}, @var{task})
+@cindex @code{task_implementation} attribute
+Declare the given function as an implementation of @var{task} to run on
+@var{target}.  @var{target} must be a string, currently one of
+@code{"cpu"} or @code{"cuda"}.
+@c FIXME: Update when OpenCL support is ready.
+
+@end table
+
+Here is an example:
+
+@example
+static void matmul (const float *A, const float *B, float *C,
+		    size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task));
+
+static void matmul_cpu (const float *A, const float *B, float *C,
+			size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("cpu", matmul)));
+
+
+static void
+matmul_cpu (const float *A, const float *B, float *C,
+	    size_t nx, size_t ny, size_t nz)
+@{
+  size_t i, j, k;
+
+  for (j = 0; j < ny; j++)
+    for (i = 0; i < nx; i++)
+      @{
+	for (k = 0; k < nz; k++)
+	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
+      @}
+@}
+@end example
+
+@noindent
+A @code{matmult} task is defined; it has only one implementation,
+@code{matmult_cpu}, which runs on the CPU.  Variables @var{A} and
+@var{B} are input buffers, whereas @var{C} is considered an input/output
+buffer.  The task can be invoked like a regular C function:
+
+@example
+matmul (&A[i * zdim * bydim + k * bzdim * bydim],
+        &B[k * xdim * bzdim + j * bxdim * bzdim],
+        &C[i * xdim * bydim + j * bxdim * bydim],
+        bxdim, bydim, bzdim);
+@end example
+
+@noindent
+This leads to an @dfn{asynchronous invocation}, whereby @code{matmult}'s
+implementation may run in parallel with the continuation of the caller.
+
+The next section describes how memory buffers must be handled in
+StarPU-GCC code.
+
+
+@node Registered Data Buffers
+@section Registered Data Buffers
+
+Data buffers such as matrices and vectors that are to be passed to tasks
+must be @dfn{registered}.  Registration allows StarPU to handle data
+transfers among devices---e.g., transferring an input buffer from the
+CPU's main memory to a task scheduled to run a GPU (@pxref{StarPU Data
+Management Library}).
+
+The following pragmas are provided:
+
+@table @code
+
+@item #pragma starpu register @var{ptr} [@var{size}]
+Register @var{ptr} as a @var{size}-element buffer.
+
+@item #pragma starpu unregister @var{ptr}
+@item #pragma starpu acquire @var{ptr}
+
+@end table
+
+FIXME: finish
+
+@c Local Variables:
+@c TeX-master: "guile.texi"
+@c ispell-local-dictionary: "american"
+@c End:

+ 48 - 11
doc/starpu.texi

@@ -49,8 +49,9 @@ was last updated on @value{UPDATED}.
 * Configuring StarPU::          How to configure StarPU
 * StarPU API::                  The API to use StarPU
 * Advanced Topics::             Advanced use of StarPU
-* Full source code for the 'Scaling a Vector' example::  
+* C Extensions::                Easier StarPU programming with GCC
 
+* Full source code for the 'Scaling a Vector' example::
 * Function Index::              Index of C functions.
 @end menu
 
@@ -1227,16 +1228,16 @@ Partitioning can be applied several times, see
 @section Performance model example
 
 To achieve good scheduling, StarPU scheduling policies need to be able to
-estimate in advance the duration of a task. This is done by giving to codelets a
-performance model. There are several kinds of performance models.
+estimate in advance the duration of a task. This is done by giving to codelets
+a performance model, by defining a @code{starpu_perfmodel_t} structure and
+providing its address in the @code{model} field of the @code{starpu_codelet}
+structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel_t}
+are mandatory, to give a name to the model, and the type of the model, since
+there are several kinds of performance models.
 
 @itemize
 @item
-Providing an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
-see for instance
-@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
-@item
-Measured at runtime (STARPU_HISTORY_BASED model type). This assumes that for a
+Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
 given set of data input/output sizes, the performance will always be about the
 same. This is very true for regular kernels on GPUs for instance (<0.1% error),
 and just a bit less true on CPUs (~=1% error). This also assumes that there are
@@ -1276,7 +1277,7 @@ starpu_codelet cl = @{
 @end cartouche
 
 @item
-Measured at runtime and refined by regression (STARPU_REGRESSION_*_BASED
+Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
 model type). This still assumes performance regularity, but can work
 with various data input sizes, by applying regression over observed
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
@@ -1286,7 +1287,12 @@ STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
 model for the @code{memset} operation.
 
 @item
-Provided explicitly by the application (STARPU_PER_ARCH model type): the
+Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
+see for instance
+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
+
+@item
+Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
 @code{.per_arch[i].cost_model} fields have to be filled with pointers to
 functions which return the expected duration of the task in micro-seconds, one
 per architecture.
@@ -1877,6 +1883,22 @@ TODO
 
 @c what kind of information do we get ?
 
+The bus speed measured by StarPU can be displayed by using the
+@code{starpu_machine_display} tool, for instance:
+
+@example
+StarPU has found :
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+@end example
+
 @node StarPU-Top
 @subsection StarPU-Top interface
 
@@ -2057,7 +2079,7 @@ This will create an @code{activity.data} file in the current
 directory. A profile of the application showing the activity of StarPU
 during the execution of the program can be generated:
 @example
-$ starpu_top.sh activity.data
+$ starpu_top activity.data
 @end example
 
 This will create a file named @code{activity.eps} in the current directory.
@@ -2558,6 +2580,7 @@ Enable flags for the @code{gcov} coverage tool.
 * --with-cuda-dir::             
 * --with-cuda-include-dir::             
 * --with-cuda-lib-dir::             
+* --disable-cuda-memcpy-peer::
 * --enable-maxopencldev::       
 * --disable-opencl::            
 * --with-opencl-dir::           
@@ -2624,6 +2647,13 @@ notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
 
 @end table
 
+@node --disable-cuda-memcpy-peer
+@subsubsection @code{--disable-cuda-memcpy-peer}
+@table @asis
+@item @emph{Description}
+Explicitely disables peer transfers when using CUDA 4.0
+@end table
+
 @node --enable-maxopencldev
 @subsubsection @code{--enable-maxopencldev=<number>}
 @table @asis
@@ -4893,6 +4923,13 @@ static struct starpu_sched_policy_s dummy_sched_policy = @{
 
 
 @c ---------------------------------------------------------------------
+@c C Extensions
+@c ---------------------------------------------------------------------
+
+@include c-extensions.texi
+
+
+@c ---------------------------------------------------------------------
 @c Appendices
 @c ---------------------------------------------------------------------
 

+ 2 - 0
examples/Makefile.am

@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  Télécom-SudParis
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -152,6 +153,7 @@ examplebin_PROGRAMS +=				\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
+	basic_examples/mult_impl                \
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\

+ 2 - 2
examples/basic_examples/hello_world.c

@@ -52,12 +52,12 @@ struct params {
 };
 void cpu_func(void *buffers[], void *cl_arg)
 {
-	struct params *params = cl_arg;
+	struct params *params = (struct params *) cl_arg;
 
 	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 int main(int argc, char **argv)
 {

+ 3 - 3
examples/basic_examples/mult.c

@@ -127,9 +127,9 @@ static void init_problem_data(void)
 
 	/* we initialize matrices A, B and C in the usual way */
 
-	A = malloc(zdim*ydim*sizeof(float));
-	B = malloc(xdim*zdim*sizeof(float));
-	C = malloc(xdim*ydim*sizeof(float));
+	A = (float *) malloc(zdim*ydim*sizeof(float));
+	B = (float *) malloc(xdim*zdim*sizeof(float));
+	C = (float *) malloc(xdim*ydim*sizeof(float));
 
 	/* fill the A and B matrices */
 	srand(2009);

+ 384 - 0
examples/basic_examples/mult_impl.c

@@ -0,0 +1,384 @@
+/*/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+static float *A, *B, *C;
+static starpu_data_handle A_handle, B_handle, C_handle;
+
+static unsigned nslicesx = 4;
+static unsigned nslicesy = 4;
+static unsigned xdim = 1024;
+static unsigned ydim = 1024;
+static unsigned zdim = 512;
+
+
+double mult_gemm_cost(starpu_buffer_descr *descr)
+{
+	/* C = A * B */
+	uint32_t nxC, nyC, nxA;
+
+
+	nxC = starpu_matrix_get_nx(descr[2].handle);
+	nyC = starpu_matrix_get_ny(descr[2].handle);
+	nxA = starpu_matrix_get_nx(descr[0].handle);
+
+	//printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
+
+	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
+
+	printf("cost %e \n", cost);
+
+	return cost;
+}
+
+static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
+{
+	float *subA, *subB, *subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	printf("On application: Hello, this is kernel cpu_mult\n\n");
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* we assume a FORTRAN-ordering! */
+	unsigned i,j,k;
+	for (i = 0; i < nyC; i++)
+	{
+		for (j = 0; j < nxC; j++)
+		{
+			float sum = 0.0;
+
+			for (k = 0; k < nyA; k++)
+			{
+				sum += subA[j+k*ldA]*subB[k+i*ldB];
+			}
+
+			subC[j + i*ldC] = sum;
+		}
+	}
+}
+
+static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
+{
+	float *subA, *subB, *subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	printf("On application: this is kernel cpu_mult_2\n\n");
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* we assume a FORTRAN-ordering! */
+	unsigned i,j,k;
+	for (j = 0; j < nxC; j++)
+	{
+		for (i = 0; i < nyC; i++)
+		{
+			float sum = 0.0;
+
+			for (k = 0; k < nyA; k++)
+			{
+				sum += subA[j+k*ldA]*subB[k+i*ldB];
+			}
+
+			subC[j + i*ldC] = sum;
+		}
+	}
+}
+
+
+
+static void init_problem_data(void)
+{
+	unsigned i,j;
+
+	/* we initialize matrices A, B and C in the usual way */
+
+	A = malloc(zdim*ydim*sizeof(float));
+	B = malloc(xdim*zdim*sizeof(float));
+	C = malloc(xdim*ydim*sizeof(float));
+
+	/* fill the A and B matrices */
+	srand(2009);
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < zdim; i++) {
+			A[j+i*ydim] = (float)(starpu_drand48());
+		}
+	}
+
+	for (j=0; j < zdim; j++) {
+		for (i=0; i < xdim; i++) {
+			B[j+i*zdim] = (float)(starpu_drand48());
+		}
+	}
+
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < xdim; i++) {
+			C[j+i*ydim] = (float)(0);
+		}
+	}
+}
+
+static void partition_mult_data(void)
+{
+	/* note that we assume a FORTRAN ordering here! */
+
+	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
+		ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
+		zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
+		ydim, ydim, xdim, sizeof(float));
+
+	/* A filter is a method to partition a data into disjoint chunks, it is
+	 * described by the means of the "struct starpu_data_filter" structure that
+	 * contains a function that is applied on a data handle to partition it
+	 * into smaller chunks, and an argument that is passed to the function
+	 * (eg. the number of blocks to create here).
+	 */
+
+	struct starpu_data_filter vert = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nslicesx,
+		.get_nchildren = NULL,
+		.get_child_ops = NULL
+	};
+
+	struct starpu_data_filter horiz = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nslicesy,
+		.get_nchildren = NULL,
+		.get_child_ops = NULL
+	};
+
+/*
+ *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
+ *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
+ *	the number of filters to apply, and the indexes for each filters, for
+ *	instance:
+ *
+ *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
+ *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
+ *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
+ *
+ *	Note that here we applied 2 filters recursively onto C.
+ *
+ *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
+ *	of blocked matrix C for example.
+ *
+ *		              |---|---|---|---|
+ *		              |   |   | B'|   | B
+ *		              |---|---|---|---|
+ *		                0   1   2   3
+ *		     |----|   |---|---|---|---|
+ *		     |    |   |   |   |   |   |
+ *		     |    | 0 |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		     | A' |   |   |   | C'|   |
+ *		     |    |   |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		       A              C
+ *
+ *	IMPORTANT: applying filters is equivalent to partitionning a piece of
+ *	data in a hierarchical manner, so that memory consistency is enforced
+ *	for each of the elements independantly. The tasks should therefore NOT
+ *	access inner nodes (eg. one column of C or the whole C) but only the
+ *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
+ *	possible by disapplying the filters (using starpu_data_unpartition), to
+ *	enforce memory consistency.
+ */
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	/* starpu_data_map_filters is a variable-arity function, the first argument
+	 * is the handle of the data to partition, the second argument is the
+	 * number of filters to apply recursively. Filters are applied in the
+	 * same order as the arguments.
+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
+	 * then applying horiz on each sub-data (ie. each column of C)
+	 */
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+static struct starpu_perfmodel_t starpu_dgemm_model_common = {
+	.cost_model = mult_gemm_cost,
+	.type = STARPU_HISTORY_BASED,//STARPU_COMMON, //STARPU_PER_ARCH,
+	.symbol = "mult_perf_model"
+};
+
+/*
+static struct starpu_perfmodel_t mult_perf_model = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "mult_perf_model"
+};
+*/
+
+struct starpu_conf conf = {
+		.sched_policy_name = "heft",
+		.calibrate = 1,
+		.ncpus = 4
+};
+
+
+static starpu_codelet cl = {
+        /* we can only execute that kernel on a CPU yet */
+        .where = STARPU_CPU,
+        //.starpu_impl_multiple = 1,
+        /* CPU implementation of the codelet */
+        .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
+        .cpu_funcs = {cpu_mult,cpu_mult_2},
+        /* the codelet manipulates 3 buffers that are managed by the
+         * DSM */
+        .nbuffers = 3,
+        /* in case the scheduling policy may use performance models */
+        .model = &starpu_dgemm_model_common
+};
+
+static void launch_tasks(void)
+{
+	/* partition the work into slices */
+	unsigned taskx, tasky;
+
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
+			/* C[taskx, tasky] = A[tasky] B[taskx] */
+
+			/* by default, starpu_task_create() returns an
+ 			 * asynchronous task (ie. task->synchronous = 0) */
+			struct starpu_task *task = starpu_task_create();
+
+			/* this task implements codelet "cl" */
+			task->cl = &cl;
+
+			/*
+			 *              |---|---|---|---|
+			 *              |   | * |   |   | B
+			 *              |---|---|---|---|
+			 *                    X
+			 *     |----|   |---|---|---|---|
+			 *     |****| Y |   |***|   |   |
+			 *     |****|   |   |***|   |   |
+			 *     |----|   |---|---|---|---|
+			 *     |    |   |   |   |   |   |
+			 *     |    |   |   |   |   |   |
+			 *     |----|   |---|---|---|---|
+			 *       A              C
+			 */
+
+			/* there was a single filter applied to matrices A
+			 * (respectively B) so we grab the handle to the chunk
+			 * identified by "tasky" (respectively "taskx). The "1"
+			 * tells StarPU that there is a single argument to the
+			 * variable-arity function starpu_data_get_sub_data */
+			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
+			task->buffers[0].mode = STARPU_R;
+			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
+			task->buffers[1].mode = STARPU_R;
+
+			/* 2 filters were applied on matrix C, so we give
+			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
+			 * must match the order in which the filters were
+			 * applied.
+			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
+			 * a handle to the column number k of matrix C.
+			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
+			 * equivalent to
+			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
+			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
+			task->buffers[2].mode = STARPU_W;
+
+			/* this is not a blocking call since task->synchronous = 0 */
+			int summit_task;
+			summit_task = starpu_task_submit(task);
+			printf("task is submmited or not %d\n",summit_task);
+
+		}
+	}
+}
+
+int main(void)
+{
+	/* start the runtime */
+	starpu_init(&conf);
+
+	/* initialize matrices A, B and C and register them to StarPU */
+	init_problem_data();
+
+	/* partition matrices into blocks that can be manipulated by the
+ 	 * codelets */
+	partition_mult_data();
+
+	/* submit all tasks in an asynchronous fashion */
+	launch_tasks();
+
+	/* wait for termination */
+	starpu_task_wait_for_all();
+
+	/* remove the filters applied by the means of starpu_data_map_filters; now
+ 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
+	 * starpu_data_map_filters is called again on C_handle.
+	 * The second argument is the memory node where the different subsets
+	 * should be reassembled, 0 = main memory (RAM) */
+	starpu_data_unpartition(C_handle, 0);
+
+	/* stop monitoring matrix C : after this, it is not possible to pass C
+	 * (or any subset of C) as a codelet input/output. This also implements
+	 * a barrier so that the piece of data is put back into main memory in
+	 * case it was only available on a GPU for instance. */
+	starpu_data_unregister(C_handle);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 2 - 2
examples/basic_examples/vector_scal_cpu.c

@@ -24,7 +24,7 @@
 void scal_cpu_func(void *buffers[], void *cl_arg)
 {
 	unsigned i;
-	float *factor = cl_arg;
+	float *factor = (float *) cl_arg;
 
 	/*
 	 * The "buffers" array matches the task->buffers array: for instance
@@ -37,7 +37,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 	 * migrated/replicated), and elemsize gives the size of each elements.
 	 */
 
-	starpu_vector_interface_t *vector = buffers[0];
+	starpu_vector_interface_t *vector = (starpu_vector_interface_t *) buffers[0];
 
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(vector);

+ 10 - 9
examples/cholesky/cholesky_models.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -125,27 +126,27 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 }
 
 struct starpu_perfmodel_t chol_model_11 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_11"
 };
 
 struct starpu_perfmodel_t chol_model_21 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_21"
 };
 
 struct starpu_perfmodel_t chol_model_22 = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_22"

+ 1 - 1
examples/filters/fblock_cpu.c

@@ -19,7 +19,7 @@
 void cpu_func(void *buffers[], void *cl_arg)
 {
         unsigned i, j, k;
-        int *factor = cl_arg;
+        int *factor = (int *) cl_arg;
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);

+ 1 - 1
examples/filters/fmatrix.c

@@ -25,7 +25,7 @@
 void cpu_func(void *buffers[], void *cl_arg)
 {
         unsigned i, j;
-        int *factor = cl_arg;
+        int *factor = (int *) cl_arg;
 
         /* length of the matrix */
         unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);

+ 1 - 1
examples/filters/fvector.c

@@ -24,7 +24,7 @@
 void cpu_func(void *buffers[], void *cl_arg)
 {
         unsigned i;
-        int *factor = cl_arg;
+        int *factor = (int *) cl_arg;
 
         /* length of the vector */
         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);

+ 13 - 12
examples/heat/lu_kernels_model.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -215,9 +216,9 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
 struct starpu_perfmodel_t model_11 = {
 	.cost_model = task_11_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_11_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_11_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_11_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
@@ -231,9 +232,9 @@ struct starpu_perfmodel_t model_11 = {
 
 struct starpu_perfmodel_t model_12 = {
 	.cost_model = task_12_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_12_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_12_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_12_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
@@ -247,9 +248,9 @@ struct starpu_perfmodel_t model_12 = {
 
 struct starpu_perfmodel_t model_21 = {
 	.cost_model = task_21_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_21_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_21_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_21_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
@@ -263,9 +264,9 @@ struct starpu_perfmodel_t model_21 = {
 
 struct starpu_perfmodel_t model_22 = {
 	.cost_model = task_22_cost,
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = task_22_cost_cpu },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
+	.per_arch = {
+		[STARPU_CPU_DEFAULT][0] = { .cost_model = task_22_cost_cpu },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_model = task_22_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS

+ 1 - 1
examples/mandelbrot/mandelbrot.c

@@ -333,7 +333,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
 	while (1)
 	{
-		local_iy = STARPU_ATOMIC_ADD(pcnt, 1) - 1;
+		local_iy = STARPU_ATOMIC_ADD((unsigned int *)pcnt, 1) - 1;
 		if (local_iy >= block_size)
 			break;
 

+ 6 - 6
examples/ppm_downscaler/ppm_downscaler.c

@@ -28,7 +28,7 @@
 
 struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 {
-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
 	assert(ppm);
 
 	ppm->ncols = ncols;
@@ -36,9 +36,9 @@ struct ppm_image *allocate_new_ppm(int ncols, int nlines, int coldepth)
 	ppm->coldepth = coldepth;
 
 #ifdef STARPU_HAVE_MEMALIGN
-	ppm->data = memalign(16384, ncols*nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) memalign(16384, ncols*nlines*sizeof(struct ppm_color));
 #else
-	ppm->data = malloc(ncols*nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) malloc(ncols*nlines*sizeof(struct ppm_color));
 #endif
 	assert(ppm->data);
 
@@ -49,7 +49,7 @@ struct ppm_image *file_to_ppm(char *filename)
 {
 	int ret;
 
-	struct ppm_image *ppm = malloc(sizeof(struct ppm_image));
+	struct ppm_image *ppm = (struct ppm_image *) malloc(sizeof(struct ppm_image));
 	assert(ppm);
 	
 	FILE *file = fopen(filename, "r");
@@ -64,9 +64,9 @@ struct ppm_image *file_to_ppm(char *filename)
 	
 	/* allocate a buffer for the image */
 #ifdef STARPU_HAVE_MEMALIGN
-	ppm->data = memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) memalign(16384, ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
 #else
-	ppm->data = malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
+	ppm->data = (struct ppm_color *) malloc(ppm->ncols*ppm->nlines*sizeof(struct ppm_color));
 #endif
 	assert(ppm->data);
 

+ 8 - 8
examples/ppm_downscaler/yuv_downscaler.c

@@ -123,7 +123,7 @@ int main(int argc, char **argv)
 	FILE *f_in = fopen(filename_in, "r");
 	assert(f_in);
 
-	struct yuv_frame *yuv_in_buffer = malloc(nframes*FRAMESIZE);
+	struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE);
 	fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
 
 	/* allocate room for an output buffer */
@@ -131,16 +131,16 @@ int main(int argc, char **argv)
 	assert(f_out);
 
 /*	fprintf(stderr, "Alloc output file ...\n"); */
-	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
+	struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE);
 	assert(yuv_out_buffer);
 
-	starpu_data_handle *frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
 
-	starpu_data_handle *new_frame_y_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *new_frame_u_handle = calloc(nframes, sizeof(starpu_data_handle));
-	starpu_data_handle *new_frame_v_handle = calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *new_frame_y_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *new_frame_u_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
+	starpu_data_handle *new_frame_v_handle = (starpu_data_handle *)  calloc(nframes, sizeof(starpu_data_handle));
 
 	starpu_init(NULL);
 

+ 1 - 1
examples/profiling/profiling.c

@@ -53,7 +53,7 @@ int main(int argc, char **argv)
 		.nbuffers = 0
 	};
 
-	struct starpu_task **tasks = malloc(niter*sizeof(struct starpu_task *));
+	struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
 	assert(tasks);
 
 	unsigned i;

+ 4 - 4
examples/reductions/dot_product.c

@@ -164,11 +164,11 @@ int main(int argc, char **argv)
 	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 
-	x = malloc(size);
-	y = malloc(size);
+	x = (float *) malloc(size);
+	y = (float *) malloc(size);
 
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
-	y_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
+	y_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
 
 	assert(x && y);
 

+ 2 - 2
examples/reductions/minmax_reduction.c

@@ -129,8 +129,8 @@ int main(int argc, char **argv)
 	unsigned long nelems = nblocks*entries_per_bock;
 	size_t size = nelems*sizeof(TYPE);
 
-	x = malloc(size);
-	x_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	x = (TYPE *) malloc(size);
+	x_handles = (starpu_data_handle *) calloc(nblocks, sizeof(starpu_data_handle));
 	
 	assert(x && x_handles);
 

+ 1 - 1
examples/socl/mandelbrot/mandelbrot.c

@@ -21,7 +21,7 @@
 #include <unistd.h>
 
 /* Uncomment this to activate X11 display */
-#define USE_X11
+//#define USE_X11
 
 #define SHORT_LOG 1
 #define ROUND_ROBIN

+ 2 - 2
examples/spmv/spmv.c

@@ -44,8 +44,8 @@ static void parse_args(int argc, char **argv)
  * same number of non-zero entries. */
 static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 {
-	starpu_csr_interface_t *csr_father = father_interface;
-	starpu_csr_interface_t *csr_child = child_interface;
+	starpu_csr_interface_t *csr_father = (starpu_csr_interface_t *) father_interface;
+	starpu_csr_interface_t *csr_child = (starpu_csr_interface_t *) child_interface;
 
 	uint32_t nrow = csr_father->nrow;
 	size_t elemsize = csr_father->elemsize;

+ 1 - 0
examples/starpufft/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 1 - 0
examples/stencil/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 2 - 2
examples/stencil/stencil-blocks.c

@@ -90,7 +90,7 @@ int MPI_TAG1(int z, int iter, int dir)
 /* Compute the size of the different blocks */
 static void compute_block_sizes(void)
 {
-	block_sizes_z = malloc(nbz*sizeof(unsigned));
+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
 	STARPU_ASSERT(block_sizes_z);
 
 	/* Perhaps the last chunk is smaller */
@@ -136,7 +136,7 @@ void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsi
 	sizez = _sizez;
 
 	/* Create a grid of block descriptors */
-	blocks = calloc(nbz, sizeof(struct block_description));
+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
 	STARPU_ASSERT(blocks);
 
 	/* What is the size of the different blocks ? */

+ 33 - 25
examples/stencil/stencil-kernels.c

@@ -165,10 +165,12 @@ static void check_load(starpu_block_interface_t *block, starpu_block_interface_t
 /*
  * Load a neighbour's boundary into block, CPU version
  */
-static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -184,10 +186,12 @@ static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
  * Load a neighbour's boundary into block, CUDA version
  */
 #ifdef STARPU_USE_CUDA
-static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -245,16 +249,16 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 	for (i=1; i<=K; i++)
 	{
 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 		/* Shadow data */
 		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 	}
 
@@ -338,16 +342,16 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 	for (i=1; i<=K; i++)
 	{
 		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
 
 		/* Shadow data */
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		opencl_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-                clEnqueueCopyBuffer(cq, old, new, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), 0, NULL, NULL);
+                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, NULL);
 #endif /* LIFE */
 	}
 
@@ -365,7 +369,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
  */
 static void update_func_cpu(void *descr[], void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
@@ -405,8 +409,8 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 	for (i=1; i<=K; i++)
 	{
-		starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;
+		starpu_block_interface_t *oldb = (starpu_block_interface_t *) descr[i%2], *newb = (starpu_block_interface_t *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
 
 		/* Shadow data */
 		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
@@ -424,9 +428,9 @@ fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 
 		/* And perform actual computation */
 #ifdef LIFE
-		life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
 #endif /* LIFE */
 	}
 
@@ -465,10 +469,12 @@ starpu_codelet cl_update = {
  */
 
 /* CPU version */
-static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -482,10 +488,12 @@ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
 
 /* CUDA version */
 #ifdef STARPU_USE_CUDA
-static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
-					starpu_block_interface_t *boundary,
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
 					unsigned firstz)
 {
+	starpu_block_interface_t *block = (starpu_block_interface_t *)_block;
+	starpu_block_interface_t *boundary = (starpu_block_interface_t *)_boundary;
 	check_load(block, boundary);
 
 	/* We do a contiguous memory transfer */
@@ -527,7 +535,7 @@ unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 /* top save, CPU version */
 static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -543,7 +551,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 /* bottom save, CPU version */
 static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -557,7 +565,7 @@ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *a
 #ifdef STARPU_USE_CUDA
 static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -574,7 +582,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 /* bottom save, CUDA version */
 static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 
@@ -590,7 +598,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 #ifdef STARPU_USE_OPENCL
 static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	top_per_worker[workerid]++;
 
@@ -610,7 +618,7 @@ static void dummy_func_top_opencl(void *descr[] __attribute__((unused)), void *a
 /* bottom save, OPENCL version */
 static void dummy_func_bottom_opencl(void *descr[] __attribute__((unused)), void *arg)
 {
-	struct block_description *block = arg;
+	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	bottom_per_worker[workerid]++;
 

+ 3 - 3
examples/stencil/stencil.c

@@ -132,9 +132,9 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
 	display_memory_consumption(rank);
 
 	who_runs_what_len = 2*niter;
-	who_runs_what = calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
-	who_runs_what_index = calloc(nbz, sizeof(*who_runs_what_index));
-	last_tick = calloc(nbz, sizeof(*last_tick));
+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
+	last_tick = (struct timeval *) calloc(nbz, sizeof(*last_tick));
 }
 
 /*

+ 1 - 1
examples/tag_example/tag_example.c

@@ -30,7 +30,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, j, iter)	((starpu_tag_t) ( ((uint64_t)(iter)<<48) |  ((uint64_t)(j)<<24) | (i)) )
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 #define Ni	64
 #define Nj	32

+ 1 - 1
examples/tag_example/tag_example2.c

@@ -30,7 +30,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 #define Ni	64
 #define Nk	256

+ 1 - 1
examples/tag_example/tag_example3.c

@@ -30,7 +30,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)iter)<<32 | (i)) )
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 #define Ni	64
 #define Nk	256

+ 2 - 2
examples/tag_example/tag_restartable.c

@@ -35,7 +35,7 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )
 
-starpu_codelet cl;
+starpu_codelet cl = {};
 
 #define Ni	64
 #define Nk	256
@@ -134,7 +134,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	FPRINTF(stderr, "ITER : %u\n", nk);
 
 	for (i = 0; i < Nrolls; i++) {
-		tasks[i] = malloc(ni * sizeof(*tasks[i]));
+		tasks[i] = (struct starpu_task **) malloc(ni * sizeof(*tasks[i]));
 
 		create_task_grid(i);
 	}

+ 3 - 3
examples/top/hello_world_top.c

@@ -62,7 +62,7 @@ struct params {
 };
 void cpu_func(void *buffers[], void *cl_arg)
 {
-	struct params *params = cl_arg;
+	struct params *params = (struct params *) cl_arg;
 
 	//loosing time for top example...
 	int sum = 0;
@@ -82,14 +82,14 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 void callback_name_changed(starputop_param* param)
 {
-	char* message = malloc(256);
+	char* message = (char *) malloc(256);
 	sprintf(message, "Name have been changed to %s", names[name_selected]);
 	starputop_debug_log(message);
 }
 
 void callback_number_addition_changed(starputop_param* param)
 {
-	char* message = malloc(256);
+	char* message = (char *) malloc(256);
 	sprintf(message, "Number of addition is now %d", number_of_addition);
 
 	starputop_debug_log(message);

+ 2 - 0
gcc-plugin/README

@@ -8,3 +8,5 @@ tasks.
 Plug-ins are supported starting from GCC 4.5.
 
 To run the test suite, GNU Guile 1.8.x or 2.0.x is needed.
+
+When building from SVN, GNU Bison 2.5+ is required.

+ 5 - 0
gcc-plugin/examples/Makefile.am

@@ -17,4 +17,9 @@ noinst_PROGRAMS =				\
   matrix-mult
 
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
+
+AM_CPPFLAGS =						\
+  -I$(top_srcdir)/include				\
+  $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
+
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall

+ 8 - 36
gcc-plugin/examples/matrix-mult.c

@@ -187,34 +187,15 @@ main (int argc, char **argv)
   gettimeofday (&start_register, NULL);
   for (i = 0; i < nslicesy; i++)
     for (j = 0; j < nslicesz; j++)
-      {
-	/* TODO: Get rid of the `ptr' and `size' variables when the pragma
-	   parser supports arbitrary C expressions.  */
-
-	typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
-	size_t size = (bzdim * bydim);
-#pragma starpu register ptr size
-      }
+#pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)
 
   for (i = 0; i < nslicesz; i++)
-    {
-      for (j = 0; j < nslicesx; j++)
-	{
-	  typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
-	  size_t size = (bxdim * bzdim);
-#pragma starpu register ptr size
-	}
-    }
+    for (j = 0; j < nslicesx; j++)
+#pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)
 
   for (i = 0; i < nslicesy; i++)
-    {
-      for (j = 0; j < nslicesx; j++)
-	{
-	  typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
-	  size_t size = (bxdim * bydim);
-#pragma starpu register ptr size
-	}
-    }
+    for (j = 0; j < nslicesx; j++)
+#pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)
 
 
   gettimeofday (&end_register, NULL);
@@ -256,25 +237,16 @@ main (int argc, char **argv)
   gettimeofday (&start_unregister, NULL);
   for (i = 0; i < nslicesy; i++)
     for (j = 0; j < nslicesz; j++)
-      {
-	typeof (A) ptr =  &A[i*zdim*bydim + j*bzdim*bydim];
-#pragma starpu unregister ptr
-      }
+#pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]
 
 
   for (i = 0; i < nslicesz; i++)
     for (j = 0; j < nslicesx; j++)
-      {
-	typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
-#pragma starpu unregister ptr
-      }
+#pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]
 
   for (i = 0; i < nslicesy; i++)
     for (j = 0; j < nslicesx; j++)
-      {
-	typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
-#pragma starpu unregister ptr
-      }
+#pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]
 
   gettimeofday (&end_unregister, NULL);
   gettimeofday (&end_all, NULL);

+ 5 - 2
gcc-plugin/src/Makefile.am

@@ -17,7 +17,10 @@
 # requires a name prefixed by `lib'.
 pkglib_LTLIBRARIES = starpu.la
 
-starpu_la_SOURCES = starpu.c
+starpu_la_SOURCES = starpu.c c-expr.y
+
+AM_CPPFLAGS =						\
+  -I$(top_srcdir)/include				\
+  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1
 
-AM_CPPFLAGS = -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall
 AM_LDFLAGS = -module

+ 234 - 0
gcc-plugin/src/c-expr.y

@@ -0,0 +1,234 @@
+/* GCC-StarPU
+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+
+   GCC-StarPU is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   GCC-StarPU is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Parser for simple C expressions in pragmas.  */
+
+%define api.pure
+%parse-param { location_t loc }
+%parse-param { const char *pragma }
+%parse-param { tree *seq }
+%debug
+
+%{
+  #include <starpu-gcc-config.h>
+
+  #include <gcc-plugin.h>
+  #include <plugin.h>
+  #include <tree.h>
+  #include <cpplib.h>
+
+  #ifdef HAVE_C_FAMILY_C_COMMON_H
+  # include <c-family/c-common.h>
+  #elif HAVE_C_COMMON_H
+  # include <c-common.h>
+  #endif
+
+  #ifdef HAVE_C_FAMILY_C_PRAGMA_H
+  # include <c-family/c-pragma.h>
+  #elif HAVE_C_PRAGMA_H
+  # include <c-pragma.h>
+  #endif
+
+  #if !HAVE_DECL_BUILD_ARRAY_REF
+  /* This declaration is missing in GCC 4.6.1.  */
+  extern tree build_array_ref (location_t loc, tree array, tree index);
+  #endif
+
+
+  #define YYSTYPE tree
+  #define YYLTYPE location_t
+
+  static void
+  yyerror (location_t loc, const char *pragma, tree *seq,
+	   char const *message)
+  {
+    error_at (loc, "parse error in pragma %qs: %s", pragma, message);
+  }
+
+  /* Return SOMETHING if it's a VAR_DECL, an identifier bound to a VAR_DECL,
+     or another object; raise an error otherwise.  */
+
+  static tree
+  ensure_bound (location_t loc, tree something)
+  {
+    gcc_assert (something != NULL_TREE);
+
+    if (DECL_P (something))
+      return something;
+    else if (TREE_CODE (something) == IDENTIFIER_NODE)
+      {
+	tree var = lookup_name (something);
+	if (var == NULL_TREE)
+	  {
+	    error_at (loc, "unbound variable %qE", something);
+	    return error_mark_node;
+	  }
+	else
+	  return var;
+      }
+
+    return something;
+  }
+
+  static tree
+  build_component_ref (location_t loc, tree what, tree field)
+  {
+    sorry ("struct field access not implemented yet"); /* XXX */
+    return error_mark_node;
+  }
+%}
+
+%code {
+  /* Mapping of libcpp token names to Bison-generated token names.  This is
+     not ideal but Bison cannot be told to use the `enum cpp_ttype'
+     values.  */
+  static const int cpplib_bison_token_map[] =
+    {
+      [CPP_NAME] = YCPP_NAME,
+      [CPP_NUMBER] = YCPP_NUM,
+      [CPP_AND] = YCPP_AND,
+      [CPP_OPEN_SQUARE] = YCPP_OPEN_SQUARE,
+      [CPP_CLOSE_SQUARE] = YCPP_CLOSE_SQUARE,
+      [CPP_OPEN_PAREN] = YCPP_OPEN_PAREN,
+      [CPP_CLOSE_PAREN] = YCPP_CLOSE_PAREN,
+      [CPP_PLUS] = YCPP_PLUS,
+      [CPP_MINUS] = YCPP_MINUS,
+      [CPP_MULT] = YCPP_MULT,
+      [CPP_DIV] = YCPP_DIV,
+      [CPP_DOT] = YCPP_DOT,
+      [CPP_DEREF] = YCPP_DEREF
+    };
+
+  static int
+  yylex (YYSTYPE *lvalp)
+  {
+    int ret;
+
+    ret = pragma_lex (lvalp);
+    if (ret < sizeof cpplib_bison_token_map / sizeof cpplib_bison_token_map[0])
+      ret = cpplib_bison_token_map[ret];
+    else
+      ret = -1;
+
+    return ret;
+  }
+}
+
+%token YCPP_NAME "identifier"
+%token YCPP_NUM "integer"
+%token YCPP_AND "&"
+%token YCPP_OPEN_SQUARE "["
+%token YCPP_CLOSE_SQUARE "]"
+%token YCPP_OPEN_PAREN "("
+%token YCPP_CLOSE_PAREN ")"
+%token YCPP_PLUS "+"
+%token YCPP_MINUS "-"
+%token YCPP_MULT "*"
+%token YCPP_DIV "/"
+%token YCPP_DOT "."
+%token YCPP_DEREF "->"
+
+%% /* Grammar rules.  */
+
+ /* Always return a TREE_LIST rather than a raw chain, because the elements
+    of that list may be already chained for other purposes---e.g., PARM_DECLs
+    of a function are chained together.  */
+
+sequence: expression {
+          gcc_assert (*seq == NULL_TREE);
+	  *seq = tree_cons (NULL_TREE, $1, NULL_TREE);
+	  $$ = *seq;
+      }
+      | expression sequence {
+	  gcc_assert ($2 == *seq);
+	  *seq = tree_cons (NULL_TREE, $1, $2);
+	  $$ = *seq;
+      }
+;
+
+expression: identifier | binary_expression | unary_expression;
+
+/* XXX: `ensure_bound' below leads to errors raised even for non-significant
+   arguments---e.g., junk after pragma.  */
+identifier: YCPP_NAME  { $$ = ensure_bound (loc, $1); }
+;
+
+binary_expression: multiplicative_expression
+     | additive_expression
+;
+
+multiplicative_expression: multiplicative_expression YCPP_MULT cast_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, MULT_EXPR, $1, $3, 0);
+     }
+     | multiplicative_expression YCPP_DIV cast_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, TRUNC_DIV_EXPR, $1, $3, 0);
+     }
+     | cast_expression
+;
+
+additive_expression: multiplicative_expression
+     | additive_expression YCPP_PLUS multiplicative_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, PLUS_EXPR, $1, $3, 0);
+     }
+     | additive_expression YCPP_MINUS multiplicative_expression {
+       $$ = build_binary_op (UNKNOWN_LOCATION, MINUS_EXPR, $1, $3, 0);
+     }
+;
+
+cast_expression: unary_expression
+		 /* XXX: No support for '(' TYPE-NAME ')' UNARY-EXPRESSION.  */
+;
+
+unary_expression:
+       primary_expression
+     | postfix_expression
+     | YCPP_AND cast_expression {
+       $$ = build_addr (ensure_bound (loc, $2), current_function_decl);
+     }
+;
+
+postfix_expression:
+       primary_expression
+     | postfix_expression YCPP_OPEN_SQUARE expression YCPP_CLOSE_SQUARE {
+#if 1
+	 /* Build the array ref with proper error checking.  */
+	 $$ = build_array_ref (loc, ensure_bound (loc, $1),
+			       ensure_bound (loc, $3));
+#else /* TIMTOWTDI */
+	 $$ = build_indirect_ref (loc,
+	       build_binary_op (loc, PLUS_EXPR, ensure_bound (loc, $1), ensure_bound (loc, $3), 0),
+		RO_ARRAY_INDEXING);
+#endif
+     }
+     | postfix_expression YCPP_DOT identifier {
+        $$ = build_component_ref (loc, ensure_bound (loc, $1), $2);
+     }
+     | postfix_expression YCPP_DEREF identifier {
+        $$ = build_component_ref (loc,
+               build_indirect_ref (loc, ensure_bound (loc, $1), RO_ARRAY_INDEXING),
+               $2);
+     }
+;
+
+primary_expression: identifier
+     | constant
+     | YCPP_OPEN_PAREN expression YCPP_CLOSE_PAREN { $$ = $2; }
+;
+
+constant: YCPP_NUM { $$ = $1; }
+;
+
+%%

+ 8 - 0
gcc-plugin/src/starpu-gcc-config.h.in

@@ -21,3 +21,11 @@
 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_ARRAY
 
 #undef HAVE_DECL_BUILD_CALL_EXPR_LOC_VEC
+
+#undef HAVE_DECL_BUILD_ARRAY_REF
+
+#undef HAVE_C_FAMILY_C_COMMON_H
+#undef HAVE_C_COMMON_H
+
+#undef HAVE_C_FAMILY_C_PRAGMA_H
+#undef HAVE_C_PRAGMA_H

+ 148 - 96
gcc-plugin/src/starpu.c

@@ -30,8 +30,19 @@ int plugin_is_GPL_compatible;
 #include <cpplib.h>
 #include <tree.h>
 #include <tree-iterator.h>
-#include <c-common.h>
-#include <c-pragma.h>
+
+#ifdef HAVE_C_FAMILY_C_COMMON_H
+# include <c-family/c-common.h>
+#elif HAVE_C_COMMON_H
+# include <c-common.h>
+#endif
+
+#ifdef HAVE_C_FAMILY_C_PRAGMA_H
+# include <c-family/c-pragma.h>
+#elif HAVE_C_PRAGMA_H
+# include <c-pragma.h>
+#endif
+
 #include <tm.h>
 #include <gimple.h>
 #include <tree-pass.h>
@@ -75,6 +86,9 @@ static tree build_codelet_declaration (tree task_decl);
 static tree build_task_body (const_tree task_decl);
 static tree build_pointer_lookup (tree pointer);
 
+static bool task_p (const_tree decl);
+static bool task_implementation_p (const_tree decl);
+
 
 /* Lookup the StarPU function NAME in the global scope and store the result
    in VAR (this can't be done from `lower_starpu'.)  */
@@ -119,7 +133,9 @@ build_call_expr_loc_vec (location_t loc, tree fndecl, VEC(tree,gc) *vec)
 
 
 /* Build a reference to the INDEXth element of ARRAY.  `build_array_ref' is
-   not exported, so we roll our own.  */
+   not exported, so we roll our own.
+   FIXME: This version may not work for array types and doesn't do as much
+   type-checking as `build_array_ref'.  */
 
 static tree
 array_ref (tree array, size_t index)
@@ -347,48 +363,45 @@ handle_pragma_shutdown (struct cpp_reader *reader)
 static void
 handle_pragma_wait (struct cpp_reader *reader)
 {
-  tree fndecl;
+  if (task_implementation_p (current_function_decl))
+    {
+      location_t loc;
 
-  fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
-  gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
+      loc = cpp_peek_token (reader, 0)->src_loc;
+
+      /* TODO: In the future we could generate a task for the continuation
+	 and have it depend on what's before here.  */
+      error_at (loc, "task implementation is not allowed to wait");
+    }
+  else
+    {
+      tree fndecl;
+
+      fndecl = lookup_name (get_identifier ("starpu_task_wait_for_all"));
+      gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
 
-  add_stmt (build_call_expr (fndecl, 0));
+      add_stmt (build_call_expr (fndecl, 0));
+    }
 }
 
-/* Parse a pointer variable for PRAGMA, raising the appropriate error if
-   needed.  Return the pointer variable on success, NULL_TREE otherwise.  */
+/* The minimal C expression parser.  */
+
+extern int yyparse (location_t, const char *, tree *);
+extern int yydebug;
+
+/* Parse expressions from the CPP reader for PRAGMA, which is located at LOC.
+   Return a TREE_LIST of C expressions.  */
 
 static tree
-read_pragma_pointer_variable (const char *pragma, location_t loc)
+read_pragma_expressions (const char *pragma, location_t loc)
 {
-  tree token, var = NULL_TREE;
-  enum cpp_ttype type;
-
-  type = pragma_lex (&token);
-  if (type == CPP_EOF)
-    error_at (loc, "unterminated %<starpu %s%> pragma", pragma);
-  else if (type != CPP_NAME)
-    error_at (loc, "identifier expected");
-  else
-    {
-      /* Get the variable name.  */
-      tree var_name = token;
-      tree decl = lookup_name (var_name);
-
-      if (decl == NULL_TREE || !DECL_P (decl))
-	error_at (loc, "unbound variable %qE", var_name);
-      else if (!POINTER_TYPE_P (TREE_TYPE (decl))
-	       && TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
-	error_at (loc, "%qE is neither a pointer nor an array", var_name);
-      else
-	{
-	  var = decl;
-	  TREE_USED (var) = true;
-	  DECL_READ_P (var) = true;
-	}
-    }
+  tree expr = NULL_TREE;
+
+  if (yyparse (loc, pragma, &expr))
+    /* Parse error or memory exhaustion.  */
+    expr = NULL_TREE;
 
-  return var;
+  return expr;
 }
 
 /* Process `#pragma starpu register VAR [COUNT]' and emit the corresponding
@@ -397,19 +410,37 @@ read_pragma_pointer_variable (const char *pragma, location_t loc)
 static void
 handle_pragma_register (struct cpp_reader *reader)
 {
-  tree token, var;
+  tree args, ptr, count_arg;
   location_t loc;
-  enum cpp_ttype type;
 
   loc = cpp_peek_token (reader, 0)->src_loc;
 
-  var = read_pragma_pointer_variable ("register", loc);
-  if (var == NULL_TREE)
+  args = read_pragma_expressions ("register", loc);
+  if (args == NULL_TREE)
+    /* Parse error, presumably already handled by the parser.  */
     return;
 
-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE
-      && !DECL_EXTERNAL (var)
-      && !TREE_STATIC (var)
+  /* First argument should be a pointer expression.  */
+  ptr = TREE_VALUE (args);
+  args = TREE_CHAIN (args);
+
+  if (ptr == error_mark_node)
+    return;
+
+  if (!POINTER_TYPE_P (TREE_TYPE (ptr))
+      && TREE_CODE (TREE_TYPE (ptr)) != ARRAY_TYPE)
+    {
+      error_at (loc, "%qE is neither a pointer nor an array", ptr);
+      return;
+    }
+
+  TREE_USED (ptr) = true;
+  if (DECL_P (ptr))
+    DECL_READ_P (ptr) = true;
+
+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE
+      && !DECL_EXTERNAL (ptr)
+      && !TREE_STATIC (ptr)
       && !MAIN_NAME_P (DECL_NAME (current_function_decl)))
     warning_at (loc, 0, "using an on-stack array as a task input "
 		"considered unsafe");
@@ -417,9 +448,9 @@ handle_pragma_register (struct cpp_reader *reader)
   /* Determine the number of elements in the vector.  */
   tree count = NULL_TREE;
 
-  if (TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE)
+  if (TREE_CODE (TREE_TYPE (ptr)) == ARRAY_TYPE)
     {
-      tree domain = TYPE_DOMAIN (TREE_TYPE (var));
+      tree domain = TYPE_DOMAIN (TREE_TYPE (ptr));
 
       if (domain != NULL_TREE)
 	{
@@ -435,52 +466,44 @@ handle_pragma_register (struct cpp_reader *reader)
 	}
     }
 
+  /* Second argument is optional but should be an integer.  */
+  count_arg = (args == NULL_TREE) ? NULL_TREE : TREE_VALUE (args);
+  if (args != NULL_TREE)
+    {
+      args = TREE_CHAIN (args);
+      TREE_CHAIN (count_arg) = NULL_TREE;
+    }
 
-  type = pragma_lex (&token);
-  if (type == CPP_EOF)
+  if (count_arg == NULL_TREE)
     {
-      /* End of line reached: don't consume TOKEN and check whether the array
-	 size was determined.  */
+      /* End of line reached: check whether the array size was
+	 determined.  */
       if (count == NULL_TREE)
 	{
-	  error_at (loc, "cannot determine size of array %qE", DECL_NAME (var));
+	  error_at (loc, "cannot determine size of array %qE", ptr);
 	  return;
 	}
     }
+  else if (count_arg == error_mark_node)
+    /* COUNT_ARG could not be parsed and an error was already reported.  */
+    return;
+  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
+    {
+      error_at (loc, "%qE is not an integer", count_arg);
+      return;
+    }
   else
     {
-      /* TOKEN may be a number or a integer variable.  */
-
-      tree count_arg;
-
-      if (TREE_CODE (token) == IDENTIFIER_NODE)
-	{
-	  count_arg = lookup_name (token);
-	  if (count_arg == NULL_TREE)
-	    {
-	      error_at (loc, "unbound variable %qE", token);
-	      return;
-	    }
-	  else if (!INTEGRAL_TYPE_P (TREE_TYPE (count_arg)))
-	    {
-	      error_at (loc, "integer expected");
-	      return;
-	    }
-
-	  TREE_USED (count_arg) = true;
-	  DECL_READ_P (count_arg) = true;
-	}
-      else if (TREE_CODE (token) != INTEGER_CST)
-	error_at (loc, "integer expected");
-      else
-	count_arg = token;
+      TREE_USED (count_arg) = true;
+      if (DECL_P (count_arg))
+	DECL_READ_P (count_arg) = true;
 
       if (count != NULL_TREE)
 	{
 	  /* The number of elements of this array was already determined.  */
 	  inform (loc,
 		  "element count can be omitted for bounded array %qE",
-		  DECL_NAME (var));
+		  ptr);
 
 	  if (count_arg != NULL_TREE)
 	    {
@@ -489,7 +512,7 @@ handle_pragma_register (struct cpp_reader *reader)
 		  if (!tree_int_cst_equal (count, count_arg))
 		    error_at (loc, "specified element count differs "
 			      "from actual size of array %qE",
-			      DECL_NAME (var));
+			      ptr);
 		}
 	      else
 		/* Using a variable to determine the array size whereas the
@@ -501,19 +524,24 @@ handle_pragma_register (struct cpp_reader *reader)
 	}
       else
 	count = count_arg;
-
-      if (pragma_lex (&token) != CPP_EOF)
-	error_at (loc, "junk after %<starpu register%> pragma");
     }
 
-  /* If VAR is an array, take its address.  */
+  /* Any remaining args?  */
+  if (args != NULL_TREE)
+    error_at (loc, "junk after %<starpu register%> pragma");
+
+  /* If PTR is an array, take its address.  */
   tree pointer =
-    POINTER_TYPE_P (TREE_TYPE (var))
-    ? var
-    : build_addr (var, current_function_decl);
+    POINTER_TYPE_P (TREE_TYPE (ptr))
+    ? ptr
+    : build_addr (ptr, current_function_decl);
 
   /* Introduce a local variable to hold the handle.  */
-  tree handle_var = create_tmp_var (ptr_type_node, ".handle");
+  tree handle_var = build_decl (loc, VAR_DECL, create_tmp_var_name (".handle"),
+				ptr_type_node);
+  DECL_CONTEXT (handle_var) = current_function_decl;
+  DECL_ARTIFICIAL (handle_var) = true;
+  DECL_INITIAL (handle_var) = NULL_TREE;
 
   tree register_fn =
     lookup_name (get_identifier ("starpu_vector_data_register"));
@@ -525,9 +553,13 @@ handle_pragma_register (struct cpp_reader *reader)
 		     build_addr (handle_var, current_function_decl),
 		     build_zero_cst (uintptr_type_node), /* home node */
 		     pointer, count,
-		     size_in_bytes (TREE_TYPE (TREE_TYPE (var))));
+		     size_in_bytes (TREE_TYPE (TREE_TYPE (ptr))));
 
-  add_stmt (call);
+  tree bind;
+  bind = build3 (BIND_EXPR, void_type_node, handle_var, call,
+		 NULL_TREE);
+
+  add_stmt (bind);
 }
 
 /* Process `#pragma starpu acquire VAR' and emit the corresponding
@@ -539,16 +571,26 @@ handle_pragma_acquire (struct cpp_reader *reader)
   static tree acquire_fn;
   LOOKUP_STARPU_FUNCTION (acquire_fn, "starpu_data_acquire");
 
-  tree token, var;
+  tree args, var;
   location_t loc;
 
   loc = cpp_peek_token (reader, 0)->src_loc;
 
-  var = read_pragma_pointer_variable ("acquire", loc);
-  if (var == NULL_TREE)
+  args = read_pragma_expressions ("acquire", loc);
+  if (args == NULL_TREE)
     return;
 
-  if (pragma_lex (&token) != CPP_EOF)
+  var = TREE_VALUE (args);
+
+  if (var == error_mark_node)
+    return;
+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
+    {
+      error_at (loc, "%qE is neither a pointer nor an array", var);
+      return;
+    }
+  else if (TREE_CHAIN (var) != NULL_TREE)
     error_at (loc, "junk after %<starpu acquire%> pragma");
 
   /* If VAR is an array, take its address.  */
@@ -573,16 +615,26 @@ handle_pragma_unregister (struct cpp_reader *reader)
   static tree unregister_fn;
   LOOKUP_STARPU_FUNCTION (unregister_fn, "starpu_data_unregister");
 
-  tree token, var;
+  tree args, var;
   location_t loc;
 
   loc = cpp_peek_token (reader, 0)->src_loc;
 
-  var = read_pragma_pointer_variable ("unregister", loc);
-  if (var == NULL_TREE)
+  args = read_pragma_expressions ("unregister", loc);
+  if (args == NULL_TREE)
     return;
 
-  if (pragma_lex (&token) != CPP_EOF)
+  var = TREE_VALUE (args);
+
+  if (var == error_mark_node)
+    return;
+  else if (TREE_CODE (TREE_TYPE (var)) != POINTER_TYPE
+	   && TREE_CODE (TREE_TYPE (var)) != ARRAY_TYPE)
+    {
+      error_at (loc, "%qE is neither a pointer nor an array", var);
+      return;
+    }
+  else if (TREE_CHAIN (args) != NULL_TREE)
     error_at (loc, "junk after %<starpu unregister%> pragma");
 
   /* If VAR is an array, take its address.  */

+ 1 - 0
gcc-plugin/tests/Makefile.am

@@ -28,6 +28,7 @@ gcc_tests =					\
   pointer-tasks.c				\
   no-initialize.c				\
   lib-user.c					\
+  wait-errors.c					\
   shutdown-errors.c
 
 dist_noinst_HEADERS = mocks.h

+ 2 - 2
gcc-plugin/tests/acquire-errors.c

@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
 #pragma starpu register x
 
-#pragma starpu acquire /* (error "unterminated") */
-#pragma starpu acquire 123 /* (error "identifier expected") */
+#pragma starpu acquire /* (error "parse error") */
+#pragma starpu acquire 123 /* (error "neither a pointer nor an array") */
 #pragma starpu acquire does_not_exit /* (error "unbound variable") */
 
 #pragma starpu acquire argc /* (error "neither a pointer nor an array") */

+ 4 - 1
gcc-plugin/tests/lib-user.c

@@ -28,7 +28,10 @@ main (int argc, char *argv[])
 {
 #pragma starpu initialize
 
-  static const char x[] = { 0, 1, 2, 3, 4, 5 };
+  /* Align X so that the assumptions behind `dummy_pointer_to_handle'
+     hold.  */
+  static const char x[] __attribute__ ((aligned (8))) = { 0, 1, 2, 3, 4, 5 };
+
   float y[sizeof x];
 
   static const char forty_two = 42;

+ 8 - 2
gcc-plugin/tests/register-errors.c

@@ -23,9 +23,9 @@ main (int argc, char *argv[])
 {
 #pragma starpu initialize
 
-#pragma starpu register /* (error "unterminated") */
+#pragma starpu register /* (error "parse error") */
 
-#pragma starpu register argv 234 junk right here /* (error "junk after") */
+#pragma starpu register argv 234 junk here /* (error "junk after") *//* (error "unbound") *//* (error "unbound") */
 
   static int x[123] __attribute__ ((unused));
 #pragma starpu register x 234 /* (note "can be omitted") *//* (error "differs from actual size") */
@@ -37,9 +37,15 @@ main (int argc, char *argv[])
 #pragma starpu register argv does_not_exit /* (error "unbound variable") */
 
 #pragma starpu register argv /* (error "cannot determine size") */
+#pragma starpu register &argv[2] /* (error "cannot determine size") */
+#pragma starpu register &x[2] /* (error "cannot determine size") */
 
 #pragma starpu register argc /* (error "neither a pointer nor an array") */
 
+#pragma starpu register argv[2][3] 3 /* (error "neither a pointer nor an array") */
+
+#pragma starpu register argv[does_not_exist] 3 /* (error "unbound variable") */
+
   char **p = argv;
   size_t ps = argc;
 #pragma starpu register p ps  /* No unused variable warning, please! */

+ 39 - 1
gcc-plugin/tests/register.c

@@ -31,6 +31,15 @@ foo (void)
 #pragma starpu register x /* (warning "considered unsafe") */
 }
 
+static void
+bar (float *p, int s)
+{
+  expected_register_arguments.pointer = p;
+  expected_register_arguments.elements = s;
+  expected_register_arguments.element_size = sizeof *p;
+#pragma starpu register p s
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -39,6 +48,7 @@ main (int argc, char *argv[])
   int x[123];
   double *y;
   static char z[345];
+  static float m[7][42];
   short w[] = { 1, 2, 3 };
   size_t y_size = 234;
 
@@ -84,8 +94,36 @@ main (int argc, char *argv[])
 #undef N
 
   foo ();
+  bar ((float *) argv, argc);
+
+  expected_register_arguments.pointer = argv;
+  expected_register_arguments.elements = argc;
+  expected_register_arguments.element_size = sizeof argv[0];
+
+  int chbouib = argc;
+#pragma starpu register argv chbouib
+
+  expected_register_arguments.pointer = &argv[2];
+  expected_register_arguments.elements = 3;
+  expected_register_arguments.element_size = sizeof argv[0];
+#pragma starpu register &argv[2] 3
+
+  expected_register_arguments.pointer = &argv[argc + 3 / 2];
+  expected_register_arguments.elements = argc * 4;
+  expected_register_arguments.element_size = sizeof argv[0];
+#pragma starpu register &argv[argc + 3 / 2] (argc * 4)
+
+  expected_register_arguments.pointer = &y[y_size / 2];
+  expected_register_arguments.elements = (y_size / 2 - 7);
+  expected_register_arguments.element_size = sizeof y[0];
+#pragma starpu register &y[y_size / 2] (y_size / 2 - 7)
+
+  expected_register_arguments.pointer = m[6];
+  expected_register_arguments.elements = 42;
+  expected_register_arguments.element_size = sizeof m[0][0];
+#pragma starpu register m[6]
 
-  assert (data_register_calls == 8);
+  assert (data_register_calls == 14);
 
   free (y);
 

+ 11 - 5
gcc-plugin/tests/run-test.in

@@ -85,11 +85,11 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
     ;; the real file name.
     ,(string-append "-fplugin=" %builddir "/../src/.libs/starpu.so")
 
-    "-g" "-O2"
+    "-g"
     "-fdump-tree-gimple" "-Wall"))
 
 (define %default-ldflags
-  `(,(string-append "-L" %srcdir "/../../src")))
+  `(,(string-append "-L" %builddir "/../../src")))
 
 (define %libtool
   (string-append %builddir "/../../libtool"))
@@ -105,7 +105,7 @@ compiler status and the list of lines printed on stdout/stderr."
          (mode     (if compile?
                        "compile"
                        "link"))
-         (command  (format #f "LANG=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
+         (command  (format #f "LC_ALL=C ~a --mode=~a ~a ~{~a ~} \"~a\" ~{~a ~} 2>&1"
                            %libtool mode cc cflags file ldflags))
          (pipe     (begin
                      (log "running `~a'" command)
@@ -311,7 +311,7 @@ otherwise."
                 file (length dependencies) dependencies))
 
     (and (every (cut compile/match <> cc cflags ldflags)
-                dependencies)
+                (map (cut string-append %srcdir "/" <>) dependencies))
          (let*-values (((goal)
                         (if error-expected?
                             'compile
@@ -421,6 +421,12 @@ otherwise."
 ;;;
 
 (define (build/run . file)
-  (exit (every (cut compile/match <> %gcc %default-cflags %default-ldflags) file)))
+  (exit (every (lambda (file)
+                 ;; For each file, check that everything works both with and
+                 ;; without optimizations.
+                 (every (cut compile/match file %gcc <> %default-ldflags)
+                        `((,"-O0" ,@%default-cflags)
+                          (,"-O2" ,@%default-cflags))))
+               file)))
 
 ;;; run-test.in ends here

+ 2 - 2
gcc-plugin/tests/unregister-errors.c

@@ -28,8 +28,8 @@ main (int argc, char *argv[])
 
 #pragma starpu register x
 
-#pragma starpu unregister /* (error "unterminated") */
-#pragma starpu unregister 123 /* (error "identifier expected") */
+#pragma starpu unregister /* (error "parse error") */
+#pragma starpu unregister 123 /* (error "neither a pointer nor an array") */
 #pragma starpu unregister does_not_exit /* (error "unbound variable") */
 
 #pragma starpu unregister argc /* (error "neither a pointer nor an array") */

+ 27 - 0
gcc-plugin/tests/wait-errors.c

@@ -0,0 +1,27 @@
+/* GCC-StarPU
+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+
+   GCC-StarPU is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   GCC-StarPU is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* The task under test.  */
+
+void task (int x, char y, int z) __attribute__ ((task));
+static void task_cpu (int x, char y, int z)
+  __attribute__ ((task_implementation ("cpu", task)));
+
+static void
+task_cpu (int x, char y, int z)
+{
+#pragma starpu wait /* (error "not allowed") */
+}

+ 1 - 0
include/starpu_config.h.in

@@ -46,6 +46,7 @@
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_NMAXWORKERS
+#undef STARPU_MAXIMPLEMENTATIONS
 
 #undef STARPU_HAVE_LIBNUMA
 

+ 2 - 0
include/starpu_data_filters.h

@@ -19,6 +19,8 @@
 #ifndef __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 
+#include <stdarg.h>
+
 #include <starpu.h>
 #include <starpu_config.h>
 

+ 5 - 4
include/starpu_perfmodel.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -104,7 +105,7 @@ struct starpu_perfmodel_t {
 	double (*cost_model)(struct starpu_buffer_descr_t *);
 
 	/* per-architecture model */
-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS];
+	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
 
 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
 	const char *symbol;
@@ -126,12 +127,12 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
  * performance model files */
 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
-		enum starpu_perf_archtype arch, char *path, size_t maxlen);
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,
-		char *archname, size_t maxlen);
+		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
 int starpu_list_models(void);
 
 void starpu_force_bus_sampling(void);
+void starpu_print_bus_bandwidth(FILE *f);
 
 #ifdef __cplusplus
 }

+ 6 - 4
include/starpu_scheduler.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -127,7 +128,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 #endif
 
 /* Check if the worker specified by workerid can execute the codelet. */
-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
+int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 /* The scheduling policy may put tasks directly into a worker's local queue so
  * that it is not always necessary to create its own queue when the local queue
@@ -147,6 +148,7 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back);
 /* By convention, the default priority level should be 0 so that we can
  * statically allocate tasks with a default priority. */
 #define STARPU_DEFAULT_PRIO	0
+
 int starpu_sched_get_min_priority(void);
 int starpu_sched_get_max_priority(void);
 
@@ -164,7 +166,7 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 /* Get the description of a combined worker */
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
 /* Variant of starpu_worker_may_execute_task compatible with combined workers */
-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task);
+int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 /*
  *	Data prefetching
@@ -182,7 +184,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
 /* Return the current date */
 double starpu_timing_now(void);
 /* Returns expected task duration in µs */
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch);
+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
 /* Returns expected data transfer time in µs */
@@ -190,7 +192,7 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 /* Predict the transfer time (in µs) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
 /* Returns expected power consumption in J */
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch);
+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 
 /* Waits until all the tasks of a worker, already submitted, have been executed */
 int starpu_wait_for_all_tasks_of_worker(int workerid);

+ 19 - 1
include/starpu_task.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +24,7 @@
 #include <starpu_config.h>
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
-#include <cuda.h>
+# include <cuda.h>
 #endif
 
 #include <starpu_data.h>
@@ -56,6 +57,18 @@ extern "C" {
 
 typedef uint64_t starpu_tag_t;
 
+
+typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
+typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
+typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
+typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
+
+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    (starpu_cpu_func_t) -1
+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   (starpu_cuda_func_t) -1
+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS (starpu_opencl_func_t) -1
+#define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
+
+
 /*
  * A codelet describes the various function 
  * that may be called from a worker
@@ -72,6 +85,11 @@ typedef struct starpu_codelet_t {
 	void (*opencl_func)(void **, void *);
 	uint8_t gordon_func;
 
+	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_gordon_func_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS];
+
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 

+ 3 - 2
include/starpu_task_bundle.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -71,10 +72,10 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
 
 /* Return the expected duration of the entire task bundle in µs. */
-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch);
+double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Return the time (in µs) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch);
+double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
 
 #endif // __STARPU_TASK_BUNDLE_H__

+ 2 - 0
m4/.gitignore

@@ -0,0 +1,2 @@
+/libtool.m4
+/lt*.m4

+ 9 - 1
m4/gcc.m4

@@ -51,8 +51,16 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
     dnl Reason:
     dnl   build_call_expr_loc_array -- not in GCC 4.5.x; appears in 4.6
     dnl   build_call_expr_loc_vec   -- likewise
+    dnl   build_array_ref           -- present but undeclared in 4.6.1
     _STARPU_WITH_GCC_PLUGIN_API([
-      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec],
+      AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
+                      build_array_ref],
+        [], [], [#include <gcc-plugin.h>
+	         #include <tree.h>])
+
+      dnl Work around header naming issues introduced upstream and in Debian
+      dnl (see <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=631082>).
+      AC_CHECK_HEADERS([c-common.h c-pragma.h c-family/c-common.h c-family/c-pragma.h],
         [], [], [#include <gcc-plugin.h>
 	         #include <tree.h>])
     ])

+ 1 - 0
mpi/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 1 - 0
mpi/tests/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 8 - 6
socl/src/Makefile.am

@@ -24,16 +24,18 @@ SUBDIRS =
 lib_LTLIBRARIES = libsocl.la
 
 libsocl_la_SOURCES = 						\
-  socl.c \
-  gc.c \
-  graph.c \
+  command.c \
+  command_list.c \
+  command_queue.c \
+  debug.c \
+  devices.c \
   event.c \
+  gc.c \
   init.c \
-  task.c \
-  command_queue.c \
   mem_objects.c \
+  socl.c \
+  task.c \
   util.c \
-  devices.c \
   cl_getplatformids.c \
   cl_getplatforminfo.c \
   cl_getdeviceids.c \

+ 3 - 4
socl/src/cl_createcommandqueue.c

@@ -30,8 +30,7 @@ static void release_callback_command_queue(void * e) {
   gc_entity_unstore(&cq->context);
 
   /* Destruct object */
-  pthread_spin_destroy(&cq->spin);
-  free(cq->events);
+  pthread_mutex_destroy(&cq->mutex);
 }
 
 
@@ -68,9 +67,9 @@ soclCreateCommandQueue(cl_context                   context,
       profiling_queue_count += 1;
    }
 
-   cq->events = NULL;
+   cq->commands = NULL;
    cq->barrier = NULL;
-   pthread_spin_init(&cq->spin, 0);
+   pthread_mutex_init(&cq->mutex, NULL);
 
    if (errcode_ret != NULL)
       *errcode_ret = CL_SUCCESS;

+ 11 - 11
socl/src/cl_createkernel.c

@@ -32,23 +32,23 @@ static void soclCreateKernel_task(void *data) {
    }
 
    /* One worker creates argument structures */
-   if (__sync_bool_compare_and_swap(&k->arg_count, 0, 666)) {
+   if (__sync_bool_compare_and_swap(&k->num_args, 0, 666)) {
       unsigned int i;
-      cl_uint arg_count;
+      cl_uint num_args;
 
-      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(arg_count), &arg_count, NULL);
+      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
       if (err != CL_SUCCESS) {
          DEBUG_CL("clGetKernelInfo", err);
          ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
       }
-      k->arg_count = arg_count;
-      DEBUG_MSG("Kernel has %d arguments\n", arg_count);
+      k->num_args = num_args;
+      DEBUG_MSG("Kernel has %d arguments\n", num_args);
 
-      k->arg_size = (size_t*)malloc(sizeof(size_t) * arg_count);
-      k->arg_value = (void**)malloc(sizeof(void*) * arg_count);
-      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * arg_count);
+      k->arg_size = (size_t*)malloc(sizeof(size_t) * num_args);
+      k->arg_value = (void**)malloc(sizeof(void*) * num_args);
+      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * num_args);
       /* Settings default type to NULL */
-      for (i=0; i<arg_count; i++) {
+      for (i=0; i<num_args; i++) {
          k->arg_value[i] = NULL;
          k->arg_type[i] = Null;
       }
@@ -70,7 +70,7 @@ static void release_callback_kernel(void * e) {
 
   //Free args
   unsigned int j;
-  for (j=0; j<kernel->arg_count; j++) {
+  for (j=0; j<kernel->num_args; j++) {
     switch (kernel->arg_type[j]) {
       case Null:
         break;
@@ -125,7 +125,7 @@ soclCreateKernel(cl_program    program,
    
    gc_entity_store(&k->program, program);
    k->kernel_name = strdup(kernel_name);
-   k->arg_count = 0;
+   k->num_args = 0;
    k->arg_value = NULL;
    k->arg_size = NULL;
 

+ 2 - 2
socl/src/cl_createprogramwithsource.c

@@ -40,11 +40,11 @@ static void soclCreateProgramWithSource_task(void *data) {
 }
 
 static void rp_task(void *data) {
-   struct _cl_program *d = (struct _cl_program*)data;
+   cl_program program = (cl_program)data;
 
    int range = starpu_worker_get_range();
 
-   cl_int err = clReleaseProgram(d->cl_programs[range]);
+   cl_int err = clReleaseProgram(program->cl_programs[range]);
    if (err != CL_SUCCESS)
       DEBUG_CL("clReleaseProgram", err);
 }

+ 4 - 3
socl/src/cl_enqueuebarrier.c

@@ -19,8 +19,9 @@
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueBarrier(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
 {
-   cl_event ev = enqueueBarrier(cq);   
-   gc_entity_release(ev);
+	command_marker cmd = command_barrier_create();
 
-   return CL_SUCCESS;
+	command_queue_enqueue(cq, cmd, 0, NULL);
+
+	return CL_SUCCESS;
 }

+ 40 - 28
socl/src/cl_enqueuecopybuffer.c

@@ -66,6 +66,42 @@ static starpu_codelet codelet_copybuffer = {
    .nbuffers = 2
 };
 
+cl_int command_copy_buffer_submit(command_copy_buffer cmd) {
+	/* Aliases */
+	cl_mem src_buffer = cmd->src_buffer;
+	cl_mem dst_buffer = cmd->dst_buffer;
+	size_t src_offset = cmd->src_offset;
+	size_t dst_offset = cmd->dst_offset;
+	size_t cb = cmd->cb;
+
+	struct starpu_task *task;
+	struct arg_copybuffer *arg;
+
+	task = task_create(CL_COMMAND_COPY_BUFFER);
+
+	task->buffers[0].handle = src_buffer->handle;
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = dst_buffer->handle;
+	task->buffers[1].mode = STARPU_RW;
+	task->cl = &codelet_copybuffer;
+
+	arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
+	arg->src_offset = src_offset;
+	arg->dst_offset = dst_offset;
+	arg->cb = cb;
+	gc_entity_store(&arg->src_buffer, src_buffer);
+	gc_entity_store(&arg->dst_buffer, dst_buffer);
+	task->cl_arg = arg;
+	task->cl_arg_size = sizeof(struct arg_copybuffer);
+
+	dst_buffer->scratch = 0;
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
+
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueCopyBuffer(cl_command_queue  cq, 
                     cl_mem              src_buffer,
@@ -77,35 +113,11 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
                     const cl_event *    events,
                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 {
-   struct starpu_task *task;
-   struct arg_copybuffer *arg;
-   cl_event ev;
-
-   task = task_create(CL_COMMAND_COPY_BUFFER);
-   ev = task_event(task);
-
-   task->buffers[0].handle = src_buffer->handle;
-   task->buffers[0].mode = STARPU_R;
-   task->buffers[1].handle = dst_buffer->handle;
-   task->buffers[1].mode = STARPU_RW;
-   task->cl = &codelet_copybuffer;
-
-   arg = (struct arg_copybuffer*)malloc(sizeof(struct arg_copybuffer));
-   arg->src_offset = src_offset;
-   arg->dst_offset = dst_offset;
-   arg->cb = cb;
-   gc_entity_store(&arg->src_buffer, src_buffer);
-   gc_entity_store(&arg->dst_buffer, dst_buffer);
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct arg_copybuffer);
-
-   dst_buffer->scratch = 0;
-
-   DEBUG_MSG("Submitting CopyBuffer task (event %d)\n", ev->id);
+	command_copy_buffer cmd = command_copy_buffer_create(src_buffer, dst_buffer, src_offset, dst_offset, cb);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-   RETURN_EVENT(ev, event);
+	RETURN_EVENT(cmd, event);
 
-   return ret;
+	return CL_SUCCESS;
 }

+ 25 - 48
socl/src/cl_enqueuemapbuffer.c

@@ -16,76 +16,53 @@
 
 #include "socl.h"
 
-struct mb_data {
-  cl_event ev;
-  cl_mem buffer;
-  cl_map_flags map_flags;
-};
-
 static void mapbuffer_callback(void *args) {
-  struct mb_data *arg = (struct mb_data*)args;
-
-  starpu_tag_notify_from_apps(arg->ev->id);
-  arg->ev->status = CL_COMPLETE;
+	command_map_buffer cmd = (command_map_buffer)args;
 
-  gc_entity_unstore(&arg->ev);
-  gc_entity_unstore(&arg->buffer);
-  free(args);
+	starpu_tag_notify_from_apps(cmd->event->id);
+	cmd->event->status = CL_COMPLETE;
 }
 
 static void mapbuffer_task(void *args) {
-  struct mb_data *arg = (struct mb_data*)args;
+	command_map_buffer cmd = (command_map_buffer)args;
+
+	starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
+
+	starpu_data_acquire_cb(cmd->buffer->handle, mode, mapbuffer_callback, cmd);
+}
 
-  starpu_access_mode mode = (arg->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
+cl_int command_map_buffer_submit(command_map_buffer cmd) {
+	starpu_task task = task_create_cpu(mapbuffer_task, cmd, 0);
 
-  starpu_data_acquire_cb(arg->buffer->handle, mode, mapbuffer_callback, arg);
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
 }
 
 CL_API_ENTRY void * CL_API_CALL
 soclEnqueueMapBuffer(cl_command_queue cq,
                    cl_mem           buffer,
-                   cl_bool          blocking_map, 
+                   cl_bool          blocking, 
                    cl_map_flags     map_flags,
                    size_t           offset, 
-                   size_t           UNUSED(cb),
+                   size_t           cb,
                    cl_uint          num_events,
                    const cl_event * events,
                    cl_event *       event,
                    cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0
 {
-   struct starpu_task *task;
-   struct mb_data *arg;
-   cl_event ev;
-   cl_int err;
-
-   /* Create custom event that will be triggered when map is complete */
-   ev = event_create();
-
-   /* Store arguments */
-   arg = (struct mb_data*)malloc(sizeof(struct mb_data));
-   arg->map_flags = map_flags;
-   gc_entity_store(&arg->ev, ev);
-   gc_entity_store(&arg->buffer, buffer);
-
-   /* Create StarPU task */
-   task = task_create_cpu(CL_COMMAND_MAP_BUFFER, mapbuffer_task, arg, 0);
-   cl_event map_event = task_event(task);
-
-   /* Enqueue task */
-   DEBUG_MSG("Submitting MapBuffer task (event %d)\n", ev->id);
-   err = command_queue_enqueue_fakeevent(cq, task, 0, num_events, events, ev);
-   gc_entity_release(map_event);
+	cl_event ev = event_create();
 
-   if (errcode_ret != NULL)
-      *errcode_ret = err;
+	command_map_buffer cmd = command_map_buffer_create(buffer, map_flags, offset, cb, ev);
 
-   if (err != CL_SUCCESS)
-      return NULL;
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-   if (blocking_map == CL_TRUE)
-      soclWaitForEvents(1, &ev);
+	if (errcode_ret != NULL)
+		*errcode_ret = CL_SUCCESS;
 
-   RETURN_EVENT(ev, event);
+	RETURN_CUSTOM_EVENT(ev,event);
 
-   return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
+	MAY_BLOCK_CUSTOM(blocking,ev);
+	
+	return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
 }

+ 15 - 5
socl/src/cl_enqueuemarker.c

@@ -20,11 +20,21 @@ CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueMarker(cl_command_queue  cq,
                 cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 {
-   if (event == NULL)
-      return CL_INVALID_VALUE;
+	if (event == NULL)
+		return CL_INVALID_VALUE;
+	
+	command_marker cmd = command_marker_create();
 
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
-   *event = task_event(task);
+	command_queue_enqueue(cq, cmd, 0, NULL);
 
-   return command_queue_enqueue(cq, task, 0, 0, NULL);
+	RETURN_EVENT(cmd, event);
+
+	return CL_SUCCESS;
+}
+
+cl_int command_marker_submit(command_marker cmd) {
+	struct starpu_task *task;
+	task = task_create(CL_COMMAND_MARKER);
+
+	task_submit(task, cmd);
 }

+ 124 - 250
socl/src/cl_enqueuendrangekernel.c

@@ -16,81 +16,64 @@
 
 #include "socl.h"
 
-typedef struct running_kernel * running_kernel;
-
-struct running_kernel {
-  cl_kernel kernel;
-  cl_mem *buffers;
-  unsigned int buffer_count;
-  starpu_codelet *codelet;
-  cl_uint work_dim;
-  size_t * global_work_offset;
-  size_t * global_work_size;
-  size_t * local_work_size;
-  /* Arguments */
-  unsigned int arg_count;
-  size_t *arg_size;
-  enum kernel_arg_type  *arg_type;
-  void  **arg_value;
-};
 
-static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
-   running_kernel d;
+void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
+
    cl_command_queue cq;
    int wid;
    cl_int err;
 
-   d = (running_kernel)args;
    wid = starpu_worker_get_id();
    starpu_opencl_get_queue(wid, &cq);
 
-   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, d->kernel->id);
+   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);
 
    int range = starpu_worker_get_range();
 
    /* Set arguments */
    {
-      unsigned int i;
-      int buf = 0;
-      for (i=0; i<d->arg_count; i++) {
-         switch (d->arg_type[i]) {
-            case Null:
-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], NULL);
-               break;
-            case Buffer: {
-                  cl_mem mem;  
-                  mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
-                  err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], &mem);
-                  buf++;
-               }
-               break;
-            case Immediate:
-               err = clSetKernelArg(d->kernel->cl_kernels[range], i, d->arg_size[i], d->arg_value[i]);
-               break;
-         }
-         if (err != CL_SUCCESS) {
-            DEBUG_CL("clSetKernelArg", err);
-            DEBUG_ERROR("Aborting\n");
-         }
-      }
+	   unsigned int i;
+	   int buf = 0;
+	   for (i=0; i<cmd->num_args; i++) {
+		   switch (cmd->arg_types[i]) {
+			   case Null:
+				   err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
+				   break;
+			   case Buffer: {
+						cl_mem mem;  
+						mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
+						err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
+						buf++;
+					}
+					break;
+			   case Immediate:
+					err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
+					break;
+		   }
+		   if (err != CL_SUCCESS) {
+			   DEBUG_CL("clSetKernelArg", err);
+			   DEBUG_ERROR("Aborting\n");
+		   }
+	   }
    }
 
    /* Calling Kernel */
    cl_event event;
-   err = clEnqueueNDRangeKernel(cq, d->kernel->cl_kernels[range], d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size, 0, NULL, &event);
+   err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);
 
    if (err != CL_SUCCESS) {
-      ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
-      DEBUG_CL("clEnqueueNDRangeKernel", err);
-      DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
-                d->work_dim, d->global_work_offset, d->global_work_size, d->local_work_size);
-      DEBUG_MSG("Global work size: %ld %ld %ld\n", d->global_work_size[0],
-            (d->work_dim > 1 ? d->global_work_size[1] : 1), (d->work_dim > 2 ? d->global_work_size[2] : 1)); 
-      if (d->local_work_size != NULL)
-         DEBUG_MSG("Local work size: %ld %ld %ld\n", d->local_work_size[0],
-               (d->work_dim > 1 ? d->local_work_size[1] : 1), (d->work_dim > 2 ? d->local_work_size[2] : 1)); 
-      ERROR_MSG("Aborting.\n");
-      exit(1);
+	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
+	   DEBUG_CL("clEnqueueNDRangeKernel", err);
+	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
+			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
+			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
+	   if (cmd->local_work_size != NULL)
+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
+				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
+	   ERROR_MSG("Aborting.\n");
+	   exit(1);
    }
 
    /* Waiting for kernel to terminate */
@@ -99,219 +82,110 @@ static void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
 }
 
 static void cleaning_task_callback(void *args) {
-   running_kernel arg = (running_kernel)args;
+	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
 
-   free(arg->arg_size);
-   free(arg->arg_type);
-
-   unsigned int i;
-   for (i=0; i<arg->arg_count; i++) {
-      free(arg->arg_value[i]);
-   }
-   free(arg->arg_value);
+	free(cmd->arg_sizes);
+	free(cmd->arg_types);
 
-   for (i=0; i<arg->buffer_count; i++)
-      gc_entity_unstore(&arg->buffers[i]);
+	unsigned int i;
+	for (i=0; i<cmd->num_args; i++) {
+		free(cmd->args[i]);
+	}
+	free(cmd->args);
 
-   gc_entity_unstore(&arg->kernel);
+	for (i=0; i<cmd->num_buffers; i++)
+		gc_entity_unstore(&cmd->buffers[i]);
 
-   free(arg->buffers);
-   free(arg->global_work_offset);
-   free(arg->global_work_size);
-   free(arg->local_work_size);
-   void * co = arg->codelet;
-   arg->codelet = NULL;
-   free(co);
+	free(cmd->buffers);
+	void * co = cmd->codelet;
+	cmd->codelet = NULL;
+	free(co);
 }
 
 static struct starpu_perfmodel_t perf_model = {
-  .type = STARPU_HISTORY_BASED,
-  .symbol = "perf_model"
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "perf_model"
 };
 
 /**
  * Real kernel enqueuing command
  */
-cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
-
-   struct starpu_task *task;
-   running_kernel arg;
-   starpu_codelet *codelet;
-   cl_event ev;
-   
-   /* Alias struc fields */
-   cl_command_queue cq = n->cq;
-   cl_kernel        kernel = n->kernel;
-   cl_uint          work_dim = n->work_dim;
-   const size_t *   global_work_offset = n->global_work_offset;
-   const size_t *   global_work_size = n->global_work_size;
-   const size_t *   local_work_size = n->local_work_size;
-   cl_uint          num_events = n->num_events;
-   const cl_event * events = n->events;
-   cl_event *       event = n->event;
-   char 	    is_task = n->is_task;
-
-
-   /* Allocate structures */
-
-   /* Codelet */
-   codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
-   if (codelet == NULL)
-      return CL_OUT_OF_HOST_MEMORY;
-
-   /* Codelet arguments */
-   arg = (running_kernel)malloc(sizeof(struct running_kernel));
-   if (arg == NULL) {
-      free(codelet);
-      return CL_OUT_OF_HOST_MEMORY;
-   }
-
-   /* StarPU task */
-   task = task_create(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL);
-   ev = task_event(task);
-
-   /*******************
-    * Initializations *
-    *******************/
-
-   /* ------- *
-    * Codelet *
-    * ------- */
-   codelet->where = STARPU_OPENCL;
-   codelet->power_model = NULL;
-   codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
-   //codelet->model = NULL;
-   codelet->model = &perf_model;
-
-   /* ---------------- *
-    * Codelet argument *
-    * ---------------- */
-   gc_entity_store(&arg->kernel, kernel);
-   arg->work_dim = work_dim;
-   arg->codelet = codelet;
-
-   /* Global work offset */
-   if (global_work_offset != NULL) {
-      arg->global_work_offset = (size_t*)malloc(sizeof(size_t)*work_dim);
-      memcpy(arg->global_work_offset, global_work_offset, work_dim*sizeof(size_t));
-   }
-   else arg->global_work_offset = NULL;
-
-   /* Global work size */
-   arg->global_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
-   memcpy(arg->global_work_size, global_work_size, work_dim*sizeof(size_t));
-
-   /* Local work size */
-   if (local_work_size != NULL) {
-      arg->local_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
-      memcpy(arg->local_work_size, local_work_size, work_dim*sizeof(size_t));
-   }
-   else arg->local_work_size = NULL;
-
-   /* ----------- *
-    * StarPU task *
-    * ----------- */
-   task->cl = codelet;
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct running_kernel);
-
-   /* Convert OpenCL's memory objects to StarPU buffers */
-   codelet->nbuffers = 0;
-   {
-      arg->buffers = malloc(sizeof(cl_mem) * kernel->arg_count);
-      arg->buffer_count = 0;
-
-      unsigned int i;
-      for (i=0; i<kernel->arg_count; i++) {
-         if (kernel->arg_type[i] == Buffer) {
-
-            cl_mem buf = (cl_mem)kernel->arg_value[i];
-
-            /* We save cl_mem references in order to properly release them after kernel termination */
-            gc_entity_store(&arg->buffers[arg->buffer_count], buf);
-            arg->buffer_count += 1;
-
-            codelet->nbuffers++;
-            task->buffers[codelet->nbuffers-1].handle = buf->handle;
-
-            /* Determine best StarPU buffer access mode */
-            int mode;
-            if (buf->mode == CL_MEM_READ_ONLY)
-               mode = STARPU_R;
-            else if (buf->mode == CL_MEM_WRITE_ONLY) {
-               mode = STARPU_W;
-               buf->scratch = 0;
-            }
-            else if (buf->scratch) { //RW but never accessed in RW or W mode
-               mode = STARPU_W;
-               buf->scratch = 0;
-            }
-            else {
-               mode = STARPU_RW;
-               buf->scratch = 0;
-            }
-            task->buffers[codelet->nbuffers-1].mode = mode; 
-         }
-      }
-   }
-
-   /* Copy arguments as kernel args can be modified by the time we launch the kernel */
-   {
-      arg->arg_count = kernel->arg_count;
-      arg->arg_size = malloc(sizeof(size_t) * kernel->arg_count);
-      memcpy(arg->arg_size, kernel->arg_size, sizeof(size_t) * kernel->arg_count);
-      arg->arg_type = malloc(sizeof(enum kernel_arg_type) * kernel->arg_count);
-      memcpy(arg->arg_type, kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->arg_count);
-      arg->arg_value = malloc(sizeof(void*) * kernel->arg_count);
-      unsigned int i;
-      for (i=0; i<kernel->arg_count; i++) {
-         if (kernel->arg_value[i] != NULL) {
-           arg->arg_value[i] = malloc(arg->arg_size[i]);
-           memcpy(arg->arg_value[i], kernel->arg_value[i], arg->arg_size[i]);
-         }
-         else arg->arg_value[i] = NULL;
-      }
-   }
-
-   DEBUG_MSG("Submitting NDRange task (event %d)\n", ev->id);
-
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
-
-   /* Enqueue a cleaning task */
-   starpu_task * cleaning_task = task_create_cpu(0, cleaning_task_callback, arg,1);
-   cl_event cleaning_event = task_event(cleaning_task);
-   command_queue_enqueue(cq, cleaning_task, 0, 1, &ev);
+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
+
+	starpu_task task = task_create();
+	task->cl = cmd->codelet;
+	task->cl_arg = cmd;
+	task->cl_arg_size = sizeof(cmd);
+
+	starpu_codelet * codelet = cmd->codelet;
+
+	/* We need to detect which parameters are OpenCL's memory objects and
+	 * we retrieve their corresponding StarPU buffers */
+	cmd->num_buffers = 0;
+	cmd->buffers = malloc(sizeof(cl_mem) * cmd->num_args);
+
+	unsigned int i;
+	for (i=0; i<cmd->num_args; i++) {
+		if (cmd->arg_types[i] == Buffer) {
+
+			cl_mem buf = *(cl_mem*)cmd->args[i];
+
+			gc_entity_store(&cmd->buffers[cmd->num_buffers], buf);
+			task->buffers[cmd->num_buffers].handle = buf->handle;
+
+			/* Determine best StarPU buffer access mode */
+			int mode;
+			if (buf->mode == CL_MEM_READ_ONLY)
+				mode = STARPU_R;
+			else if (buf->mode == CL_MEM_WRITE_ONLY) {
+				mode = STARPU_W;
+				buf->scratch = 0;
+			}
+			else if (buf->scratch) { //RW but never accessed in RW or W mode
+				mode = STARPU_W;
+				buf->scratch = 0;
+			}
+			else {
+				mode = STARPU_RW;
+				buf->scratch = 0;
+			}
+			task->buffers[cmd->num_buffers].mode = mode; 
+
+			cmd->num_buffers += 1;
+		}
+	}
+	codelet->nbuffers = cmd->num_buffers;
+
+	task_submit(task, cmd);
+
+	/* Enqueue a cleaning task */
+	//FIXME: execute this in the callback?
+	starpu_task cleaning_task = task_create_cpu(cleaning_task_callback, cmd,1);
+	cl_event ev = command_event_get(cmd);
+	task_depends_on(cleaning_task, 1, &ev);
+	task_submit(cleaning_task, cmd);
 
-   gc_entity_release(cleaning_event);
-  
-   RETURN_EVENT(ev, event);
-
-   return ret;
+	return CL_SUCCESS;
 }
 
-/**
- * Virtual kernel enqueueing command
- */
+
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueNDRangeKernel(cl_command_queue cq,
-                       cl_kernel        kernel,
-                       cl_uint          work_dim,
-                       const size_t *   global_work_offset,
-                       const size_t *   global_work_size,
-                       const size_t *   local_work_size,
-                       cl_uint          num_events,
-                       const cl_event * events,
-                       cl_event *       event) CL_API_SUFFIX__VERSION_1_1
+		cl_kernel        kernel,
+		cl_uint          work_dim,
+		const size_t *   global_work_offset,
+		const size_t *   global_work_size,
+		const size_t *   local_work_size,
+		cl_uint          num_events,
+		const cl_event * events,
+		cl_event *       event) CL_API_SUFFIX__VERSION_1_1
 {
-	node_enqueue_kernel n;
+	command_ndrange_kernel cmd = command_ndrange_kernel_create(kernel, work_dim,
+			global_work_offset, global_work_size, local_work_size);
+
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-	n = graph_create_enqueue_kernel(0, cq, kernel, work_dim, global_work_offset, global_work_size,
-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
-		kernel->arg_type, kernel->arg_value);
-	
-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
-	node_play_enqueue_kernel(n);
+	RETURN_EVENT(cmd, event);
 
-	//graph_store(n);
 	return CL_SUCCESS;
 }

+ 36 - 24
socl/src/cl_enqueuereadbuffer.c

@@ -66,6 +66,37 @@ static starpu_codelet codelet_readbuffer = {
    .nbuffers = 1
 };
 
+cl_int command_read_buffer_submit(command_read_buffer cmd) {
+	/* Aliases */
+	cl_mem buffer = cmd->buffer;
+	size_t offset = cmd->offset;
+	size_t cb = cmd->cb;
+	void * ptr = cmd->ptr;
+
+	struct starpu_task *task;
+	struct arg_readbuffer *arg;
+
+	task = task_create(CL_COMMAND_READ_BUFFER);
+
+	task->buffers[0].handle = buffer->handle;
+	task->buffers[0].mode = STARPU_R;
+	task->cl = &codelet_readbuffer;
+
+	arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
+	arg->offset = offset;
+	arg->cb = cb;
+	arg->ptr = ptr;
+	task->cl_arg = arg;
+	task->cl_arg_size = sizeof(struct arg_readbuffer);
+
+	gc_entity_store(&arg->buffer, buffer);
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
+
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueReadBuffer(cl_command_queue  cq,
                     cl_mem              buffer,
@@ -77,33 +108,14 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
                     const cl_event *    events,
                     cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 { 
-   struct starpu_task *task;
-   struct arg_readbuffer *arg;
-   cl_event ev;
-
-   task = task_create(CL_COMMAND_READ_BUFFER);
-   ev = task_event(task);
-
-   task->buffers[0].handle = buffer->handle;
-   task->buffers[0].mode = STARPU_R;
-   task->cl = &codelet_readbuffer;
-
-   arg = (struct arg_readbuffer*)malloc(sizeof(struct arg_readbuffer));
-   arg->offset = offset;
-   arg->cb = cb;
-   arg->ptr = ptr;
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct arg_readbuffer);
-
-   gc_entity_store(&arg->buffer, buffer);
 
-   task->synchronous = (blocking == CL_TRUE);
+	command_read_buffer cmd = command_read_buffer_create(buffer, offset, cb, ptr);
 
-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+	RETURN_EVENT(cmd, event);
 
-   RETURN_EVENT(ev, event);
+	MAY_BLOCK(blocking);
 
-   return ret;
+	return CL_SUCCESS;
 }

+ 4 - 16
socl/src/cl_enqueuetask.c

@@ -16,14 +16,6 @@
 
 #include "socl.h"
 
-static cl_uint work_dim = 3;
-static const size_t global_work_offset[3] = {0,0,0};
-static const size_t global_work_size[3] = {1,1,1};
-static const size_t * local_work_size = NULL;
-
-CL_API_ENTRY cl_int CL_API_CALL
-soclEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) CL_API_SUFFIX__VERSION_1_0;
-
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueTask(cl_command_queue cq,
               cl_kernel         kernel,
@@ -31,15 +23,11 @@ soclEnqueueTask(cl_command_queue cq,
               const cl_event *  events,
               cl_event *        event) CL_API_SUFFIX__VERSION_1_0
 {
-	node_enqueue_kernel n;
-
-	n = graph_create_enqueue_kernel(1, cq, kernel, work_dim, global_work_offset, global_work_size,
-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
-		kernel->arg_type, kernel->arg_value);
+	command_ndrange_kernel cmd = command_task_create(kernel);
 	
-	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
-	node_play_enqueue_kernel(n);
+	command_queue_enqueue(cq, cmd, num_events, events);
+
+	RETURN_EVENT(cmd, event);
 
-	//graph_store(n);
 	return CL_SUCCESS;
 }

+ 18 - 14
socl/src/cl_enqueueunmapmemobject.c

@@ -16,27 +16,31 @@
 
 #include "socl.h"
 
+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd) {
+	/* Aliases */
+	cl_mem buffer = cmd->buffer;
+
+	//FIXME: use a callback
+	starpu_task task = task_create_cpu((void(*)(void*))starpu_data_release, buffer->handle, 0);
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueUnmapMemObject(cl_command_queue cq,
-                        cl_mem            memobj,
-                        void *            UNUSED(mapped_ptr),
+                        cl_mem            buffer,
+                        void *            ptr,
                         cl_uint           num_events,
                         const cl_event *  events,
                         cl_event *        event) CL_API_SUFFIX__VERSION_1_0
 {
-   struct starpu_task *task;
-   cl_int err;
-   cl_event ev;
-
-   /* Create StarPU task */
-   task = task_create_cpu(CL_COMMAND_UNMAP_MEM_OBJECT, (void(*)(void*))starpu_data_release, memobj->handle, 0);
-   ev = task_event(task);
-
-   DEBUG_MSG("Submitting UnmapBuffer task (event %d)\n", task->tag_id);
+	command_unmap_mem_object cmd = command_unmap_mem_object_create(buffer, ptr);
 
-   err = command_queue_enqueue(cq, task, 0, num_events, events);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-   RETURN_EVENT(ev, event);
+	RETURN_EVENT(cmd, event);
 
-   return err;
+	return CL_SUCCESS;
 }

+ 3 - 4
socl/src/cl_enqueuewaitforevents.c

@@ -22,10 +22,9 @@ soclEnqueueWaitForEvents(cl_command_queue cq,
                        const cl_event * events) CL_API_SUFFIX__VERSION_1_0
 {
 
-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_WAIT_FOR_EVENTS doesn't exist
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
+	command_marker cmd = command_marker_create();
 
-   command_queue_enqueue(cq, task, 0, num_events, events);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-   return CL_SUCCESS;
+	return CL_SUCCESS;
 }

+ 42 - 33
socl/src/cl_enqueuewritebuffer.c

@@ -68,6 +68,43 @@ static starpu_codelet codelet_writebuffer = {
    .nbuffers = 1
 };
 
+cl_int command_write_buffer_submit(command_write_buffer cmd) {
+	/* Aliases */
+	cl_mem buffer = cmd->buffer;
+	size_t offset = cmd->offset;
+	size_t cb = cmd->cb;
+	const void * ptr = cmd->ptr;
+
+	struct starpu_task *task;
+	struct arg_writebuffer *arg;
+
+	task = task_create(CL_COMMAND_WRITE_BUFFER);
+
+	task->buffers[0].handle = buffer->handle;
+	//If only a subpart of the buffer is written, RW access mode is required
+	if (cb != buffer->size)
+		task->buffers[0].mode = STARPU_RW;
+	else 
+		task->buffers[0].mode = STARPU_W;
+	task->cl = &codelet_writebuffer;
+
+	arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
+	arg->offset = offset;
+	arg->cb = cb;
+	arg->ptr = ptr;
+	task->cl_arg = arg;
+	task->cl_arg_size = sizeof(struct arg_writebuffer);
+
+	gc_entity_store(&arg->buffer, buffer);
+
+	//The buffer now contains meaningful data
+	arg->buffer->scratch = 0;
+
+	task_submit(task, cmd);
+
+	return CL_SUCCESS;
+}
+
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueWriteBuffer(cl_command_queue cq, 
                      cl_mem             buffer, 
@@ -79,41 +116,13 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
                      const cl_event *   events, 
                      cl_event *         event) CL_API_SUFFIX__VERSION_1_0
 { 
-   struct starpu_task *task;
-   struct arg_writebuffer *arg;
-   cl_event ev;
-
-   task = task_create(CL_COMMAND_WRITE_BUFFER);
-   ev = task_event(task);
-
-   task->buffers[0].handle = buffer->handle;
-   //If only a subpart of the buffer is written, RW access mode is required
-   if (cb != buffer->size)
-      task->buffers[0].mode = STARPU_RW;
-   else 
-      task->buffers[0].mode = STARPU_W;
-   task->cl = &codelet_writebuffer;
-
-   arg = (struct arg_writebuffer*)malloc(sizeof(struct arg_writebuffer));
-   arg->offset = offset;
-   arg->cb = cb;
-   arg->ptr = ptr;
-   task->cl_arg = arg;
-   task->cl_arg_size = sizeof(struct arg_writebuffer);
-
-   gc_entity_store(&arg->buffer, buffer);
-
-   //The buffer now contains meaningful data
-   arg->buffer->scratch = 0;
-
-   task->synchronous = (blocking == CL_TRUE);
+	command_write_buffer cmd = command_write_buffer_create(buffer, offset, cb, ptr);
 
-   DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events,events);
+	RETURN_EVENT(cmd, event);
 
-   /* Return retained event if required by user */
-   RETURN_EVENT(ev,event);
+	MAY_BLOCK(blocking);
 
-   return ret;
+	return CL_SUCCESS;
 }

+ 9 - 6
socl/src/cl_finish.c

@@ -17,11 +17,14 @@
 #include "socl.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
-soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
-{
-   cl_event ev = enqueueBarrier(cq);
-   soclWaitForEvents(1, &ev);
-   gc_entity_release(ev);
+soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0 {
 
-   return CL_SUCCESS;
+	command_marker cmd = command_barrier_create();
+
+	command_queue_enqueue(cq, cmd, 0, NULL);
+		cl_event ev = command_event_get(cmd);
+
+	MAY_BLOCK(CL_TRUE)
+
+	return CL_SUCCESS;
 }

+ 2 - 1
socl/src/cl_geteventinfo.c

@@ -17,6 +17,7 @@
 #include "socl.h"
 #include "getinfo.h"
 
+
 CL_API_ENTRY cl_int CL_API_CALL
 soclGetEventInfo(cl_event       event,
                cl_event_info    param_name,
@@ -33,7 +34,7 @@ soclGetEventInfo(cl_event       event,
 
    switch (param_name) {
       INFO_CASE(CL_EVENT_COMMAND_QUEUE, event->cq);
-      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->type);
+      INFO_CASE(CL_EVENT_COMMAND_TYPE, event->command->typ);
       INFO_CASE(CL_EVENT_COMMAND_EXECUTION_STATUS, event->status);
       INFO_CASE(CL_EVENT_REFERENCE_COUNT, event->_entity.refs);
       default:

+ 1 - 1
socl/src/cl_getkernelinfo.c

@@ -29,7 +29,7 @@ soclGetKernelInfo(cl_kernel       kernel,
 
    switch (param_name) {
       INFO_CASE_EX(CL_KERNEL_FUNCTION_NAME, kernel->kernel_name, strlen(kernel->kernel_name)+1)
-      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->arg_count)
+      INFO_CASE(CL_KERNEL_NUM_ARGS, kernel->num_args)
       INFO_CASE(CL_KERNEL_REFERENCE_COUNT, kernel->_entity.refs)
       INFO_CASE(CL_KERNEL_PROGRAM, kernel->program)
       INFO_CASE(CL_KERNEL_CONTEXT, kernel->program->context)

+ 5 - 3
socl/src/cl_setkernelarg.c

@@ -25,7 +25,7 @@ soclSetKernelArg(cl_kernel  kernel,
    if (kernel == NULL)
       return CL_INVALID_KERNEL;
 
-   if (arg_index >= kernel->arg_count)
+   if (arg_index >= kernel->num_args)
       return CL_INVALID_ARG_INDEX;
 
    //FIXME: we don't return CL_INVALID_ARG_VALUE if "arg_value is NULL for an argument that is not declared with __local qualifier or vice-versa"
@@ -38,7 +38,8 @@ soclSetKernelArg(cl_kernel  kernel,
          break;
       case Buffer:
          kernel->arg_type[arg_index] = Null;
-         gc_entity_unstore((cl_mem*)&kernel->arg_value[arg_index]);
+         gc_entity_unstore((cl_mem*)kernel->arg_value[arg_index]);
+	 free(kernel->arg_value[arg_index]);
          kernel->arg_value[arg_index] = NULL;
          break;
       case Immediate:
@@ -60,7 +61,8 @@ soclSetKernelArg(cl_kernel  kernel,
       if ((arg_size == sizeof(cl_mem)) && ((buf = mem_object_fetch(arg_value)) != NULL)) {
          DEBUG_MSG("Found buffer %d \n", buf->id);
          kernel->arg_type[arg_index] = Buffer;
-         gc_entity_store(&kernel->arg_value[arg_index], buf);
+         kernel->arg_value[arg_index] = malloc(sizeof(void*));
+	 gc_entity_store((cl_mem*)kernel->arg_value[arg_index], buf);
       }
       else {
          /* Argument must be an immediate buffer  */

+ 6 - 1
socl/src/cl_waitforevents.c

@@ -23,7 +23,12 @@ soclWaitForEvents(cl_uint           num_events,
    unsigned int i;
    DEBUG_MSG("Waiting for events: ");
    for (i=0; i<num_events; i++) {
-      DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
+   	command_graph_dump(event_list[i]->command);
+
+   	/* We need to submit commands if it's not already done */
+	command_submit_deep(event_list[i]->command);
+
+      	DEBUG_MSG_NOHEAD("%d ", event_list[i]->id);
    }
    DEBUG_MSG_NOHEAD("\n");
 

+ 235 - 0
socl/src/command.c

@@ -0,0 +1,235 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+void command_init_ex(cl_command cmd, cl_command_type typ) {
+	cmd->typ = typ;
+	cmd->num_events = 0;
+	cmd->events = NULL;
+	cmd->event = event_create();
+	cmd->event->command = cmd;
+	cmd->cq = NULL;
+	cmd->task = NULL;
+	cmd->submitted = 0;
+}
+
+
+void command_submit_ex(cl_command cmd) {
+#define SUBMIT(typ,name) case typ:\
+	name##_submit((name)cmd);\
+	break;
+
+	assert(cmd->submitted == 0);
+
+	switch(cmd->typ) {
+		SUBMIT(CL_COMMAND_NDRANGE_KERNEL, command_ndrange_kernel)
+		SUBMIT(CL_COMMAND_TASK, command_ndrange_kernel)
+		SUBMIT(CL_COMMAND_READ_BUFFER, command_read_buffer)
+		SUBMIT(CL_COMMAND_WRITE_BUFFER, command_write_buffer)
+		SUBMIT(CL_COMMAND_COPY_BUFFER, command_copy_buffer)
+		SUBMIT(CL_COMMAND_MAP_BUFFER, command_map_buffer)
+		SUBMIT(CL_COMMAND_UNMAP_MEM_OBJECT, command_unmap_mem_object)
+		SUBMIT(CL_COMMAND_MARKER, command_marker)
+		default:
+			ERROR_STOP("Trying to submit unknown command (type %x)", cmd->typ);
+	}
+
+	cmd->submitted = 1;
+#undef SUBMIT
+}
+
+cl_int command_submit_deep_ex(cl_command cmd) {
+	if (cmd->submitted == 1)
+		return CL_SUCCESS;
+	
+	/* We set this in order to avoid cyclic dependencies */
+	cmd->submitted = 1;
+
+	unsigned int i;
+	for (i=0; i<cmd->num_events; i++)
+		command_submit_deep(cmd->events[i]->command);
+	
+	cmd->submitted = 0;
+
+	command_submit_ex(cmd);
+
+	return CL_SUCCESS;
+}
+
+void command_graph_dump_ex(cl_command cmd) {
+
+	unsigned int i;
+	for (i=0; i<cmd->num_events; i++)
+		command_graph_dump_ex(cmd->events[i]->command);
+
+	printf("CMD %lx TYPE %d DEPS", cmd, cmd->typ);
+	for (i=0; i<cmd->num_events; i++)
+		printf(" %lx", cmd->events[i]->command);
+	printf("\n");
+
+}
+
+#define nullOrDup(name,size) cmd->name = memdup_safe(name,size)
+#define dup(name) cmd->name = name
+#define dupEntity(name) do { cmd->name = name; gc_entity_retain(name); } while (0);
+
+void soclEnqueueNDRangeKernel_task(void *descr[], void *args);
+
+command_ndrange_kernel command_ndrange_kernel_create (
+		cl_kernel        kernel,
+		cl_uint          work_dim,
+		const size_t *   global_work_offset,
+		const size_t *   global_work_size,
+		const size_t *   local_work_size)
+{
+	command_ndrange_kernel cmd = malloc(sizeof(struct command_ndrange_kernel_t));
+	command_init(cmd, CL_COMMAND_NDRANGE_KERNEL);
+
+	dupEntity(kernel);
+	dup(work_dim);
+	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
+	nullOrDup(global_work_size, work_dim*sizeof(size_t));
+	nullOrDup(local_work_size, work_dim*sizeof(size_t));
+
+   	/* Codelet */
+   	cmd->codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
+	starpu_codelet * codelet = cmd->codelet;
+	codelet->where = STARPU_OPENCL;
+	codelet->power_model = NULL;
+	codelet->opencl_func = &soclEnqueueNDRangeKernel_task;
+	codelet->model = NULL;
+
+   	/* Kernel is mutable, so we duplicate its parameters... */
+	cmd->num_args = kernel->num_args;
+	cmd->arg_sizes = memdup(kernel->arg_size, sizeof(size_t) * kernel->num_args);
+	cmd->arg_types = memdup(kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->num_args);
+	cmd->args = memdup_deep_varsize_safe(kernel->arg_value, kernel->num_args, kernel->arg_size);
+
+	return cmd;
+}
+
+command_ndrange_kernel command_task_create (cl_kernel kernel) {
+
+	static cl_uint task_work_dim = 3;
+	static const size_t task_global_work_offset[3] = {0,0,0};
+	static const size_t task_global_work_size[3] = {1,1,1};
+	static const size_t * task_local_work_size = NULL;
+
+	command_ndrange_kernel cmd = command_ndrange_kernel_create(
+			kernel, task_work_dim, task_global_work_offset,
+			task_global_work_size, task_local_work_size);
+
+	/* This is the only difference with command_ndrange_kernel_create */
+	cmd->_command.typ = CL_COMMAND_TASK;
+
+	return cmd;
+}
+
+command_marker command_barrier_create () {
+
+	command_marker cmd = malloc(sizeof(struct command_marker_t));
+	command_init(cmd, CL_COMMAND_BARRIER);
+
+	return cmd;
+}
+
+command_marker command_marker_create () {
+
+	command_marker cmd = malloc(sizeof(struct command_marker_t));
+	command_init(cmd, CL_COMMAND_MARKER);
+
+	return cmd;
+}
+
+command_map_buffer command_map_buffer_create(
+		cl_mem buffer,
+		cl_map_flags map_flags,
+		size_t offset,
+		size_t cb,
+		cl_event event
+		) {
+
+	command_map_buffer cmd = malloc(sizeof(struct command_map_buffer_t));
+	command_init(cmd, CL_COMMAND_MAP_BUFFER);
+
+	dupEntity(buffer);
+	dup(map_flags);
+	dup(offset);
+	dup(cb);
+	dupEntity(event);
+
+	return cmd;
+}
+
+command_unmap_mem_object command_unmap_mem_object_create(cl_mem buffer, void * ptr) {
+	command_unmap_mem_object cmd = malloc(sizeof(struct command_unmap_mem_object_t));
+	command_init(cmd, CL_COMMAND_UNMAP_MEM_OBJECT);
+
+	dupEntity(buffer);
+	dup(ptr);
+
+	return cmd;
+}
+
+command_read_buffer command_read_buffer_create(cl_mem buffer, size_t offset, size_t cb, void * ptr) {
+
+	command_read_buffer cmd = malloc(sizeof(struct command_read_buffer_t));
+	command_init(cmd, CL_COMMAND_READ_BUFFER);
+
+	dupEntity(buffer);
+	dup(offset);
+	dup(cb);
+	dup(ptr);
+
+	return cmd;
+}
+
+command_write_buffer command_write_buffer_create(cl_mem buffer, size_t offset, size_t cb, const void * ptr) {
+
+	command_write_buffer cmd = malloc(sizeof(struct command_write_buffer_t));
+	command_init(cmd, CL_COMMAND_WRITE_BUFFER);
+
+	dupEntity(buffer);
+	dup(offset);
+	dup(cb);
+	dup(ptr);
+
+	return cmd;
+}
+
+command_copy_buffer command_copy_buffer_create( cl_mem src_buffer, cl_mem dst_buffer,
+		size_t src_offset, size_t dst_offset, size_t cb)
+{
+	command_copy_buffer cmd = malloc(sizeof(struct command_copy_buffer_t));
+	command_init(cmd, CL_COMMAND_COPY_BUFFER);
+
+	dupEntity(src_buffer);
+	dupEntity(dst_buffer);
+	dup(src_offset);
+	dup(dst_offset);
+	dup(cb);
+
+	return cmd;
+}
+
+#undef nullOrDup
+#undef nodeNullOrDup
+#undef dup
+#undef dupEntity
+#undef nodeDup
+#undef memdup
+

+ 198 - 0
socl/src/command.h

@@ -0,0 +1,198 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef SOCL_COMMANDS_H
+#define SOCL_COMMANDS_H
+
+typedef struct cl_command_t * cl_command;
+
+/**
+ * Initialize a command structure
+ *
+ * Command constructors for each kind of command use this method
+ * Implicit and explicit dependencies must be passed as parameters
+ */
+void command_init_ex(cl_command cmd, cl_command_type typ);
+#define command_init(cmd,typ) \
+	command_init_ex((cl_command)cmd,typ)
+
+/** Submit a command for execution */
+void command_submit_ex(cl_command cmd);
+#define command_submit(cmd) \
+	command_submit_ex(&(cmd)->_command)
+
+/** Submit a command and its dependencies */
+cl_int command_submit_deep_ex(cl_command cmd);
+#define command_submit_deep(cmd) (command_submit_deep_ex((cl_command)cmd))
+
+void command_graph_dump_ex(cl_command cmd);
+#define command_graph_dump(cmd) (command_graph_dump_ex((cl_command)cmd))
+
+/**************************
+ * OpenCL Commands
+ **************************/
+struct cl_command_t {
+	cl_command_type	typ;	 	/* Command type */
+	cl_uint 	num_events;	/* Number of dependencies */
+	cl_event * 	events;		/* Dependencies */
+	cl_event  	event;		/* Event for this command */
+	cl_command_queue cq;		/* Command queue the command is enqueued in */
+	starpu_task	task;		/* Associated StarPU task, if any */
+	char		submitted;	/* True if the command has been submitted to StarPU */
+};
+
+#define command_type_get(cmd) (((cl_command)cmd)->typ)
+#define command_event_get(cmd) (((cl_command)cmd)->event)
+#define command_num_events_get(cmd) (((cl_command)cmd)->num_events)
+#define command_events_get(cmd) (((cl_command)cmd)->events)
+#define command_task_get(cmd) (((cl_command)cmd)->task)
+#define command_cq_get(cmd) (((cl_command)cmd)->cq)
+
+#define CL_COMMAND struct cl_command_t _command;
+
+typedef struct command_ndrange_kernel_t {
+	CL_COMMAND
+
+	cl_kernel        kernel;
+	cl_uint          work_dim;
+	const size_t *   global_work_offset;
+	const size_t *   global_work_size;
+	const size_t *   local_work_size;
+	cl_uint 	 num_args;
+	size_t *	 arg_sizes;
+	enum kernel_arg_type * arg_types;
+	void **		 args;
+	starpu_codelet * codelet;
+	cl_uint		 num_buffers;
+	cl_mem *	 buffers;
+} * command_ndrange_kernel;
+
+
+typedef struct command_read_buffer_t {
+	CL_COMMAND
+	
+	cl_mem buffer;
+	size_t offset;
+	size_t cb;
+	void * ptr;
+} * command_read_buffer;
+
+
+typedef struct command_write_buffer_t {
+	CL_COMMAND
+
+	cl_mem buffer;
+	size_t offset;
+	size_t cb;
+	const void * ptr;
+} * command_write_buffer;
+
+
+typedef struct command_copy_buffer_t {
+	CL_COMMAND
+	
+	cl_mem src_buffer;
+	cl_mem dst_buffer;
+	size_t src_offset;
+	size_t dst_offset;
+	size_t cb;
+} * command_copy_buffer;
+
+
+typedef struct command_map_buffer_t {
+	CL_COMMAND
+
+	cl_mem buffer;
+	cl_map_flags map_flags;
+	size_t offset;
+	size_t cb;
+	cl_event event;
+} * command_map_buffer;
+
+
+typedef struct command_unmap_mem_object_t {
+	CL_COMMAND
+
+	cl_mem buffer;
+	void * ptr;
+} * command_unmap_mem_object;
+
+
+typedef struct command_marker_t {
+	CL_COMMAND
+} * command_marker;
+
+/*************************
+ * Constructor functions
+ *************************/
+
+command_ndrange_kernel command_ndrange_kernel_create (
+		cl_kernel        kernel,
+		cl_uint          work_dim,
+		const size_t *   global_work_offset,
+		const size_t *   global_work_size,
+		const size_t *   local_work_size);
+
+command_ndrange_kernel command_task_create (cl_kernel kernel);
+
+command_marker command_barrier_create ();
+
+command_marker command_marker_create ();
+
+command_map_buffer command_map_buffer_create(
+		cl_mem buffer,
+		cl_map_flags map_flags,
+		size_t offset,
+		size_t cb,
+		cl_event event);
+
+command_unmap_mem_object command_unmap_mem_object_create(
+		cl_mem buffer,
+		void * ptr);
+
+command_read_buffer command_read_buffer_create(
+		cl_mem buffer,
+		size_t offset,
+		size_t cb,
+		void * ptr);
+
+command_write_buffer command_write_buffer_create(
+		cl_mem buffer,
+		size_t offset,
+		size_t cb,
+		const void * ptr);
+
+command_copy_buffer command_copy_buffer_create(
+		cl_mem src_buffer,
+		cl_mem dst_buffer,
+		size_t src_offset,
+		size_t dst_offset,
+		size_t cb);
+
+/*************************
+ * Submit functions
+ *************************/
+cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd);
+cl_int command_read_buffer_submit(command_read_buffer cmd);
+cl_int command_write_buffer_submit(command_write_buffer cmd);
+cl_int command_copy_buffer_submit(command_copy_buffer cmd);
+cl_int command_map_buffer_submit(command_map_buffer cmd);
+cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd);
+cl_int command_marker_submit(command_marker cmd);
+
+
+#endif /* SOCL_COMMANDS_H */
+

+ 40 - 0
socl/src/command_list.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+command_list command_list_cons(cl_command cmd, command_list ls) {
+	command_list e = malloc(sizeof(struct command_list_t));
+	e->cmd = cmd;
+	e->next = ls;
+	if (ls != NULL)
+		ls->prev = e;
+	return e;
+}
+
+command_list command_list_remove(command_list l, cl_command cmd) {
+	command_list e = l;
+	while (e != NULL) {
+		if (e->cmd == cmd) {
+			if (e->prev != NULL) e->prev->next = e->next;
+			if (e->next != NULL) e->next->prev = e->prev;
+			command_list next = e->next;
+			free(e);
+			if (e == l) return next;
+		}
+	}
+	return l;
+}

+ 28 - 0
socl/src/command_list.h

@@ -0,0 +1,28 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+typedef struct command_list_t * command_list;
+
+struct command_list_t {
+	cl_command cmd;
+	command_list next;
+	command_list prev;
+};
+
+command_list command_list_cons(cl_command cmd, command_list ls);
+command_list command_list_remove(command_list l, cl_command cmd);

+ 124 - 53
socl/src/command_queue.c

@@ -24,74 +24,145 @@
  * its command queue.
  */
 
+
 /**
- * Enqueue the given task but put fake_event into the command queue.
- * This is used when a tag notified by application is used (cf clEnqueueMapBuffer, etc.)
+ * Returned implicit dependencies for a task
+ * Command queue must be locked!
  */
-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event) {
-
-  int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
-
-  /* Set explicit task dependencies */
-  task_dependency_add(task, num_events, events);
-
-  /* Lock command queue */
-  pthread_spin_lock(&cq->spin);
+void command_queue_dependencies_implicit(
+	cl_command_queue cq, 	/* Command queue */
+	char is_barrier,	/* Is the task a barrier */
+	cl_int * ret_num_events,	/* Returned number of dependencies */
+	cl_event ** ret_events	/* Returned dependencies */
+) {
+
+	/*********************
+	 * Count dependencies
+	 *********************/
+	int ndeps = 0;
+
+	/* Add dependency to last barrier if applicable */
+	if (cq->barrier != NULL)
+		ndeps++;
+
+	/* Add dependencies to out-of-order events (if any) */
+	if (is_barrier) {
+		command_list cl = cq->commands;
+		while (cl != NULL) {
+			ndeps++;
+			cl = cl->next;
+		}
+	}
+
+	/*********************
+	 * Return dependencies
+	 *********************/
+
+	cl_event * evs = malloc(ndeps * sizeof(cl_event));
+	int n = 0;
+
+	/* Add dependency to last barrier if applicable */
+	if (cq->barrier != NULL)
+		evs[n++] = cq->barrier->event;
+
+	/* Add dependencies to out-of-order events (if any) */
+	if (is_barrier) {
+		command_list cl = cq->commands;
+		while (cl != NULL) {
+			evs[n++] = cl->cmd->event;
+			cl = cl->next;
+		}
+	}
+
+	*ret_num_events = ndeps;
+	*ret_events = evs;
+}
+	
+/**
+ * Insert a command in the command queue
+ * The command queue must be locked!
+ */
+void command_queue_insert(
+	cl_command_queue 	cq, 	/* Command queue */
+	cl_command 		cmd,	/* Command */
+	int 			is_barrier		/* Is the task a barrier */
+) {
 
-  /* Add dependency to last barrier if applicable */
-  if (cq->barrier != NULL)
-    task_dependency_add(task, 1, &cq->barrier);
+	int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
 
-  /* Add dependencies to out-of-order events (if any) */
-  if (barrier) {
-    while (cq->events != NULL) {
-      task_dependency_add(task, 1, &cq->events);
-      cq->events = cq->events->next;
-    }
-  }
+	if (is_barrier)
+		cq->commands = NULL;
 
-  cl_event ev = (fake_event == NULL ? task_event(task) : fake_event);
+	/* Add command to the list of out-of-order commands */
+	if (!in_order)
+		cq->commands = command_list_cons(cmd, cq->commands);
 
-  /* Add event to the list of out-of-order events */
-  if (!in_order) {
-    ev->next = cq->events;
-    ev->prev = NULL;
-    if (cq->events != NULL)
-      cq->events->prev = ev;
-    cq->events = ev;
-  }
+	/* Register this event as last barrier */
+	if (is_barrier || in_order)
+		cq->barrier = cmd;
 
-  /* Register this event as last barrier */
-  if (barrier || in_order)
-    cq->barrier = ev;
+	/* Add reference to the command queue */
+	gc_entity_store(&cmd->event->cq, cq);
+}
 
-   /* Unlock command queue */
-   pthread_spin_unlock(&cq->spin);
+/**
+ * Return implicit and explicit dependencies for a task
+ * The command queue must be locked!
+ */
+void command_queue_dependencies(
+	cl_command_queue 	cq,		/* Command queue */
+	int 			is_barrier,	/* Is the task a barrier */
+	cl_int 			num_events,	/* Number of explicit dependencies */
+	const cl_event *	events,		/* Explicit dependencies */
+	cl_int * 		ret_num_events,	/* Returned number of dependencies */
+	cl_event ** 		ret_events	/* Returned dependencies */
+) {
+	cl_int implicit_num_events;
+	cl_event * implicit_events;
+
+	/* Implicit dependencies */
+	command_queue_dependencies_implicit(cq, is_barrier, &implicit_num_events, &implicit_events);
+
+	/* Explicit dependencies */
+	cl_int ndeps = implicit_num_events + num_events;
+	cl_event * evs = malloc(sizeof(cl_event) * ndeps);
+	memcpy(evs, implicit_events, sizeof(cl_event) * implicit_num_events);
+	memcpy(&evs[implicit_num_events], events, sizeof(cl_event) * num_events);
+
+	*ret_num_events = ndeps;
+	*ret_events = evs;
+}
 
-   /* Add reference to the command queue */
-   gc_entity_store(&ev->cq, cq);
+void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_events, const cl_event * events) {
 
-   /* Submit task */
-   gc_entity_retain(task_event(task));
-   int ret = starpu_task_submit(task);
-   if (ret != 0)
-      DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
+	/* Check if the command is a barrier */
+	int is_barrier = 0;
+	if (cmd->typ == CL_COMMAND_BARRIER) {
+		is_barrier = 1;
+		/* OpenCL has no CL_COMMAND_BARRIER type, so we fall back on CL_COMMAND_MARKER */
+		cmd->typ = CL_COMMAND_MARKER;
+	}
 
-   return CL_SUCCESS;
-}
+	/* Set command queue field */
+	cmd->cq = cq;
 
-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events) {
-  return command_queue_enqueue_fakeevent(cq, task, barrier, num_events, events, NULL);
-}
+	/* Lock command queue */
+	pthread_mutex_lock(&cq->mutex);
 
+	//FIXME: crappy separation (command_queue_dependencies + command_queue_insert)
 
-cl_event enqueueBarrier(cl_command_queue cq) {
+	/* Get all (explicit + implicit) dependencies */
+	cl_int all_num_events;
+	cl_event * all_events;
+	command_queue_dependencies(cq, is_barrier, num_events, events, &all_num_events, &all_events);
 
-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_BARRIER doesn't exist
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
+	/* Make all dependencies explicit for the command */
+	cmd->num_events = all_num_events;
+	cmd->events = all_events;
 
-   DEBUG_MSG("Submitting barrier task (event %d)\n", task->tag_id);
-   command_queue_enqueue(cq, task, 1, 0, NULL);
+	/* Insert command in the queue */
+	command_queue_insert(cq, cmd, is_barrier);
 
-   return task_event(task);
+	/* Unlock command queue */
+	pthread_mutex_unlock(&cq->mutex);
 }

+ 8 - 4
socl/src/command_queue.h

@@ -17,10 +17,14 @@
 #ifndef SOCL_COMMAND_QUEUE_H
 #define SOCl_COMMAND_QUEUE_H
 
-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events);
+void command_queue_enqueue_ex(
+	cl_command_queue 	cq,		/* Command queue */
+	cl_command		cmd,		/* Command to enqueue */
+	cl_uint			num_events,	/* Number of explicit dependencies */
+	const cl_event *	events		/* Explicit dependencies */
+	);
 
-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event);
-
-cl_event enqueueBarrier(cl_command_queue cq);
+#define command_queue_enqueue(cq, cmd, num_events, events)\
+	command_queue_enqueue_ex(cq, (cl_command)cmd, num_events, events)
 
 #endif /* SOCl_COMMAND_QUEUE_H */

+ 76 - 0
socl/src/debug.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "socl.h"
+
+#ifdef STARPU_VERBOSE
+void DEBUG_CL(char *s, cl_int err) {
+   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
+   switch(err) {
+      case CL_SUCCESS:
+         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
+         break;
+      ERR_CASE(CL_DEVICE_NOT_FOUND)
+      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
+      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
+      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
+      ERR_CASE(CL_OUT_OF_RESOURCES)
+      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
+      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
+      ERR_CASE(CL_MEM_COPY_OVERLAP)
+      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
+      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
+      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
+      ERR_CASE(CL_MAP_FAILURE)
+      ERR_CASE(CL_INVALID_VALUE)
+      ERR_CASE(CL_INVALID_DEVICE_TYPE)
+      ERR_CASE(CL_INVALID_PLATFORM)
+      ERR_CASE(CL_INVALID_DEVICE)
+      ERR_CASE(CL_INVALID_CONTEXT)
+      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
+      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
+      ERR_CASE(CL_INVALID_HOST_PTR)
+      ERR_CASE(CL_INVALID_MEM_OBJECT)
+      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
+      ERR_CASE(CL_INVALID_IMAGE_SIZE)
+      ERR_CASE(CL_INVALID_SAMPLER)
+      ERR_CASE(CL_INVALID_BINARY)
+      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
+      ERR_CASE(CL_INVALID_PROGRAM)
+      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
+      ERR_CASE(CL_INVALID_KERNEL_NAME)
+      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
+      ERR_CASE(CL_INVALID_KERNEL)
+      ERR_CASE(CL_INVALID_ARG_INDEX)
+      ERR_CASE(CL_INVALID_ARG_VALUE)
+      ERR_CASE(CL_INVALID_ARG_SIZE)
+      ERR_CASE(CL_INVALID_KERNEL_ARGS)
+      ERR_CASE(CL_INVALID_WORK_DIMENSION)
+      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
+      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
+      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
+      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
+      ERR_CASE(CL_INVALID_EVENT)
+      ERR_CASE(CL_INVALID_OPERATION)
+      ERR_CASE(CL_INVALID_GL_OBJECT)
+      ERR_CASE(CL_INVALID_BUFFER_SIZE)
+      ERR_CASE(CL_INVALID_MIP_LEVEL)
+      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
+      default:
+         DEBUG_MSG("%s CL error: Error message not supported by DEBUG_CL macro (%d).\n", s, err);
+   }
+}
+#endif

+ 4 - 57
socl/src/debug.h

@@ -17,10 +17,12 @@
 #ifndef SOCL_DEBUG_H
 #define SOCL_DEBUG_H
 
+#include <../src/common/config.h>
+
 #ifdef STARPU_VERBOSE
 #define DEBUG
 #include <stdio.h>
-   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__); } while (0);
+   #define DEBUG_MSG(...) do { fprintf(stderr, "[SOCL] [%s] ", __func__); fprintf(stderr, __VA_ARGS__);} while (0);
    #define DEBUG_MSG_NOHEAD(...) fprintf(stderr, __VA_ARGS__)
    #define DEBUG_ERROR(...) do { fprintf(stderr, "[SOCL] ERROR: "__VA_ARGS__); exit(1); } while (0);
 #else
@@ -35,62 +37,7 @@
 #define ERROR_STOP(...) do { ERROR_MSG(__VA_ARGS__); exit(1); } while(0);
 
 #ifdef STARPU_VERBOSE
-void DEBUG_CL(char *s, cl_int err) {
-   #define ERR_CASE(a) case a: DEBUG_MSG("[OpenCL] %s CL error: %s\n", s, #a); break;
-   switch(err) {
-      case CL_SUCCESS:
-         DEBUG_MSG("[OpenCL] %s SUCCESS.\n", s);
-         break;
-      ERR_CASE(CL_DEVICE_NOT_FOUND)
-      ERR_CASE(CL_DEVICE_NOT_AVAILABLE)
-      ERR_CASE(CL_COMPILER_NOT_AVAILABLE)
-      ERR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
-      ERR_CASE(CL_OUT_OF_RESOURCES)
-      ERR_CASE(CL_OUT_OF_HOST_MEMORY)
-      ERR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
-      ERR_CASE(CL_MEM_COPY_OVERLAP)
-      ERR_CASE(CL_IMAGE_FORMAT_MISMATCH)
-      ERR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
-      ERR_CASE(CL_BUILD_PROGRAM_FAILURE)
-      ERR_CASE(CL_MAP_FAILURE)
-      ERR_CASE(CL_INVALID_VALUE)
-      ERR_CASE(CL_INVALID_DEVICE_TYPE)
-      ERR_CASE(CL_INVALID_PLATFORM)
-      ERR_CASE(CL_INVALID_DEVICE)
-      ERR_CASE(CL_INVALID_CONTEXT)
-      ERR_CASE(CL_INVALID_QUEUE_PROPERTIES)
-      ERR_CASE(CL_INVALID_COMMAND_QUEUE)
-      ERR_CASE(CL_INVALID_HOST_PTR)
-      ERR_CASE(CL_INVALID_MEM_OBJECT)
-      ERR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
-      ERR_CASE(CL_INVALID_IMAGE_SIZE)
-      ERR_CASE(CL_INVALID_SAMPLER)
-      ERR_CASE(CL_INVALID_BINARY)
-      ERR_CASE(CL_INVALID_BUILD_OPTIONS)
-      ERR_CASE(CL_INVALID_PROGRAM)
-      ERR_CASE(CL_INVALID_PROGRAM_EXECUTABLE)
-      ERR_CASE(CL_INVALID_KERNEL_NAME)
-      ERR_CASE(CL_INVALID_KERNEL_DEFINITION)
-      ERR_CASE(CL_INVALID_KERNEL)
-      ERR_CASE(CL_INVALID_ARG_INDEX)
-      ERR_CASE(CL_INVALID_ARG_VALUE)
-      ERR_CASE(CL_INVALID_ARG_SIZE)
-      ERR_CASE(CL_INVALID_KERNEL_ARGS)
-      ERR_CASE(CL_INVALID_WORK_DIMENSION)
-      ERR_CASE(CL_INVALID_WORK_GROUP_SIZE)
-      ERR_CASE(CL_INVALID_WORK_ITEM_SIZE)
-      ERR_CASE(CL_INVALID_GLOBAL_OFFSET)
-      ERR_CASE(CL_INVALID_EVENT_WAIT_LIST)
-      ERR_CASE(CL_INVALID_EVENT)
-      ERR_CASE(CL_INVALID_OPERATION)
-      ERR_CASE(CL_INVALID_GL_OBJECT)
-      ERR_CASE(CL_INVALID_BUFFER_SIZE)
-      ERR_CASE(CL_INVALID_MIP_LEVEL)
-      ERR_CASE(CL_INVALID_GLOBAL_WORK_SIZE)
-      default:
-         DEBUG_MSG("%s CL error: Error message not supported by print_cl_error (%d).\n", s, err);
-   }
-}
+void DEBUG_CL(char *s, cl_int err);
 #else
    #define DEBUG_CL(...) while(0);
 #endif

+ 13 - 15
socl/src/event.c

@@ -20,21 +20,24 @@
 
 static void release_callback_event(void * e);
 
+int event_unique_id() {
+   static int id = 1;
+
+   return __sync_fetch_and_add(&id,1);
+}
+
 /**
  * Create a new event
  *
  * Events have one-to-one relation with tag. Tag number is event ID
  */
 cl_event event_create(void) {
-   static int id = 1;
    cl_event ev;
    ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event);
 
-   ev->next = NULL;
-   ev->prev = NULL;
-   ev->id = __sync_fetch_and_add(&id,1);
+   ev->id = event_unique_id();
    ev->status = CL_SUBMITTED;
-   ev->type = 0;
+   ev->command = NULL;
    ev->profiling_info = NULL;
    ev->cq = NULL;
 
@@ -49,22 +52,17 @@ static void release_callback_event(void * e) {
   /* Remove from command queue */
   if (cq != NULL) {
     /* Lock command queue */
-    pthread_spin_lock(&cq->spin);
+    pthread_mutex_lock(&cq->mutex);
 
     /* Remove barrier if applicable */
-    if (cq->barrier == event)
+    if (cq->barrier == event->command)
       cq->barrier = NULL;
 
-    /* Remove from the list of out-of-order events */
-    if (event->prev != NULL)
-      event->prev->next = event->next;
-    if (event->next != NULL)
-      event->next->prev = event->prev;
-    if (cq->events == event)
-      cq->events = event->next;
+    /* Remove from the list of out-of-order commands */
+    cq->commands = command_list_remove(cq->commands, event->command);
 
     /* Unlock command queue */
-    pthread_spin_unlock(&cq->spin);
+    pthread_mutex_unlock(&cq->mutex);
 
     gc_entity_unstore(&cq);
   }

+ 5 - 0
socl/src/event.h

@@ -26,4 +26,9 @@
  */
 cl_event event_create(void);
 
+/**
+ * Generate a unique tag id
+ */
+int event_unique_id();
+
 #endif /* SOCL_EVENT_H */

+ 6 - 2
socl/src/gc.c

@@ -104,13 +104,15 @@ void gc_stop(void) {
   pthread_join(gc_thread, NULL);
 }
 
-void gc_entity_release_ex(entity e) {
+int gc_entity_release_ex(entity e) {
 
   /* Decrement reference count */
   int refs = __sync_sub_and_fetch(&e->refs, 1);
 
   if (refs != 0)
-    return;
+    return 0;
+
+  DEBUG_MSG("Releasing entity %lx\n", e);
 
   GC_LOCK;
 
@@ -127,6 +129,8 @@ void gc_entity_release_ex(entity e) {
   gc_list = e;
 
   GC_UNLOCK;
+
+  return 1;
 }
 
 

+ 1 - 1
socl/src/gc.h

@@ -27,7 +27,7 @@ void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*));
 void gc_entity_retain(void *arg);
 
 /** Decrement reference counter and release entity if applicable */
-void gc_entity_release_ex(entity e);
+int gc_entity_release_ex(entity e);
 
 int gc_active_entity_count(void);
 

+ 0 - 123
socl/src/graph.c

@@ -1,123 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010,2011 University of Bordeaux
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "socl.h"
-#include "graph.h"
-#include "event.h"
-
-static pthread_spinlock_t graph_lock;
-static graph_node graph_nodes = NULL;
-
-
-/**
- * Initialize graph structure
- */
-void graph_init(void) {
-	pthread_spin_init(&graph_lock, PTHREAD_PROCESS_PRIVATE);
-}
-
-/**
- * Release graph structure
- */
-void graph_destroy(void) {
-	pthread_spin_destroy(&graph_lock);
-}
-
-/**
- * Initialize a graph node
- */
-void graph_node_init(graph_node node) {
-	node->id = -1;
-	node->next = NULL;
-}
-
-/**
- * Store a node in the graph
- */
-void graph_store(void * node) {
-	pthread_spin_lock(&graph_lock);
-
-	graph_node n = (graph_node)node;
-	n->next = graph_nodes;
-	graph_nodes = n;
-
-	pthread_spin_unlock(&graph_lock);
-}
-
-
-
-/**
- * Duplicate a memory area into a fresh allocated buffer
- */
-static void * memdupa(const void *p, size_t size) {
-	void * s = malloc(size);
-	memcpy(s,p,size);
-	return s;
-}
-
-#define memdup(p, size) ((typeof(p))memdupa(p,size))
-#define nullOrDup(name,size) s->name = (name == NULL ? NULL : memdup(name,size))
-#define dup(name) s->name = name
-
-
-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
-		cl_command_queue cq,
-		cl_kernel        kernel,
-		cl_uint          work_dim,
-		const size_t *   global_work_offset,
-		const size_t *   global_work_size,
-		const size_t *   local_work_size,
-		cl_uint          num_events,
-		const cl_event * events,
-		cl_event *       event,
-		cl_uint 		num_args,
-		size_t *		arg_sizes,
-		enum kernel_arg_type * arg_types,
-		void **		args)
-{
-	node_enqueue_kernel s = malloc(sizeof(struct node_enqueue_kernel_t));
-	graph_node_init(&s->node);
-	s->node.id = NODE_ENQUEUE_KERNEL;
-
-	dup(is_task);
-	dup(cq);
-	dup(kernel);
-	dup(work_dim);
-	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
-	nullOrDup(global_work_size, work_dim*sizeof(size_t));
-	nullOrDup(local_work_size, work_dim*sizeof(size_t));
-	dup(num_events);
-	nullOrDup(events, num_events * sizeof(cl_event));
-	dup(num_args);
-	nullOrDup(arg_sizes, num_args * sizeof(size_t));
-	nullOrDup(arg_types, num_args * sizeof(enum kernel_arg_type));
-	nullOrDup(args, num_args * sizeof(void*));
-
-	
-	if (event != NULL) {
-		*event = event_create();
-		s->event = event;
-	}
-	else {
-		s->event = NULL;
-	}
-
-	return s;
-}
-
-#undef nullOrDup
-#undef memdup
-#undef dup

+ 0 - 73
socl/src/graph.h

@@ -1,73 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010,2011 University of Bordeaux
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef SOCL_GRAPH_H
-#define SOCL_GRAPH_H
-
-#include "socl.h"
-
-typedef struct graph_node_t * graph_node;
-
-struct graph_node_t {
-	int id; /* Kind of node */
-	graph_node next; /* Linked-list of nodes... */
-};
-
-void graph_init(void);
-void graph_destroy(void);
-void graph_node_init(graph_node node);
-void graph_store(void * node);
-
-#define NODE_ENQUEUE_KERNEL 1
-
-
-typedef struct node_enqueue_kernel_t {
-	struct graph_node_t node;
-
-	char 		 is_task; /* Set if clEnqueueTask is used */
-	cl_command_queue cq;
-	cl_kernel        kernel;
-	cl_uint          work_dim;
-	const size_t *   global_work_offset;
-	const size_t *   global_work_size;
-	const size_t *   local_work_size;
-	cl_uint          num_events;
-	const cl_event * events;
-	cl_event * 	 event;
-	cl_uint 	 num_args;
-	size_t *	 arg_sizes;
-	enum kernel_arg_type * arg_types;
-	void **		 args;
-} * node_enqueue_kernel;
-
-node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
-		cl_command_queue cq,
-		cl_kernel        kernel,
-		cl_uint          work_dim,
-		const size_t *   global_work_offset,
-		const size_t *   global_work_size,
-		const size_t *   local_work_size,
-		cl_uint          num_events,
-		const cl_event * events,
-		cl_event *       event,
-		cl_uint 		num_args,
-		size_t *		arg_sizes,
-		enum kernel_arg_type * arg_types,
-		void **		args);
-
-cl_int node_play_enqueue_kernel(node_enqueue_kernel n);
-
-#endif /* SOCL_GRAPH_H */

+ 0 - 4
socl/src/init.c

@@ -15,7 +15,6 @@
  */
 
 #include "socl.h"
-#include "graph.h"
 #include "gc.h"
 #include "mem_objects.h"
 
@@ -25,7 +24,6 @@
 __attribute__((constructor)) static void socl_init() {
   
   mem_object_init();
-  graph_init();
 
   starpu_init(NULL);
   
@@ -51,7 +49,5 @@ __attribute__((destructor)) static void socl_shutdown() {
   if (active_entities != 0)
     fprintf(stderr, "Unreleased entities: %d\n", active_entities);
 
-  graph_destroy();
-
   starpu_shutdown();
 }

+ 39 - 18
socl/src/socl.h

@@ -23,6 +23,9 @@
 #include CL_HEADERS "CL/cl.h"
 #endif
 
+/* Additional command type */
+#define CL_COMMAND_BARRIER 0x99987
+
 #include <string.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -35,7 +38,7 @@
 #include <starpu_profiling.h>
 #include <starpu_task.h>
 
-typedef struct starpu_task starpu_task;
+typedef struct starpu_task * starpu_task;
 
 #ifdef UNUSED
 #elif defined(__GNUC__)
@@ -50,12 +53,13 @@ typedef struct starpu_task starpu_task;
  */
 typedef struct entity * entity;
 
+#include "command.h"
+#include "command_list.h"
 #include "command_queue.h"
 #include "debug.h"
 #include "devices.h"
 #include "event.h"
 #include "gc.h"
-#include "graph.h"
 #include "mem_objects.h"
 #include "task.h"
 #include "util.h"
@@ -79,11 +83,32 @@ struct entity {
 
 struct _cl_platform_id {};
 
-#define RETURN_EVENT(ev, event) \
-   if (event != NULL) \
-      *event = ev; \
-   else\
-      gc_entity_release(ev);
+#define RETURN_EVENT(cmd, event) \
+	if (event != NULL) { \
+		cl_event ev = command_event_get(cmd);\
+		gc_entity_retain(ev);\
+		*event = ev; \
+	}
+
+#define RETURN_CUSTOM_EVENT(src, tgt) \
+	if (tgt != NULL) { \
+		gc_entity_retain(src); \
+		*tgt = src; \
+	}
+
+#define MAY_BLOCK(blocking) \
+	if ((blocking) == CL_TRUE) {\
+		cl_event ev = command_event_get(cmd);\
+		soclWaitForEvents(1, &ev);\
+		gc_entity_release(ev);\
+	}
+
+#define MAY_BLOCK_CUSTOM(blocking,event) \
+	if ((blocking) == CL_TRUE) {\
+		cl_event ev = (event);\
+		soclWaitForEvents(1, &ev);\
+		gc_entity_release(ev);\
+	}
 
 /* Constants */
 struct _cl_platform_id socl_platform;
@@ -121,14 +146,14 @@ struct _cl_command_queue {
   cl_device_id device;
   cl_context context;
 
-  /* Stored command events */
-  cl_event events;
+  /* Stored commands */
+  command_list commands;
 
   /* Last enqueued barrier-like event */
-  cl_event barrier;
+  cl_command barrier;
 
   /* Mutex */
-  pthread_spinlock_t spin;
+  pthread_mutex_t mutex;
 
   /* ID  */
 #ifdef DEBUG
@@ -142,12 +167,8 @@ struct _cl_event {
   /* Command queue */
   cl_command_queue cq;
 
-  /* Command type */
-  cl_command_type type;
-
-  /* Command queue list */
-  cl_event prev;
-  cl_event next;
+  /* Command */
+  cl_command command;
 
   /* Event status */
   cl_int status;
@@ -244,7 +265,7 @@ struct _cl_kernel {
   cl_int *errcodes;
 
   /* Arguments */
-  unsigned int arg_count;
+  unsigned int num_args;
   size_t *arg_size;
   enum kernel_arg_type  *arg_type;
   void  **arg_value;

+ 61 - 35
socl/src/task.c

@@ -18,64 +18,90 @@
 #include "gc.h"
 #include "event.h"
 
-cl_event task_event(starpu_task *task) {
-  return (cl_event)task->callback_arg;
-}
-
 static void task_release_callback(void *arg) {
-  starpu_task *task = starpu_get_current_task();
-  cl_event ev = (cl_event)arg;
+  starpu_task task = starpu_get_current_task();
+  cl_command cmd = (cl_command)arg;
   
+  cl_event ev = command_event_get(cmd);
   ev->status = CL_COMPLETE;
 
+  DEBUG_MSG("notifying tag %x as well as task tag %x\n", ev->id, task->tag_id);
+
+  /* Trigger the tag associated to the command event */
+  starpu_tag_notify_from_apps(ev->id);
+
   if (task->profiling_info != NULL && (intptr_t)task->profiling_info != -ENOSYS) {
     ev->profiling_info = malloc(sizeof(*task->profiling_info));
     memcpy(ev->profiling_info, task->profiling_info, sizeof(*task->profiling_info));
   }
 
   gc_entity_release(ev);
+
+  /* Release the command */
+  //TODO
 }
 
 
 /*
  * Create a StarPU task
- *
- * Task's callback_arg is event
- * Task's tag is set to event ID
  */
-starpu_task * task_create(cl_command_type type) {
-   cl_event event;
-   struct starpu_task * task;
+starpu_task task_create() {
+	struct starpu_task * task;
+
+	/* Create StarPU task */
+	task = starpu_task_create();
+
+	/* Set task common settings */
+	task->destroy = 1;
+	task->detach = 1;
+
+	task->use_tag = 1;
+	task->tag_id = event_unique_id();
 
-   /* Create event */
-   event = event_create();
-   event->type = type;
+	DEBUG_MSG("creating task with tag %x\n", task->tag_id);
 
-   /* Create StarPU task */
-   task = starpu_task_create();
+	return task;
+}
+
+
+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
+
+	if (num_events != 0) {
+		cl_uint i;
+
+		starpu_tag_t * tags = malloc(num_events * sizeof(starpu_tag_t));	
 
-   /* Task tag is set to event id */
-   task->use_tag = 1;
-   task->tag_id = event->id;
+		if (num_events != 0)
+			DEBUG_MSG("Tag %d depends on %u tags:", task->tag_id, num_events);
 
-   /* Set task common settings */
-   task->destroy = 1;
-   task->detach = 1;
-   task->callback_func = task_release_callback;
-   task->callback_arg = event;
+		for (i=0; i<num_events; i++) {
+			tags[i] = events[i]->id;
+			DEBUG_MSG_NOHEAD(" %u", events[i]->id);
+		}
+		DEBUG_MSG_NOHEAD("\n");
 
-   return task;
+		starpu_tag_declare_deps_array(task->tag_id, num_events, tags);
+
+		free(tags);
+	}
 }
 
+cl_int task_submit_ex(starpu_task task, cl_command cmd) {
+
+	/* Associated the task to the command */
+	cmd->task = task;
+
+	task_depends_on(task, command_num_events_get(cmd), command_events_get(cmd));
+
+	task->callback_func = task_release_callback;
+	task->callback_arg = cmd;
 
-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events) {
-   unsigned int i;
+	/* Submit task */
+	int ret = starpu_task_submit(task);
+	if (ret != 0)
+		DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
 
-   for (i=0; i<num; i++) {
-      starpu_tag_t tag = events[i]->id;
-      DEBUG_MSG("Event %d depends on event %d\n", task->tag_id, events[i]->id);
-      starpu_tag_declare_deps_array(task->tag_id, 1, &tag);
-   }
+	return CL_SUCCESS;
 }
 
 
@@ -106,14 +132,14 @@ static starpu_codelet cputask_codelet = {
    .cpu_func = &cputask_task
 };
 
-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg) {
+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg) {
   
   struct cputask_arg * a = malloc(sizeof(struct cputask_arg));
   a->callback = callback;
   a->arg = arg;
   a->free_arg = free_arg;
 
-  starpu_task *task = task_create(type);
+  starpu_task task = task_create();
   task->cl = &cputask_codelet;
   task->cl_arg = a;
 

+ 17 - 4
socl/src/task.h

@@ -19,9 +19,22 @@
 
 #include "socl.h"
 
-starpu_task * task_create(cl_command_type type);
-void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events);
-starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg);
-cl_event task_event(starpu_task *task);
+starpu_task task_create();
+void task_dependency_add(starpu_task task, cl_uint num_events, cl_event *events);
+
+starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg);
+
+/**
+ * Associate a StarPU task to a command and submit it
+ *
+ * When the task terminates, the command is set as terminated too
+ */
+cl_int task_submit_ex(starpu_task task, cl_command cmd);
+#define task_submit(task,cmd) task_submit_ex(task, (cl_command)cmd)
+
+/**
+ * Add task dependencies
+ */
+void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events);
 
 #endif /* SOCL_TASK_H */

+ 24 - 0
socl/src/util.c

@@ -24,3 +24,27 @@ int starpu_worker_get_range() {
 
    return oid;
 }
+
+void * memdupa(const void *p, size_t size) {
+	void * s = malloc(size);
+	memcpy(s,p,size);
+	return s;
+}
+
+void ** memdup_deep_safea(const void **p, unsigned n, size_t size) {
+	void ** s = (void**)malloc(sizeof(void*) * n);
+	unsigned i;
+	for (i=0; i<n; i++) {
+		s[i] = memdup_safe((void*)p[i], size);
+	}
+	return s;
+}
+
+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size) {
+	void ** s = (void**)malloc(sizeof(void*) * n);
+	unsigned i;
+	for (i=0; i<n; i++) {
+		s[i] = memdup_safe((void*)p[i], size[i]);
+	}
+	return s;
+}

+ 24 - 0
socl/src/util.h

@@ -19,4 +19,28 @@
 
 int starpu_worker_get_range();
 
+/**
+ * Duplicate a memory area into a fresh allocated buffer
+ * Consider using memdup or memdup_safe instead
+ */
+void * memdupa(const void *p, size_t size);
+
+#define memdup(p, size) ((typeof(p))memdupa((const void*)p,size))
+#define memdup_safe(p,size) (p == NULL ? NULL : memdup(p,size))
+
+/**
+ * Duplicate an array of pointers by performing a deep copy
+ */
+void ** memdup_deep_safea(const void **p, unsigned n, size_t size);
+
+#define memdup_deep_safe(p,n,size) ((typeof(p))memdup_deep_safea((const void **)p,n,size))
+
+/**
+ * Duplicate an array of pointers by performing a deep copy
+ * Sizes are different for each cell
+ */
+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size);
+
+#define memdup_deep_varsize_safe(p,n,size) ((typeof(p))memdup_deep_varsize_safea((const void **)p,n,size))
+
 #endif /* SOCL_UTIL_H */

+ 0 - 0
src/common/htable32.c


Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio